diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 000000000..a2da8e6d5 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,71 @@ +name: Benchmarks + +on: + workflow_dispatch: + inputs: + dataset_name: + description: 'The name of the dataset used to benchmark (songs or wiki)' + required: false + default: 'songs' + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables (each step publishes its value through the $GITHUB_OUTPUT file) + - name: Set current branch name + shell: bash + run: echo "name=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "name=$(echo "${GITHUB_REF#refs/heads/}" | tr '/' '_')" >> "$GITHUB_OUTPUT" + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "short=$(echo "$GITHUB_SHA" | cut -c1-8)" >> "$GITHUB_OUTPUT" + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "basename=${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }}" >> "$GITHUB_OUTPUT" + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + run: cargo install critcmp + - name: Export critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + -
name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Helper: print the exact commands needed to compare the uploaded report + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' + echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json <file-to-compare>" diff --git a/Cargo.lock b/Cargo.lock index 0b1da2b3f..04fd284c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,20 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "benchmarks" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes 1.0.1", + "convert_case", + "criterion", + "flate2", + "heed", + "milli", + "reqwest", +] + [[package]] name = "big_s" version = "1.0.2" @@ -327,6 +341,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "cow-utils" version = "0.1.2" @@ -506,6 +526,15 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encoding_rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum =
"80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "fake-simd" version = "0.1.2" @@ -750,12 +779,31 @@ dependencies = [ "http", "indexmap", "slab", - "tokio", - "tokio-util", + "tokio 0.2.25", + "tokio-util 0.3.1", "tracing", "tracing-futures", ] +[[package]] +name = "h2" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" +dependencies = [ + "bytes 1.0.1", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio 1.6.0", + "tokio-util 0.6.7", + "tracing", +] + [[package]] name = "half" version = "1.7.1" @@ -893,6 +941,17 @@ dependencies = [ "http", ] +[[package]] +name = "http-body" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" +dependencies = [ + "bytes 1.0.1", + "http", + "pin-project-lite 0.2.6", +] + [[package]] name = "http-ui" version = "0.2.1" @@ -922,7 +981,7 @@ dependencies = [ "stderrlog", "structopt", "tempfile", - "tokio", + "tokio 0.2.25", "warp", ] @@ -960,20 +1019,59 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.2.7", "http", - "http-body", + "http-body 0.3.1", "httparse", "httpdate", "itoa", "pin-project 1.0.5", - "socket2", - "tokio", + "socket2 0.3.19", + "tokio 0.2.25", "tower-service", "tracing", "want", ] +[[package]] +name = "hyper" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1" +dependencies = [ + "bytes 1.0.1", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.3", + "http", + "http-body 0.4.2", + "httparse", + "httpdate", + "itoa", + "pin-project 1.0.5", + "socket2 0.4.0", + "tokio 1.6.0", + "tower-service", + "tracing", 
+ "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64" +dependencies = [ + "futures-util", + "hyper 0.14.5", + "log", + "rustls", + "tokio 1.6.0", + "tokio-rustls", + "webpki", +] + [[package]] name = "idna" version = "0.2.2" @@ -1029,6 +1127,12 @@ dependencies = [ "libc", ] +[[package]] +name = "ipnet" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" + [[package]] name = "itertools" version = "0.9.0" @@ -1261,7 +1365,6 @@ dependencies = [ "bstr", "byteorder", "chrono", - "criterion", "crossbeam-channel", "csv", "either", @@ -1343,6 +1446,19 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "mio" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956" +dependencies = [ + "libc", + "log", + "miow 0.3.7", + "ntapi", + "winapi 0.3.9", +] + [[package]] name = "mio-named-pipes" version = "0.1.7" @@ -1350,7 +1466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" dependencies = [ "log", - "mio", + "mio 0.6.23", "miow 0.3.7", "winapi 0.3.9", ] @@ -1363,7 +1479,7 @@ checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" dependencies = [ "iovec", "libc", - "mio", + "mio 0.6.23", ] [[package]] @@ -1441,6 +1557,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi 0.3.9", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -1956,12 +2081,62 @@ dependencies = 
[ "winapi 0.3.9", ] +[[package]] +name = "reqwest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124" +dependencies = [ + "base64 0.13.0", + "bytes 1.0.1", + "encoding_rs", + "futures-core", + "futures-util", + "http", + "http-body 0.4.2", + "hyper 0.14.5", + "hyper-rustls", + "ipnet", + "js-sys", + "lazy_static", + "log", + "mime", + "percent-encoding", + "pin-project-lite 0.2.6", + "rustls", + "serde", + "serde_urlencoded 0.7.0", + "tokio 1.6.0", + "tokio-rustls", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", + "winreg", +] + [[package]] name = "retain_mut" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi 0.3.9", +] + [[package]] name = "roaring" version = "0.6.6" @@ -1982,6 +2157,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustls" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" +dependencies = [ + "base64 0.13.0", + "log", + "ring", + "sct", + "webpki", +] + [[package]] name = "ryu" version = "1.0.5" @@ -2015,6 +2203,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "sct" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" +dependencies = [ + 
"ring", + "untrusted", +] + [[package]] name = "search" version = "0.2.1" @@ -2108,6 +2306,18 @@ dependencies = [ "url", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "sha-1" version = "0.8.2" @@ -2193,6 +2403,22 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "socket2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e3dfc207c526015c632472a77be09cf1b6e46866581aecae5cc38fb4235dea2" +dependencies = [ + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + [[package]] name = "static_assertions" version = "1.1.0" @@ -2386,7 +2612,7 @@ dependencies = [ "lazy_static", "libc", "memchr", - "mio", + "mio 0.6.23", "mio-named-pipes", "mio-uds", "num_cpus", @@ -2397,6 +2623,21 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "tokio" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" +dependencies = [ + "autocfg", + "bytes 1.0.1", + "libc", + "memchr", + "mio 0.7.11", + "num_cpus", + "pin-project-lite 0.2.6", +] + [[package]] name = "tokio-macros" version = "0.2.6" @@ -2408,6 +2649,17 @@ dependencies = [ "syn 1.0.64", ] +[[package]] +name = "tokio-rustls" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" +dependencies = [ + "rustls", + "tokio 1.6.0", + "webpki", +] + [[package]] name = "tokio-tungstenite" version = "0.11.0" @@ -2417,7 +2669,7 @@ dependencies = [ 
"futures-util", "log", "pin-project 0.4.27", - "tokio", + "tokio 0.2.25", "tungstenite", ] @@ -2432,7 +2684,21 @@ dependencies = [ "futures-sink", "log", "pin-project-lite 0.1.12", - "tokio", + "tokio 0.2.25", +] + +[[package]] +name = "tokio-util" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592" +dependencies = [ + "bytes 1.0.1", + "futures-core", + "futures-sink", + "log", + "pin-project-lite 0.2.6", + "tokio 1.6.0", ] [[package]] @@ -2578,6 +2844,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + [[package]] name = "url" version = "2.2.1" @@ -2654,7 +2926,7 @@ dependencies = [ "futures", "headers", "http", - "hyper", + "hyper 0.13.10", "log", "mime", "mime_guess", @@ -2663,8 +2935,8 @@ dependencies = [ "scoped-tls", "serde", "serde_json", - "serde_urlencoded", - "tokio", + "serde_urlencoded 0.6.1", + "tokio 0.2.25", "tokio-tungstenite", "tower-service", "tracing", @@ -2691,6 +2963,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe" dependencies = [ "cfg-if 1.0.0", + "serde", + "serde_json", "wasm-bindgen-macro", ] @@ -2709,6 +2983,18 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.72" @@ -2748,6 +3034,25 @@ 
dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "webpki-roots" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940" +dependencies = [ + "webpki", +] + [[package]] name = "whatlang" version = "0.9.0" @@ -2800,6 +3105,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "winreg" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +dependencies = [ + "winapi 0.3.9", +] + [[package]] name = "ws2_32-sys" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index a60c293e3..ff0b2582a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["milli", "http-ui", "infos", "helpers", "search"] +members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"] default-members = ["milli"] [profile.release] diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 000000000..1f259516b --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1 @@ +benches/datasets_paths.rs diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml new file mode 100644 index 000000000..6be9c79d1 --- /dev/null +++ b/benchmarks/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "benchmarks" +version = "0.1.0" +edition = "2018" +publish = false + + +[dependencies] +milli = { path = "../milli" } + +[dev-dependencies] +heed = "*" # we want to use the version milli uses +criterion = "0.3.4" + +[build-dependencies] +anyhow = "1.0" +bytes = "1.0" +flate2 
= "1.0.20" +convert_case = "0.4" +reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false } + +[[bench]] +name = "songs" +harness = false + +[[bench]] +name = "wiki" +harness = false diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..ebe8eecdf --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,110 @@ +Benchmarks +========== + +## TOC + +- [Datasets](#datasets) +- [Run the benchmarks](#run-the-benchmarks) +- [Comparison between benchmarks](#comparison-between-benchmarks) + +## Datasets + +The benchmarks are available for the following datasets: +- `songs` +- `wiki` + +### Songs + +`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). + +It was generated with this command: + +```bash +xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv +``` + +_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._ + +### Wiki + +`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz). + +It was generated with the following command: + +```bash +xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv +``` + +_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz)._ + +## Run the benchmarks + +### On our private server + +The Meili team has self-hosted his own GitHub runner to run benchmarks on our dedicated bare metal server. + +To trigger the benchmark workflow: +- Go to the `Actions` tab of this repository. +- Select the `Benchmarks` workflow on the left. +- Click on `Run workflow` in the blue banner. +- Select the branch on which you want to run the benchmarks and select the dataset you want (default: `songs`). +- Finally, click on `Run workflow`. 
+ +This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3). + +The name of the uploaded file is displayed in the workflow. + +_[More about critcmp](https://github.com/BurntSushi/critcmp)._ + +💡 To compare the just-uploaded benchmark with another one, check out the [next section](#comparison-between-benchmarks). + +### On your machine + +To run all the benchmarks (~4h): + +```bash +cargo bench +``` + +To run only the `songs` (~1h) or `wiki` (~3h) benchmark: + +```bash +cargo bench --bench <dataset name> +``` + +By default, the datasets will be downloaded and uncompressed automatically in the target directory.
+If you don't want to download the datasets every time you update something in the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`: + +```bash +mkdir ~/datasets +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded +touch build.rs +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded +``` + +## Comparison between benchmarks + +The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to generate comparison results between 2 benchmarks. + +We provide a script to download and display the comparison report. + +Requirements: +- `grep` +- `curl` +- [`critcmp`](https://github.com/BurntSushi/critcmp) + +List the available files in the DO Space: + +```bash +./benchmarks/scripts/list.sh +``` +```bash +songs_main_09a4321.json +songs_geosearch_24ec456.json +``` + +Run the comparison script: + +```bash +./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json +``` diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs new file mode 100644 index 000000000..3f2822ca3 --- /dev/null +++ b/benchmarks/benches/songs.rs @@ -0,0 +1,211 @@ +mod datasets_paths; +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = [ + "id", "title", "album", "artist", "genre", "country", "released", "duration", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = [ + "released-timestamp", + "duration-float", + "genre", + "country", + "artist", + ] + .iter() + .map(|s| s.to_string()) + .collect(); +
builder.set_faceted_fields(faceted_fields); +} + +const BASE_CONF: Conf = Conf { + dataset: datasets_paths::SMOL_SONGS, + queries: &[ + "john ", // 9097 + "david ", // 4794 + "charles ", // 1957 + "david bowie ", // 1200 + "michael jackson ", // 600 + "thelonious monk ", // 303 + "charles mingus ", // 142 + "marcus miller ", // 60 + "tamo ", // 13 + "Notstandskomitee ", // 4 + ], + configure: base_conf, + primary_key: Some("id"), + ..Conf::BASE +}; + +fn bench_songs(c: &mut criterion::Criterion) { + let default_criterion: Vec<String> = milli::default_criteria() + .iter() + .map(|criteria| criteria.to_string()) + .collect(); + let default_criterion = default_criterion.iter().map(|s| s.as_str()); + let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)") + .chain(default_criterion.clone()) + .collect(); + let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)") + .chain(default_criterion.clone()) + .collect(); + + let basic_with_quote: Vec<String> = BASE_CONF + .queries + .iter() + .map(|s| { + s.trim() + .split(' ') + .map(|s| format!(r#""{}""#, s)) + .collect::<Vec<String>>() + .join(" ") + }) + .collect(); + let basic_with_quote: &[&str] = &basic_with_quote + .iter() + .map(|s| s.as_str()) + .collect::<Vec<&str>>(); + + let confs = &[ + /* first we bench each criterion alone */ + utils::Conf { + group_name: "proximity", + queries: &[ + "black saint sinner lady ", + "les dangeureuses 1960 ", + "The Disneyland Sing-Along Chorus ", + "Under Great Northern Lights ", + "7000 Danses Un Jour Dans Notre Vie ", + ], + criterion: Some(&["proximity"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "typo", + queries: &[ + "mongus ", + "thelonius monk ", + "Disnaylande ", + "the white striper ", + "indochie ", + "indochien ", + "klub des loopers ", + "fear of the duck ", + "michel depech ", + "stromal ", + "dire straights ", + "Arethla Franklin ", + ], + criterion: Some(&["typo"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: 
"words", + queries: &[ + "the black saint and the sinner lady and the good doggo ", // four words to pop + "les liaisons dangeureuses 1793 ", // one word to pop + "The Disneyland Children's Sing-Alone song ", // two words to pop + "seven nation mummy ", // one word to pop + "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop + "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop + "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13 + ], + criterion: Some(&["words"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "asc", + criterion: Some(&["asc(released-timestamp)"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc", + criterion: Some(&["desc(released-timestamp)"]), + ..BASE_CONF + }, + + /* then we bench the asc and desc criterion on top of the default criterion */ + utils::Conf { + group_name: "asc + default", + criterion: Some(&asc_default[..]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc + default", + criterion: Some(&desc_default[..]), + ..BASE_CONF + }, + + /* we bench the filters with the default request */ + utils::Conf { + group_name: "basic filter: <=", + facet_condition: Some("released-timestamp <= 946728000"), // year 2000 + ..BASE_CONF + }, + utils::Conf { + group_name: "basic filter: TO", + facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010 + ..BASE_CONF + }, + utils::Conf { + group_name: "big filter", + facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"), + ..BASE_CONF + }, + + /* then we bench some global / normal search with all the default criterion in the default + * order */ + utils::Conf { + group_name: "basic placeholder", + queries: &[""], + ..BASE_CONF + }, + utils::Conf { + group_name: "basic without quote", + queries: &BASE_CONF + .queries + 
.iter() + .map(|s| s.trim()) // we remove the space at the end of each request + .collect::<Vec<&str>>(), + ..BASE_CONF + }, + utils::Conf { + group_name: "basic with quote", + queries: basic_with_quote, + ..BASE_CONF + }, + utils::Conf { + group_name: "prefix search", + queries: &[ + "s", // 500k+ results + "a", // + "b", // + "i", // + "x", // only 7k results + ], + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_songs); +criterion_main!(benches); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs new file mode 100644 index 000000000..83367a7ca --- /dev/null +++ b/benchmarks/benches/utils.rs @@ -0,0 +1,119 @@ +use std::fs::{create_dir_all, remove_dir_all, File}; + +use criterion::BenchmarkId; +use heed::EnvOpenOptions; +use milli::{ + update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, + FacetCondition, Index, +}; + +pub struct Conf<'a> { + /// where we are going to create our database.mmdb directory + /// each benchmark will first try to delete it and then recreate it + pub database_name: &'a str, + /// the dataset to be used, it must be an uncompressed csv + pub dataset: &'a str, + pub group_name: &'a str, + pub queries: &'a [&'a str], + /// here you can change which criterion are used and in which order.
+ /// - if you specify something all the base configuration will be thrown out + /// - if you don't specify anything (None) the default configuration will be kept + pub criterion: Option<&'a [&'a str]>, + /// the last chance to configure your database as you want + pub configure: fn(&mut Settings), + pub facet_condition: Option<&'a str>, + /// enable or disable the optional words on the query + pub optional_words: bool, + /// primary key, if there is None we'll auto-generate docids for every document + pub primary_key: Option<&'a str>, +} + +impl Conf<'_> { + pub const BASE: Self = Conf { + database_name: "benches.mmdb", + dataset: "", + group_name: "", + queries: &[], + criterion: None, + configure: |_| (), + facet_condition: None, + optional_words: true, + primary_key: None, + }; +} + +pub fn base_setup(conf: &Conf) -> Index { + match remove_dir_all(&conf.database_name) { + Ok(_) => (), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), + Err(e) => panic!("{}", e), + } + create_dir_all(&conf.database_name).unwrap(); + + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.max_readers(10); + let index = Index::new(options, conf.database_name).unwrap(); + // the primary key is written in the settings transaction so that it is committed with it + let mut wtxn = index.write_txn().unwrap(); + if let Some(primary_key) = conf.primary_key { + index.put_primary_key(&mut wtxn, primary_key).unwrap(); + } + + let update_builder = UpdateBuilder::new(0); + let mut builder = update_builder.settings(&mut wtxn, &index); + + if let Some(criterion) = conf.criterion { + builder.reset_faceted_fields(); + builder.reset_criteria(); + builder.reset_stop_words(); + + let criterion = criterion.iter().map(|s| s.to_string()).collect(); + builder.set_criteria(criterion); + } + + (conf.configure)(&mut builder); + + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let
mut builder = update_builder.index_documents(&mut wtxn, &index); + if conf.primary_key.is_none() { + builder.enable_autogenerate_docids(); + } + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(conf.dataset) + .expect(&format!("could not find the dataset in: {}", conf.dataset)); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + index +} + +pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { + for conf in confs { + let index = base_setup(conf); + + let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); + + for &query in conf.queries { + group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + search.query(query).optional_words(conf.optional_words); + if let Some(facet_condition) = conf.facet_condition { + let facet_condition = + FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); + search.facet_condition(facet_condition); + } + let _ids = search.execute().unwrap(); + }); + }); + } + group.finish(); + } +} diff --git a/benchmarks/benches/wiki.rs b/benchmarks/benches/wiki.rs new file mode 100644 index 000000000..99ecff2ce --- /dev/null +++ b/benchmarks/benches/wiki.rs @@ -0,0 +1,133 @@ +mod datasets_paths; +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = ["title", "body", "url"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); +} + +const BASE_CONF: Conf = Conf { + dataset: datasets_paths::SMOL_WIKI_ARTICLES, + queries: &[ + "mingus ", // 46 candidates + 
"miles davis ", // 159 + "rock and roll ", // 1007 + "machine ", // 3448 + "spain ", // 7002 + "japan ", // 10.593 + "france ", // 17.616 + "film ", // 24.959 + ], + configure: base_conf, + ..Conf::BASE +}; + +fn bench_songs(c: &mut criterion::Criterion) { + let basic_with_quote: Vec<String> = BASE_CONF + .queries + .iter() + .map(|s| { + s.trim() + .split(' ') + .map(|s| format!(r#""{}""#, s)) + .collect::<Vec<String>>() + .join(" ") + }) + .collect(); + let basic_with_quote: &[&str] = &basic_with_quote + .iter() + .map(|s| s.as_str()) + .collect::<Vec<&str>>(); + + let confs = &[ + /* first we bench each criterion alone */ + utils::Conf { + group_name: "proximity", + queries: &[ + "herald sings ", + "april paris ", + "tea two ", + "diesel engine ", + ], + criterion: Some(&["proximity"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "typo", + queries: &[ + "migrosoft ", + "linax ", + "Disnaylande ", + "phytogropher ", + "nympalidea ", + "aritmetric ", + "the fronce ", + "sisan ", + ], + criterion: Some(&["typo"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "words", + queries: &[ + "the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results + "Kameya Tokujirō mingus monk ", // two words to pop, 55 + "Ulrich Hensel meilisearch milli ", // two words to pop, 306 + "Idaho Bellevue pizza ", // one word to pop, 800 + "Abraham machin ", // one word to pop, 1141 + ], + criterion: Some(&["words"]), + ..BASE_CONF + }, + /* then we bench some global / normal search with all the default criterion in the default + * order */ + utils::Conf { + group_name: "basic placeholder", + queries: &[""], + ..BASE_CONF + }, + utils::Conf { + group_name: "basic without quote", + queries: &BASE_CONF + .queries + .iter() + .map(|s| s.trim()) // we remove the space at the end of each request + .collect::<Vec<&str>>(), + ..BASE_CONF + }, + utils::Conf { + group_name: "basic with quote", + queries: basic_with_quote, + ..BASE_CONF + }, +
utils::Conf { + group_name: "prefix search", + queries: &[ + "t", // 453k results + "c", // 405k + "g", // 318k + "j", // 227k + "q", // 71k + "x", // 17k + ], + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_songs); +criterion_main!(benches); diff --git a/benchmarks/build.rs b/benchmarks/build.rs new file mode 100644 index 000000000..dc92a1a4c --- /dev/null +++ b/benchmarks/build.rs @@ -0,0 +1,80 @@ +use std::path::{Path, PathBuf}; +use std::{env, fs}; +use std::{ + fs::File, + io::{Cursor, Read, Seek, Write}, +}; + +use bytes::Bytes; +use convert_case::{Case, Casing}; +use flate2::read::GzDecoder; +use reqwest::IntoUrl; + +const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks"; + +const DATASET_SONGS: &str = "smol-songs"; +const DATASET_WIKI: &str = "smol-wiki-articles"; + +/// The name of the environment variable used to select the path +/// of the directory containing the datasets +const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; + +fn main() -> anyhow::Result<()> { + let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?)); + + let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches"); + let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?; + writeln!( + manifest_paths_file, + r#"//! This file is generated by the build script. +//! Do not modify by hand, use the build.rs file. 
+#![allow(dead_code)] +"# + )?; + writeln!(manifest_paths_file)?; + + for dataset in &[DATASET_SONGS, DATASET_WIKI] { + let out_path = out_dir.join(dataset); + let out_file = out_path.with_extension("csv"); + + writeln!( + &mut manifest_paths_file, + r#"pub const {}: &str = {:?};"#, + dataset.to_case(Case::ScreamingSnake), + out_file.display(), + )?; + + if out_file.exists() { + eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset); + continue; + } + let url = format!("{}/{}.csv.gz", BASE_URL, dataset); + eprintln!("downloading: {}", url); + let bytes = download_dataset(url.clone())?; + eprintln!("{} downloaded successfully", url); + eprintln!("uncompressing in {}", out_file.display()); + uncompress_in_file(bytes, &out_file)?; + } + + Ok(()) +} + +fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> { + let bytes = reqwest::blocking::Client::builder() + .timeout(None) + .build()? + .get(url) + .send()? + .bytes()?; + Ok(Cursor::new(bytes)) +} + +fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> { + let path = path.as_ref(); + let mut gz = GzDecoder::new(bytes); + let mut dataset = Vec::new(); + gz.read_to_end(&mut dataset)?; + + fs::write(path, dataset)?; + Ok(()) +} diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh new file mode 100755 index 000000000..067772bec --- /dev/null +++ b/benchmarks/scripts/compare.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +# Requirements: +# - critcmp. See: https://github.com/BurntSushi/critcmp +# - curl + +# Usage +# $ bash compare.sh json_file1 json_file2 +# ex: bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json + +# Checking that critcmp is installed +command -v critcmp > /dev/null 2>&1 +if [[ "$?" -ne 0 ]]; then + echo 'You must install critcmp to make this script working.' + echo '$ cargo install critcmp' + echo 'See: https://github.com/BurntSushi/critcmp' + exit 1 +fi + +if [[ $# -ne 2 ]] + then + echo 'Need 2 arguments.' 
+ echo 'Usage: ' + echo ' $ ./compare.sh file_to_download1 file_to_download2' + echo 'Ex:' + echo ' $ ./compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' + exit 1 +fi + +file1="$1" +file2="$2" +s3_url='https://milli-benchmarks.fra1.digitaloceanspaces.com/critcmp_results' +file1_s3_url="$s3_url/$file1" +file2_s3_url="$s3_url/$file2" +file1_local_path="/tmp/$file1" +file2_local_path="/tmp/$file2" + +if [[ ! -f "$file1_local_path" ]]; then + curl --fail "$file1_s3_url" -o "$file1_local_path" # -o writes to the given path; --fail turns HTTP errors into a non-zero exit + if [[ "$?" -ne 0 ]]; then + echo 'curl command failed.' + exit 1 + fi +else + echo "$file1 already present in /tmp, no need to download." +fi + +if [[ ! -f "$file2_local_path" ]]; then + curl --fail "$file2_s3_url" -o "$file2_local_path" + if [[ "$?" -ne 0 ]]; then + echo 'curl command failed.' + exit 1 + fi +else + echo "$file2 already present in /tmp, no need to download." +fi + +critcmp --color always "$file1_local_path" "$file2_local_path" diff --git a/benchmarks/scripts/list.sh b/benchmarks/scripts/list.sh new file mode 100755 index 000000000..764193329 --- /dev/null +++ b/benchmarks/scripts/list.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# Requirements: +# - curl +# - grep + +res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -o '<Key>[^<]\+' | cut -c 5- | grep critcmp_results/ | cut -c 18-) + +for pattern in "$@" +do + res=$(echo "$res" | grep "$pattern") +done + +echo "$res" diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs new file mode 100644 index 000000000..4281ec115 --- /dev/null +++ b/benchmarks/src/lib.rs @@ -0,0 +1,5 @@ +//! This library is only used to isolate the benchmarks +//! from the original milli library. +//! +//! It does not include interesting functions for milli library +//! users, only for milli contributors. 
diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3b25bb268..2af6a9042 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -53,13 +53,8 @@ tinytemplate = "=1.1.0" [dev-dependencies] big_s = "1.0.2" -criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" [features] default = [] - -[[bench]] -name = "search" -harness = false diff --git a/milli/benches/search.rs b/milli/benches/search.rs deleted file mode 100644 index a201e241c..000000000 --- a/milli/benches/search.rs +++ /dev/null @@ -1,36 +0,0 @@ -use std::time::Duration; - -use heed::EnvOpenOptions; -use milli::Index; -use criterion::{criterion_group, criterion_main, BenchmarkId}; - -fn bench_search(c: &mut criterion::Criterion) { - let database = "books-4cpu.mmdb"; - let queries = [ - "minogue kylie", - "minogue kylie live", - ]; - - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - options.max_readers(10); - let index = Index::new(options, database).unwrap(); - - let mut group = c.benchmark_group("search"); - group.sample_size(10); - group.measurement_time(Duration::from_secs(12)); - - for query in &queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap(); - }); - }); - } - - group.finish(); -} - -criterion_group!(benches, bench_search); -criterion_main!(benches);