Merge #207

207: Benchmarks r=Kerollmops a=irevoire Co-authored-by: tamo <tamo@meilisearch.com> Co-authored-by: Clémentine Urquizar <clementine@meilisearch.com> Co-authored-by: Tamo <irevoire@hotmail.fr> Co-authored-by: Irevoire <tamo@meilisearch.com>
2025-06-17 20:27:41 +02:00 · 2021-06-02 15:29:09 +00:00 · 2021-06-02 15:29:09 +00:00 · 28962bce99
commit 28962bce99
parent 270da98c46 6dc08bf45e
15 changed files with 1161 additions and 58 deletions
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@ -0,0 +1,71 @@
+name: Benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      dataset_name:
+        description: 'The name of the dataset used to benchmark (songs or wiki)'
+        required: false
+        default: 'songs'
+
+jobs:
+  # Runs the selected criterion benchmark, exports the result with critcmp
+  # and uploads the resulting JSON report.
+  benchmarks:
+    name: Run and upload benchmarks
+    runs-on: self-hosted
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+
+      # Set variables
+      - name: Set current branch name
+        shell: bash
+        run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
+        id: current_branch
+      - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
+        shell: bash
+        run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
+        id: normalized_current_branch
+      - name: Set shorter commit SHA
+        shell: bash
+        run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
+        id: commit_sha
+      - name: Set file basename with format "dataset_branch_commitSHA"
+        shell: bash
+        run: echo "##[set-output name=basename;]$(echo ${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
+        id: file
+
+      # Run benchmarks
+      # The baseline is saved under the file basename so that critcmp can export it by name below.
+      - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
+        run: |
+          cd benchmarks
+          cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }}
+
+      # Generate critcmp files
+      - name: Install critcmp
+        run: cargo install critcmp
+      - name: Export critcmp file
+        run: |
+          critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
+
+      # Upload benchmarks
+      - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
+        uses: BetaHuhn/do-spaces-action@v2
+        with:
+          access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
+          secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
+          space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
+          space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
+          source: ${{ steps.file.outputs.basename }}.json
+          out_dir: critcmp_results
+
+      # Helper
+      - name: 'README: compare with another benchmark'
+        run: |
+          echo "${{ steps.file.outputs.basename }}.json has just been pushed."
+          echo 'How to compare this benchmark with another one?'
+          echo '  - Check the available files with: ./benchmarks/scripts/list.sh'
+          echo "  - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json <file-to-compare-with>"
--- a/Cargo.lock
+++ b/Cargo.lock
@ -122,6 +122,20 @@ version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"

+[[package]]
+name = "benchmarks"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bytes 1.0.1",
+ "convert_case",
+ "criterion",
+ "flate2",
+ "heed",
+ "milli",
+ "reqwest",
+]
+
 [[package]]
 name = "big_s"
 version = "1.0.2"
@ -327,6 +341,12 @@ dependencies = [
 "unicode-width",
 ]

+[[package]]
+name = "convert_case"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
+
 [[package]]
 name = "cow-utils"
 version = "0.1.2"
@ -506,6 +526,15 @@ version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"

+[[package]]
+name = "encoding_rs"
+version = "0.8.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065"
+dependencies = [
+ "cfg-if 1.0.0",
+]
+
 [[package]]
 name = "fake-simd"
 version = "0.1.2"
@ -750,12 +779,31 @@ dependencies = [
 "http",
 "indexmap",
 "slab",
- "tokio",
- "tokio-util",
+ "tokio 0.2.25",
+ "tokio-util 0.3.1",
 "tracing",
 "tracing-futures",
 ]

+[[package]]
+name = "h2"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
+dependencies = [
+ "bytes 1.0.1",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "futures-util",
+ "http",
+ "indexmap",
+ "slab",
+ "tokio 1.6.0",
+ "tokio-util 0.6.7",
+ "tracing",
+]
+
 [[package]]
 name = "half"
 version = "1.7.1"
@ -893,6 +941,17 @@ dependencies = [
 "http",
 ]

+[[package]]
+name = "http-body"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
+dependencies = [
+ "bytes 1.0.1",
+ "http",
+ "pin-project-lite 0.2.6",
+]
+
 [[package]]
 name = "http-ui"
 version = "0.2.1"
@ -922,7 +981,7 @@ dependencies = [
 "stderrlog",
 "structopt",
 "tempfile",
- "tokio",
+ "tokio 0.2.25",
 "warp",
 ]

@ -960,20 +1019,59 @@ dependencies = [
 "futures-channel",
 "futures-core",
 "futures-util",
- "h2",
+ "h2 0.2.7",
 "http",
- "http-body",
+ "http-body 0.3.1",
 "httparse",
 "httpdate",
 "itoa",
 "pin-project 1.0.5",
- "socket2",
- "tokio",
+ "socket2 0.3.19",
+ "tokio 0.2.25",
 "tower-service",
 "tracing",
 "want",
 ]

+[[package]]
+name = "hyper"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1"
+dependencies = [
+ "bytes 1.0.1",
+ "futures-channel",
+ "futures-core",
+ "futures-util",
+ "h2 0.3.3",
+ "http",
+ "http-body 0.4.2",
+ "httparse",
+ "httpdate",
+ "itoa",
+ "pin-project 1.0.5",
+ "socket2 0.4.0",
+ "tokio 1.6.0",
+ "tower-service",
+ "tracing",
+ "want",
+]
+
+[[package]]
+name = "hyper-rustls"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64"
+dependencies = [
+ "futures-util",
+ "hyper 0.14.5",
+ "log",
+ "rustls",
+ "tokio 1.6.0",
+ "tokio-rustls",
+ "webpki",
+]
+
 [[package]]
 name = "idna"
 version = "0.2.2"
@ -1029,6 +1127,12 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "ipnet"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
+
 [[package]]
 name = "itertools"
 version = "0.9.0"
@ -1261,7 +1365,6 @@ dependencies = [
 "bstr",
 "byteorder",
 "chrono",
- "criterion",
 "crossbeam-channel",
 "csv",
 "either",
@ -1343,6 +1446,19 @@ dependencies = [
 "winapi 0.2.8",
 ]

+[[package]]
+name = "mio"
+version = "0.7.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956"
+dependencies = [
+ "libc",
+ "log",
+ "miow 0.3.7",
+ "ntapi",
+ "winapi 0.3.9",
+]
+
 [[package]]
 name = "mio-named-pipes"
 version = "0.1.7"
@ -1350,7 +1466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656"
 dependencies = [
 "log",
- "mio",
+ "mio 0.6.23",
 "miow 0.3.7",
 "winapi 0.3.9",
 ]
@ -1363,7 +1479,7 @@ checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0"
 dependencies = [
 "iovec",
 "libc",
- "mio",
+ "mio 0.6.23",
 ]

 [[package]]
@ -1441,6 +1557,15 @@ dependencies = [
 "version_check",
 ]

+[[package]]
+name = "ntapi"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44"
+dependencies = [
+ "winapi 0.3.9",
+]
+
 [[package]]
 name = "num-integer"
 version = "0.1.44"
@ -1956,12 +2081,62 @@ dependencies = [
 "winapi 0.3.9",
 ]

+[[package]]
+name = "reqwest"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124"
+dependencies = [
+ "base64 0.13.0",
+ "bytes 1.0.1",
+ "encoding_rs",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body 0.4.2",
+ "hyper 0.14.5",
+ "hyper-rustls",
+ "ipnet",
+ "js-sys",
+ "lazy_static",
+ "log",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite 0.2.6",
+ "rustls",
+ "serde",
+ "serde_urlencoded 0.7.0",
+ "tokio 1.6.0",
+ "tokio-rustls",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+ "webpki-roots",
+ "winreg",
+]
+
 [[package]]
 name = "retain_mut"
 version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1"

+[[package]]
+name = "ring"
+version = "0.16.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc"
+dependencies = [
+ "cc",
+ "libc",
+ "once_cell",
+ "spin",
+ "untrusted",
+ "web-sys",
+ "winapi 0.3.9",
+]
+
 [[package]]
 name = "roaring"
 version = "0.6.6"
@ -1982,6 +2157,19 @@ dependencies = [
 "semver",
 ]

+[[package]]
+name = "rustls"
+version = "0.19.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7"
+dependencies = [
+ "base64 0.13.0",
+ "log",
+ "ring",
+ "sct",
+ "webpki",
+]
+
 [[package]]
 name = "ryu"
 version = "1.0.5"
@ -2015,6 +2203,16 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"

+[[package]]
+name = "sct"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce"
+dependencies = [
+ "ring",
+ "untrusted",
+]
+
 [[package]]
 name = "search"
 version = "0.2.1"
@ -2108,6 +2306,18 @@ dependencies = [
 "url",
 ]

+[[package]]
+name = "serde_urlencoded"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9"
+dependencies = [
+ "form_urlencoded",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
 [[package]]
 name = "sha-1"
 version = "0.8.2"
@ -2193,6 +2403,22 @@ dependencies = [
 "winapi 0.3.9",
 ]

+[[package]]
+name = "socket2"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e3dfc207c526015c632472a77be09cf1b6e46866581aecae5cc38fb4235dea2"
+dependencies = [
+ "libc",
+ "winapi 0.3.9",
+]
+
+[[package]]
+name = "spin"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
+
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@ -2386,7 +2612,7 @@ dependencies = [
 "lazy_static",
 "libc",
 "memchr",
- "mio",
+ "mio 0.6.23",
 "mio-named-pipes",
 "mio-uds",
 "num_cpus",
@ -2397,6 +2623,21 @@ dependencies = [
 "winapi 0.3.9",
 ]

+[[package]]
+name = "tokio"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
+dependencies = [
+ "autocfg",
+ "bytes 1.0.1",
+ "libc",
+ "memchr",
+ "mio 0.7.11",
+ "num_cpus",
+ "pin-project-lite 0.2.6",
+]
+
 [[package]]
 name = "tokio-macros"
 version = "0.2.6"
@ -2408,6 +2649,17 @@ dependencies = [
 "syn 1.0.64",
 ]

+[[package]]
+name = "tokio-rustls"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6"
+dependencies = [
+ "rustls",
+ "tokio 1.6.0",
+ "webpki",
+]
+
 [[package]]
 name = "tokio-tungstenite"
 version = "0.11.0"
@ -2417,7 +2669,7 @@ dependencies = [
 "futures-util",
 "log",
 "pin-project 0.4.27",
- "tokio",
+ "tokio 0.2.25",
 "tungstenite",
 ]

@ -2432,7 +2684,21 @@ dependencies = [
 "futures-sink",
 "log",
 "pin-project-lite 0.1.12",
- "tokio",
+ "tokio 0.2.25",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.6.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592"
+dependencies = [
+ "bytes 1.0.1",
+ "futures-core",
+ "futures-sink",
+ "log",
+ "pin-project-lite 0.2.6",
+ "tokio 1.6.0",
 ]

 [[package]]
@ -2578,6 +2844,12 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"

+[[package]]
+name = "untrusted"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
+
 [[package]]
 name = "url"
 version = "2.2.1"
@ -2654,7 +2926,7 @@ dependencies = [
 "futures",
 "headers",
 "http",
- "hyper",
+ "hyper 0.13.10",
 "log",
 "mime",
 "mime_guess",
@ -2663,8 +2935,8 @@ dependencies = [
 "scoped-tls",
 "serde",
 "serde_json",
- "serde_urlencoded",
- "tokio",
+ "serde_urlencoded 0.6.1",
+ "tokio 0.2.25",
 "tokio-tungstenite",
 "tower-service",
 "tracing",
@ -2691,6 +2963,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe"
 dependencies = [
 "cfg-if 1.0.0",
+ "serde",
+ "serde_json",
 "wasm-bindgen-macro",
 ]

@ -2709,6 +2983,18 @@ dependencies = [
 "wasm-bindgen-shared",
 ]

+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468"
+dependencies = [
+ "cfg-if 1.0.0",
+ "js-sys",
+ "wasm-bindgen",
+ "web-sys",
+]
+
 [[package]]
 name = "wasm-bindgen-macro"
 version = "0.2.72"
@ -2748,6 +3034,25 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "webpki"
+version = "0.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea"
+dependencies = [
+ "ring",
+ "untrusted",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "0.21.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940"
+dependencies = [
+ "webpki",
+]
+
 [[package]]
 name = "whatlang"
 version = "0.9.0"
@ -2800,6 +3105,15 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

+[[package]]
+name = "winreg"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69"
+dependencies = [
+ "winapi 0.3.9",
+]
+
 [[package]]
 name = "ws2_32-sys"
 version = "0.2.1"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,5 +1,5 @@
 [workspace]
-members = ["milli", "http-ui", "infos", "helpers", "search"]
+members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
 default-members = ["milli"]

 [profile.release]
--- a/benchmarks/.gitignore
+++ b/benchmarks/.gitignore
@ -0,0 +1 @@
+benches/datasets_paths.rs
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@ -0,0 +1,28 @@
+[package]
+name = "benchmarks"
+version = "0.1.0"
+edition = "2018"
+publish = false
+
+
+[dependencies]
+milli = { path = "../milli" }
+
+[dev-dependencies]
+heed = "*" # we want to use the version milli uses
+criterion = "0.3.4"
+
+[build-dependencies]
+anyhow = "1.0"
+bytes = "1.0"
+flate2 = "1.0.20"
+convert_case = "0.4"
+reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
+
+[[bench]]
+name = "songs"
+harness = false
+
+[[bench]]
+name = "wiki"
+harness = false
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -0,0 +1,110 @@
+Benchmarks
+==========
+
+## TOC
+
+- [Datasets](#datasets)
+- [Run the benchmarks](#run-the-benchmarks)
+- [Comparison between benchmarks](#comparison-between-benchmarks)
+
+## Datasets
+
+The benchmarks are available for the following datasets:
+- `songs`
+- `wiki`
+
+### Songs
+
+`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
+
+It was generated with this command:
+
+```bash
+xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
+```
+
+_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._
+
+### Wiki
+
+`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz).
+
+It was generated with the following command:
+
+```bash
+xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
+```
+
+_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz)._
+
+## Run the benchmarks
+
+### On our private server
+
+The Meili team self-hosts its own GitHub runner to run benchmarks on our dedicated bare metal server.
+
+To trigger the benchmark workflow:
+- Go to the `Actions` tab of this repository.
+- Select the `Benchmarks` workflow on the left.
+- Click on `Run workflow` in the blue banner.
+- Select the branch on which you want to run the benchmarks and select the dataset you want (default: `songs`).
+- Finally, click on `Run workflow`.
+
+This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3).
+
+The name of the uploaded file is displayed in the workflow.
+
+_[More about critcmp](https://github.com/BurntSushi/critcmp)._
+
+💡 To compare the just-uploaded benchmark with another one, check out the [next section](#comparison-between-benchmarks).
+
+### On your machine
+
+To run all the benchmarks (~4h):
+
+```bash
+cargo bench
+```
+
+To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
+
+```bash
+cargo bench --bench <dataset name>
+```
+
+By default, the datasets will be downloaded and uncompressed automatically in the target directory.<br>
+If you don't want to download the datasets every time you update something on the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
+
+```bash
+mkdir ~/datasets
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+touch build.rs
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
+```
+
+## Comparison between benchmarks
+
+The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to generate comparison results between 2 benchmarks.
+
+We provide a script to download and display the comparison report.
+
+Requirements:
+- `grep`
+- `curl`
+- [`critcmp`](https://github.com/BurntSushi/critcmp)
+
+List the available files in the DO Space:
+
+```bash
+./benchmarks/scripts/list.sh
+```
+```bash
+songs_main_09a4321.json
+songs_geosearch_24ec456.json
+```
+
+Run the comparison script:
+
+```bash
+./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json
+```
--- a/benchmarks/benches/songs.rs
+++ b/benchmarks/benches/songs.rs
@ -0,0 +1,211 @@
+mod datasets_paths;
+mod utils;
+
+use criterion::{criterion_group, criterion_main};
+use milli::update::Settings;
+use utils::Conf;
+
+/// Applies the settings shared by every songs benchmark: the displayed,
+/// searchable and faceted fields of the dataset.
+fn base_conf(builder: &mut Settings) {
+    const DISPLAYED: [&str; 8] = [
+        "id", "title", "album", "artist", "genre", "country", "released", "duration",
+    ];
+    const SEARCHABLE: [&str; 3] = ["title", "album", "artist"];
+    const FACETED: [&str; 5] = [
+        "released-timestamp",
+        "duration-float",
+        "genre",
+        "country",
+        "artist",
+    ];
+
+    // The target collection type of each `collect()` is inferred from the setter.
+    builder.set_displayed_fields(DISPLAYED.iter().map(|&name| name.to_owned()).collect());
+    builder.set_searchable_fields(SEARCHABLE.iter().map(|&name| name.to_owned()).collect());
+    builder.set_faceted_fields(FACETED.iter().map(|&name| name.to_owned()).collect());
+}
+
+/// Configuration shared by the songs benchmarks.
+///
+/// Individual benchmark groups override only the fields they need through the
+/// struct-update syntax (`..BASE_CONF`); every field not listed here comes from
+/// `Conf::BASE`.
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_SONGS,
+    // Every query ends with a space; the "basic without quote" group trims it.
+    // NOTE(review): the trailing numbers look like result counts for each query — confirm.
+    queries: &[
+        "john ",             // 9097
+        "david ",            // 4794
+        "charles ",          // 1957
+        "david bowie ",      // 1200
+        "michael jackson ",  // 600
+        "thelonious monk ",  // 303
+        "charles mingus ",   // 142
+        "marcus miller ",    // 60
+        "tamo ",             // 13
+        "Notstandskomitee ", // 4
+    ],
+    configure: base_conf,
+    primary_key: Some("id"),
+    ..Conf::BASE
+};
+
+/// Builds the list of benchmark configurations for the songs dataset and runs
+/// them all through `utils::run_benches`.
+///
+/// The groups cover, in order: each criterion alone, the `asc`/`desc` criterion
+/// on top of the default criteria, facet filters, and finally regular searches
+/// (placeholder, with and without quotes, prefix search) using the defaults.
+fn bench_songs(c: &mut criterion::Criterion) {
+    // Default criteria rendered as strings, so they can be chained after an
+    // extra leading `asc(..)` / `desc(..)` criterion below.
+    let default_criterion: Vec<String> = milli::default_criteria()
+        .iter()
+        .map(|criteria| criteria.to_string())
+        .collect();
+    let default_criterion = default_criterion.iter().map(|s| s.as_str());
+    let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
+        .chain(default_criterion.clone())
+        .collect();
+    let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
+        .chain(default_criterion.clone())
+        .collect();
+
+    // Same queries as `BASE_CONF`, with every word wrapped in double quotes.
+    let basic_with_quote: Vec<String> = BASE_CONF
+        .queries
+        .iter()
+        .map(|s| {
+            s.trim()
+                .split(' ')
+                .map(|s| format!(r#""{}""#, s))
+                .collect::<Vec<String>>()
+                .join(" ")
+        })
+        .collect();
+    let basic_with_quote: &[&str] = &basic_with_quote
+        .iter()
+        .map(|s| s.as_str())
+        .collect::<Vec<&str>>();
+
+    let confs = &[
+        /* first we bench each criterion alone */
+        utils::Conf {
+            group_name: "proximity",
+            queries: &[
+                "black saint sinner lady ",
+                "les dangeureuses 1960 ",
+                "The Disneyland Sing-Along Chorus ",
+                "Under Great Northern Lights ",
+                "7000 Danses Un Jour Dans Notre Vie ",
+            ],
+            criterion: Some(&["proximity"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "typo",
+            queries: &[
+                "mongus ",
+                "thelonius monk ",
+                "Disnaylande ",
+                "the white striper ",
+                "indochie ",
+                "indochien ",
+                "klub des loopers ",
+                "fear of the duck ",
+                "michel depech ",
+                "stromal ",
+                "dire straights ",
+                "Arethla Franklin ",
+            ],
+            criterion: Some(&["typo"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "words",
+            queries: &[
+                "the black saint and the sinner lady and the good doggo ", // four words to pop
+                "les liaisons dangeureuses 1793 ",                         // one word to pop
+                "The Disneyland Children's Sing-Alone song ",              // two words to pop
+                "seven nation mummy ",                                     // one word to pop
+                "7000 Danses / Le Baiser / je me trompe de mots ",         // four words to pop
+                "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
+                "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
+            ],
+            criterion: Some(&["words"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "asc",
+            criterion: Some(&["asc(released-timestamp)"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc",
+            criterion: Some(&["desc(released-timestamp)"]),
+            ..BASE_CONF
+        },
+
+        /* then we bench the asc and desc criterion on top of the default criterion */
+        utils::Conf {
+            group_name: "asc + default",
+            criterion: Some(&asc_default[..]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc + default",
+            criterion: Some(&desc_default[..]),
+            ..BASE_CONF
+        },
+
+        /* we bench the filters with the default request */
+        utils::Conf {
+            group_name: "basic filter: <=",
+            facet_condition: Some("released-timestamp <= 946728000"), // year 2000
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic filter: TO",
+            facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "big filter",
+            facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
+            ..BASE_CONF
+        },
+
+        /* then we bench some global / normal search with all the default criterion in the default
+         * order */
+        utils::Conf {
+            group_name: "basic placeholder",
+            queries: &[""],
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic without quote",
+            queries: &BASE_CONF
+                .queries
+                .iter()
+                .map(|s| s.trim()) // we remove the space at the end of each request
+                .collect::<Vec<&str>>(),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic with quote",
+            queries: basic_with_quote,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "prefix search",
+            queries: &[
+                "s", // 500k+ results
+                "a", //
+                "b", //
+                "i", //
+                "x", // only 7k results
+            ],
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_songs);
+criterion_main!(benches);
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@ -0,0 +1,119 @@
+use std::fs::{create_dir_all, remove_dir_all, File};
+
+use criterion::BenchmarkId;
+use heed::EnvOpenOptions;
+use milli::{
+    update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
+    FacetCondition, Index,
+};
+
+/// Description of one benchmark group: which dataset to index, how to
+/// configure the index, and which queries to run against it.
+pub struct Conf<'a> {
+    /// where we are going to create our database.mmdb directory
+    /// each benchmark will first try to delete it and then recreate it
+    pub database_name: &'a str,
+    /// the dataset to be used, it must be an uncompressed csv
+    pub dataset: &'a str,
+    /// name of the group, combined with the dataset path to label the criterion benchmark group
+    pub group_name: &'a str,
+    /// the queries to benchmark, each one is registered as its own benchmark in the group
+    pub queries: &'a [&'a str],
+    /// here you can change which criterion are used and in which order.
+    /// - if you specify something all the base configuration will be thrown out
+    /// - if you don't specify anything (None) the default configuration will be kept
+    pub criterion: Option<&'a [&'a str]>,
+    /// the last chance to configure your database as you want
+    pub configure: fn(&mut Settings),
+    /// optional facet condition expression, parsed and applied to every search of the group
+    pub facet_condition: Option<&'a str>,
+    /// enable or disable the optional words on the query
+    pub optional_words: bool,
+    /// primary key, if there is None we'll auto-generate docids for every documents
+    pub primary_key: Option<&'a str>,
+}
+
+impl Conf<'_> {
+    /// Default configuration, meant to be completed with the struct-update
+    /// syntax (`..Conf::BASE`): no dataset, no queries, default criteria,
+    /// no extra settings, no facet condition, optional words enabled and
+    /// no primary key.
+    pub const BASE: Self = Conf {
+        database_name: "benches.mmdb",
+        dataset: "",
+        group_name: "",
+        queries: &[],
+        criterion: None,
+        configure: |_| (),
+        facet_condition: None,
+        optional_words: true,
+        primary_key: None,
+    };
+}
+
+/// Creates a fresh index for `conf` and fills it with the configured dataset.
+///
+/// Steps, in order:
+/// 1. delete and recreate the `conf.database_name` directory,
+/// 2. open the index and, when requested, store the primary key,
+/// 3. apply the settings (criteria override, then `conf.configure`),
+/// 4. index the CSV dataset found at `conf.dataset`.
+///
+/// # Panics
+///
+/// Panics on any I/O or database error, and when the dataset file cannot be
+/// opened; there is no sensible way to keep benchmarking after such a failure.
+pub fn base_setup(conf: &Conf) -> Index {
+    // A missing directory is fine (first run), anything else is a real error.
+    match remove_dir_all(&conf.database_name) {
+        Ok(_) => (),
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
+        Err(e) => panic!("{}", e),
+    }
+    create_dir_all(&conf.database_name).unwrap();
+
+    let mut options = EnvOpenOptions::new();
+    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+    options.max_readers(10);
+    let index = Index::new(options, conf.database_name).unwrap();
+    if let Some(primary_key) = conf.primary_key {
+        let mut wtxn = index.write_txn().unwrap();
+        index.put_primary_key(&mut wtxn, primary_key).unwrap();
+        // Without an explicit commit the transaction is aborted when dropped
+        // and the primary key would never be persisted.
+        wtxn.commit().unwrap();
+    }
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    // Custom criteria replace the base configuration entirely.
+    if let Some(criterion) = conf.criterion {
+        builder.reset_faceted_fields();
+        builder.reset_criteria();
+        builder.reset_stop_words();
+
+        let criterion = criterion.iter().map(|s| s.to_string()).collect();
+        builder.set_criteria(criterion);
+    }
+
+    (conf.configure)(&mut builder);
+
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.index_documents(&mut wtxn, &index);
+    if conf.primary_key.is_none() {
+        builder.enable_autogenerate_docids();
+    }
+    builder.update_format(UpdateFormat::Csv);
+    builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+    // The message is only built on failure; it keeps the "<message>: <error>" shape of `expect`.
+    let reader = File::open(conf.dataset).unwrap_or_else(|e| {
+        panic!("could not find the dataset in: {}: {:?}", conf.dataset, e)
+    });
+    builder.execute(reader, |_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    index
+}
+
+/// Runs every configuration of `confs` as its own criterion benchmark group.
+///
+/// For each configuration the index is rebuilt from scratch with `base_setup`,
+/// then each query of the configuration is benchmarked; the measured closure
+/// opens a read transaction, builds the search (including the optional facet
+/// condition) and executes it.
+pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
+    for conf in confs {
+        let index = base_setup(conf);
+
+        let group_title = format!("{}: {}", conf.dataset, conf.group_name);
+        let mut group = c.benchmark_group(group_title);
+
+        for &query in conf.queries {
+            let bench_id = BenchmarkId::from_parameter(query);
+            group.bench_with_input(bench_id, &query, |bencher, &input| {
+                bencher.iter(|| {
+                    let rtxn = index.read_txn().unwrap();
+                    let mut search = index.search(&rtxn);
+                    search.query(input).optional_words(conf.optional_words);
+
+                    // The facet condition is parsed inside the measured closure.
+                    let parsed_condition = conf
+                        .facet_condition
+                        .map(|expr| FacetCondition::from_str(&rtxn, &index, expr).unwrap());
+                    if let Some(condition) = parsed_condition {
+                        search.facet_condition(condition);
+                    }
+
+                    let _ids = search.execute().unwrap();
+                });
+            });
+        }
+        group.finish();
+    }
+}
--- a/benchmarks/benches/wiki.rs
+++ b/benchmarks/benches/wiki.rs
@ -0,0 +1,133 @@
+mod datasets_paths;
+mod utils;
+
+use criterion::{criterion_group, criterion_main};
+use milli::update::Settings;
+use utils::Conf;
+
+/// Applies the settings shared by all the wiki benchmarks: the `title`, `body`
+/// and `url` fields are displayed while only `title` and `body` are searchable.
+fn base_conf(builder: &mut Settings) {
+    const DISPLAYED: [&str; 3] = ["title", "body", "url"];
+    const SEARCHABLE: [&str; 2] = ["title", "body"];
+
+    builder.set_displayed_fields(DISPLAYED.iter().map(|field| field.to_string()).collect());
+    builder.set_searchable_fields(SEARCHABLE.iter().map(|field| field.to_string()).collect());
+}
+
+/// Configuration every wiki benchmark starts from.
+///
+/// It targets the `SMOL_WIKI_ARTICLES` dataset, applies the settings of
+/// `base_conf` and takes every other field from `Conf::BASE`.
+///
+/// Each query ends with a space and is annotated with the number of
+/// candidates it is expected to return.
+// NOTE(review): the trailing space presumably prevents the last word from
+// being interpreted as a prefix — confirm against the query parser.
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
+    queries: &[
+        "mingus ",        // 46 candidates
+        "miles davis ",   // 159
+        "rock and roll ", // 1007
+        "machine ",       // 3448
+        "spain ",         // 7002
+        "japan ",         // 10.593
+        "france ",        // 17.616
+        "film ",          // 24.959
+    ],
+    configure: base_conf,
+    ..Conf::BASE
+};
+
+/// Declares and runs all the search benchmarks of the wiki dataset.
+///
+/// The first groups exercise a single ranking criterion each (`proximity`,
+/// `typo`, `words`); the following ones run searches with the criteria left
+/// untouched: placeholder search, the base queries with and without quotes,
+/// and one-letter prefix searches.
+fn bench_wiki(c: &mut criterion::Criterion) {
+    // Same queries as `BASE_CONF`, but with every single word wrapped in
+    // double quotes: `miles davis ` becomes `"miles" "davis"`.
+    let basic_with_quote: Vec<String> = BASE_CONF
+        .queries
+        .iter()
+        .map(|s| {
+            s.trim()
+                .split(' ')
+                .map(|s| format!(r#""{}""#, s))
+                .collect::<Vec<String>>()
+                .join(" ")
+        })
+        .collect();
+    // `Conf::queries` expects string slices, so borrow the owned queries.
+    let basic_with_quote: &[&str] = &basic_with_quote
+        .iter()
+        .map(|s| s.as_str())
+        .collect::<Vec<&str>>();
+
+    let confs = &[
+        /* first we bench each criterion alone */
+        utils::Conf {
+            group_name: "proximity",
+            queries: &[
+                "herald sings ",
+                "april paris ",
+                "tea two ",
+                "diesel engine ",
+            ],
+            criterion: Some(&["proximity"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "typo",
+            queries: &[
+                "migrosoft ",
+                "linax ",
+                "Disnaylande ",
+                "phytogropher ",
+                "nympalidea ",
+                "aritmetric ",
+                "the fronce ",
+                "sisan ",
+            ],
+            criterion: Some(&["typo"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "words",
+            queries: &[
+                "the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
+                "Kameya Tokujirō mingus monk ",                           // two words to pop, 55
+                "Ulrich Hensel meilisearch milli ",                        // two words to pop, 306
+                "Idaho Bellevue pizza ",                                   // one word to pop, 800
+                "Abraham machin ",                                         // one word to pop, 1141
+            ],
+            criterion: Some(&["words"]),
+            ..BASE_CONF
+        },
+        /* then we bench some global / normal search with all the default criterion in the default
+         * order */
+        utils::Conf {
+            group_name: "basic placeholder",
+            queries: &[""],
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic without quote",
+            queries: &BASE_CONF
+                .queries
+                .iter()
+                .map(|s| s.trim()) // we remove the space at the end of each request
+                .collect::<Vec<&str>>(),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic with quote",
+            queries: basic_with_quote,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "prefix search",
+            queries: &[
+                "t", // 453k results
+                "c", // 405k
+                "g", // 318k
+                "j", // 227k
+                "q", // 71k
+                "x", // 17k
+            ],
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_wiki);
+criterion_main!(benches);
--- a/benchmarks/build.rs
+++ b/benchmarks/build.rs
@ -0,0 +1,80 @@
+use std::path::{Path, PathBuf};
+use std::{env, fs};
+use std::{
+    fs::File,
+    io::{Cursor, Read, Seek, Write},
+};
+
+use bytes::Bytes;
+use convert_case::{Case, Casing};
+use flate2::read::GzDecoder;
+use reqwest::IntoUrl;
+
+const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
+
+const DATASET_SONGS: &str = "smol-songs";
+const DATASET_WIKI: &str = "smol-wiki-articles";
+
+/// The name of the environment variable used to select the path
+/// of the directory containing the datasets
+const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
+
+/// Build script entry point: generates `benches/datasets_paths.rs` and makes
+/// sure every benchmark dataset is available on disk.
+///
+/// The datasets are stored in the directory named by the
+/// `MILLI_BENCH_DATASETS_PATH` environment variable when it is set, and in
+/// cargo's `OUT_DIR` otherwise. A dataset whose CSV file already exists is not
+/// downloaded again.
+///
+/// # Errors
+///
+/// Fails when a required environment variable is missing, when the generated
+/// file cannot be written, or when a dataset cannot be downloaded or
+/// uncompressed.
+fn main() -> anyhow::Result<()> {
+    // Only read (and require) `OUT_DIR` when no explicit datasets directory
+    // was given; an eager fallback would fail even though it is not needed.
+    let out_dir = match env::var(BASE_DATASETS_PATH_KEY) {
+        Ok(path) => PathBuf::from(path),
+        Err(_) => PathBuf::from(env::var("OUT_DIR")?),
+    };
+
+    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
+    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
+    writeln!(
+        manifest_paths_file,
+        r#"//! This file is generated by the build script.
+//! Do not modify by hand, use the build.rs file.
+#![allow(dead_code)]
+"#
+    )?;
+    writeln!(manifest_paths_file)?;
+
+    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+        let out_path = out_dir.join(dataset);
+        let out_file = out_path.with_extension("csv");
+
+        // One constant per dataset, e.g. `smol-songs` => `SMOL_SONGS`, holding
+        // the path of its CSV file. It is emitted even when the download is
+        // skipped below.
+        writeln!(
+            &mut manifest_paths_file,
+            r#"pub const {}: &str = {:?};"#,
+            dataset.to_case(Case::ScreamingSnake),
+            out_file.display(),
+        )?;
+
+        if out_file.exists() {
+            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
+            continue;
+        }
+        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        eprintln!("downloading: {}", url);
+        let bytes = download_dataset(url.as_str())?;
+        eprintln!("{} downloaded successfully", url);
+        // Report the file that is actually written, extension included.
+        eprintln!("uncompressing in {}", out_file.display());
+        uncompress_in_file(bytes, &out_file)?;
+    }
+
+    Ok(())
+}
+
+/// Downloads the resource located at `url` and returns its whole body,
+/// buffered in memory, behind a `Cursor`.
+///
+/// The request is sent with the client timeout disabled.
+///
+/// # Errors
+///
+/// Returns an error if the HTTP client cannot be built, if the request fails,
+/// if the server answers with a 4xx/5xx status or if the body cannot be read.
+fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
+    let bytes = reqwest::blocking::Client::builder()
+        .timeout(None)
+        .build()?
+        .get(url)
+        .send()?
+        // Without this check the body of an error page (404, 500, ...) would
+        // be returned as if it were the dataset.
+        .error_for_status()?
+        .bytes()?;
+    Ok(Cursor::new(bytes))
+}
+
+/// Decodes the gzip stream `bytes` and stores the decoded content in the file
+/// located at `path`.
+///
+/// The stream is entirely decoded in memory first; the file is only written
+/// once the decoding succeeded.
+fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
+    let mut decoded = Vec::new();
+    GzDecoder::new(bytes).read_to_end(&mut decoded)?;
+
+    fs::write(path.as_ref(), decoded)?;
+    Ok(())
+}
--- a/benchmarks/scripts/compare.sh
+++ b/benchmarks/scripts/compare.sh
@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# Requirements:
+# - critcmp. See: https://github.com/BurntSushi/critcmp
+# - curl
+
+# Usage
+# $ bash compare.sh json_file1 json_file1
+# ex: bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json
+
+# Checking that critcmp is installed
+command -v critcmp > /dev/null 2>&1
+if [[ "$?" -ne 0 ]]; then
+    echo 'You must install critcmp to make this script working.'
+    echo '$ cargo install critcmp'
+    echo 'See: https://github.com/BurntSushi/critcmp'
+    exit 1
+fi
+
+if [[ $# -ne 2 ]]
+  then
+    echo 'Need 2 arguments.'
+    echo 'Usage: '
+    echo '  $ ./compare.sh file_to_download1 file_to_download2'
+    echo 'Ex:'
+    echo '  $ ./compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json'
+    exit 1
+fi
+
+file1="$1"
+file2="$2"
+s3_url='https://milli-benchmarks.fra1.digitaloceanspaces.com/critcmp_results'
+file1_s3_url="$s3_url/$file1"
+file2_s3_url="$s3_url/$file2"
+file1_local_path="/tmp/$file1"
+file2_local_path="/tmp/$file2"
+
+if [[ ! -f "$file1_local_path" ]]; then
+    curl "$file1_s3_url" -O "$file1_local_path"
+    if [[ "$?" -ne 0 ]]; then
+	    echo 'curl command failed.'
+	    exit 1
+    fi
+else
+    echo "$file1 already present in /tmp, no need to download."
+fi
+
+if [[ ! -f "$file2_local_path" ]]; then
+    curl "$file2_s3_url" -O "$file2_local_path"
+    if [[ "$?" -ne 0 ]]; then
+	    echo 'curl command failed.'
+	    exit 1
+    fi
+else
+    echo "$file2 already present in /tmp, no need to download."
+fi
+
+critcmp --color always "$file1_local_path" "$file2_local_path"
--- a/benchmarks/scripts/list.sh
+++ b/benchmarks/scripts/list.sh
@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+# Lists the names of the files stored under `critcmp_results/` in the
+# benchmarks bucket. Every argument is a grep pattern: only the names matching
+# all the given patterns are printed.
+
+# Requirements:
+# - curl
+# - grep
+
+res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -o '<Key>[^<]\+' | cut -c 5- | grep critcmp_results/ | cut -c 18-)
+
+for pattern in "$@"
+do
+    # The pattern is quoted to avoid word splitting and globbing, and passed
+    # with -e so that a pattern starting with `-` is not read as an option.
+    res=$(echo "$res" | grep -e "$pattern")
+done
+
+echo "$res"
--- a/benchmarks/src/lib.rs
+++ b/benchmarks/src/lib.rs
@ -0,0 +1,5 @@
+//! This library is only used to isolate the benchmarks
+//! from the original milli library.
+//!
+//! It does not contain anything of interest to users of the milli library;
+//! it is only meant for milli contributors.
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@ -53,13 +53,8 @@ tinytemplate = "=1.1.0"

 [dev-dependencies]
 big_s = "1.0.2"
-criterion = "0.3.4"
 maplit = "1.0.2"
 rand = "0.8.3"

 [features]
 default = []
-
-[[bench]]
-name = "search"
-harness = false
--- a/milli/benches/search.rs
+++ b/milli/benches/search.rs
@ -1,36 +0,0 @@
-use std::time::Duration;
-
-use heed::EnvOpenOptions;
-use milli::Index;
-use criterion::{criterion_group, criterion_main, BenchmarkId};
-
-fn bench_search(c: &mut criterion::Criterion) {
-    let database = "books-4cpu.mmdb";
-    let queries = [
-        "minogue kylie",
-        "minogue kylie live",
-    ];
-
-    let mut options = EnvOpenOptions::new();
-    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-    options.max_readers(10);
-    let index = Index::new(options, database).unwrap();
-
-    let mut group = c.benchmark_group("search");
-    group.sample_size(10);
-    group.measurement_time(Duration::from_secs(12));
-
-    for query in &queries {
-        group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
-            b.iter(|| {
-                let rtxn = index.read_txn().unwrap();
-                let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap();
-            });
-        });
-    }
-
-    group.finish();
-}
-
-criterion_group!(benches, bench_search);
-criterion_main!(benches);