From 0d0e900158cba3c450c99e241db18c224d1a4a21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= <clementine@meilisearch.com>
Date: Wed, 26 May 2021 15:57:22 +0200
Subject: [PATCH] Add CI for benchmarks

---
 .github/workflows/benchmarks.yml |  63 +++++++++++++++++++
 benchmarks/README.md             | 101 +++++++++++++++++++++++++++----
 benchmarks/scripts/compare.sh    |  58 ++++++++++++++++++
 3 files changed, 209 insertions(+), 13 deletions(-)
 create mode 100644 .github/workflows/benchmarks.yml
 create mode 100644 benchmarks/scripts/compare.sh

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 000000000..867e13132
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,63 @@
+name: Benchmarks
+
+on:
+  workflow_dispatch: # manual trigger only, from the Actions tab
+    inputs:
+      dataset_name:
+        description: 'The name of the dataset used to benchmark (songs or wiki)'
+        required: false
+        default: 'songs'
+
+jobs:
+  benchmarks:
+    name: Run and upload benchmarks
+    runs-on: self-hosted
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+
+      # Set variables: each step publishes one step output by appending `key=value` to $GITHUB_OUTPUT
+      - name: Set current branch name
+        shell: bash
+        run: echo "name=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT"
+        id: current_branch
+      - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
+        shell: bash
+        run: echo "name=$(echo "${GITHUB_REF#refs/heads/}" | tr '/' '_')" >> "$GITHUB_OUTPUT"
+        id: normalized_current_branch
+      - name: Set shorter commit SHA
+        shell: bash
+        run: echo "short=$(echo "$GITHUB_SHA" | cut -c1-8)" >> "$GITHUB_OUTPUT"
+        id: commit_sha
+      - name: Set file basename with format "dataset_branch_commitSHA"
+        shell: bash
+        run: echo "basename=${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }}" >> "$GITHUB_OUTPUT"
+        id: file
+
+      # Run benchmarks for the selected dataset and save the results as a baseline named after the file basename
+      - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} -  Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
+        run: |
+          cd benchmarks
+          cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }}
+
+      # Generate critcmp files: export the saved baseline to "<basename>.json"
+      - name: Install critcmp
+        run: cargo install critcmp
+      - name: Export critcmp file
+        run: |
+          critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
+
+      # Upload benchmarks: push the exported JSON file to the `critcmp_results` directory of the Space
+      - name: Upload to DO Spaces # DigitalOcean Spaces = S3
+        uses: BetaHuhn/do-spaces-action@v2
+        with:
+          access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
+          secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
+          space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
+          space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
+          source: ${{ steps.file.outputs.basename }}.json
+          out_dir: critcmp_results
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 8c91700e9..cde4062e5 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,30 +1,105 @@
 Benchmarks
 ==========
 
-For our benchmark we are using a small subset of the dataset `songs.csv`. It was generated with this command:
-```
+## TOC
+
+- [Datasets](#datasets)
+- [Run the benchmarks](#run-the-benchmarks)
+- [Comparison between benchmarks](#comparison-between-benchmarks)
+
+## Datasets
+
+The benchmarks are available for the following datasets:
+- `songs`
+- `wiki`
+
+### Songs
+
+`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
+
+It was generated with this command:
+
+```bash
 xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
 ```
-You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)
-And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
 
-We also use a subset of `wikipedia-articles.csv` that was generated with the following command:
-```
+_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._
+
+### Wiki
+
+`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz).
+
+It was generated with the following command:
+
+```bash
 xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
 ```
-You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).
 
------
+_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz)._
 
-- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h
-- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h
-- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h
+## Run the benchmarks
 
-By default the benchmarks will be downloaded and uncompressed automatically in the target directory.
-If you don't want to download the datasets everytime you updates something on the code you can specify a custom directory with the env variable `MILLI_BENCH_DATASETS_PATH`:
+### On our private server
+
+The Meili team has self-hosted its own GitHub runner to run benchmarks on our dedicated bare metal server.
+
+To trigger the benchmark workflow:
+- Go to the `Actions` tab of this repository.
+- Select the `Benchmarks` workflow on the left.
+- Click on `Run workflow` in the blue banner.
+- Select the branch on which you want to run the benchmarks and select the dataset you want (default: `songs`).
+- Finally, click on `Run workflow`.
+
+This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3).
+
+_[More about critcmp](https://github.com/BurntSushi/critcmp)._
+
+### On your machine
+
+To run all the benchmarks (~4h):
+
+```bash
+cargo bench
 ```
+
+To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
+
+```bash
+cargo bench --bench <dataset name>
+```
+
+By default, the datasets will be downloaded and uncompressed automatically in the target directory.<br>
+If you don't want to download the datasets every time you update something in the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
+
+```bash
 mkdir ~/datasets
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
 touch build.rs
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
 ```
+
+## Comparison between benchmarks
+
+The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to generate comparison results between 2 benchmarks.
+
+We provide a script to download and display the comparison report.
+
+Requirements:
+- [`s3cmd`](https://github.com/s3tools/s3cmd) and being logged in to the DigitalOcean Space "milli-benchmarks". See the [DigitalOcean guide](https://docs.digitalocean.com/products/spaces/resources/s3cmd/)
+- [`critcmp`](https://github.com/BurntSushi/critcmp)
+
+List the available files in the DO Space:
+
+```bash
+s3cmd ls s3://milli-benchmarks/critcmp_results/
+```
+```bash
+2021-05-31 14:40       279890  s3://milli-benchmarks/critcmp_results/songs_main_09a4321.json
+2021-05-31 13:49       279576  s3://milli-benchmarks/critcmp_results/songs_geosearch_24ec456.json
+```
+
+Run the comparison script:
+
+```bash
+bash benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json
+```
diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh
new file mode 100644
index 000000000..868baeacf
--- /dev/null
+++ b/benchmarks/scripts/compare.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# Download two critcmp reports from the DO Space "milli-benchmarks" (unless they
+# are already present in /tmp) and display their comparison with critcmp.
+# Requirements:
+# - s3cmd and being logged in to the DO Space. See: https://docs.digitalocean.com/products/spaces/resources/s3cmd/
+# - critcmp. See: https://github.com/BurntSushi/critcmp
+# Usage: bash compare.sh json_file1 json_file2
+# ex: bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json
+
+# Check that critcmp is installed: `command -v` exits non-zero when the
+# command cannot be found. Diagnostics go to stderr.
+if ! command -v critcmp > /dev/null 2>&1; then
+    echo 'You must install critcmp to make this script working.' >&2
+    echo '$ cargo install critcmp' >&2
+    echo 'See: https://github.com/BurntSushi/critcmp' >&2
+    exit 1
+fi
+
+# Check that s3cmd is installed; it is used below to download the reports
+# from the DO Space.
+if ! command -v s3cmd > /dev/null 2>&1; then
+    echo 'You must install s3cmd to make this script working.' >&2
+    echo 'See: https://github.com/s3tools/s3cmd' >&2
+    exit 1
+fi
+
+# Exactly two report file names are required.
+if [[ $# -ne 2 ]]; then
+    echo 'Need 2 arguments.' >&2
+    echo 'Usage: ' >&2
+    echo '  $ bash compare.sh file_to_download1 file_to_download2' >&2
+    echo 'Ex:' >&2
+    echo '  $ bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' >&2
+    exit 1
+fi
+
+file1="$1"
+file2="$2"
+s3_path='s3://milli-benchmarks/critcmp_results'
+file1_s3_path="$s3_path/$file1"
+file2_s3_path="$s3_path/$file2"
+file1_local_path="/tmp/$file1"
+file2_local_path="/tmp/$file2"
+
+if [[ -f "$file1_local_path" ]]; then
+    echo "$file1 already present in /tmp, no need to download."
+else
+    s3cmd get "$file1_s3_path" "$file1_local_path" || exit 1 # nothing to compare without it
+fi
+
+if [[ -f "$file2_local_path" ]]; then
+    echo "$file2 already present in /tmp, no need to download."
+else
+    s3cmd get "$file2_s3_path" "$file2_local_path" || exit 1 # nothing to compare without it
+fi
+
+critcmp --color always "$file1_local_path" "$file2_local_path"