From 0d0e900158cba3c450c99e241db18c224d1a4a21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?=
Date: Wed, 26 May 2021 15:57:22 +0200
Subject: [PATCH] Add CI for benchmarks

---
 .github/workflows/benchmarks.yml |  63 +++++++++++++++++++
 benchmarks/README.md             | 101 +++++++++++++++++++++++++++----
 benchmarks/scripts/compare.sh    |  58 ++++++++++++++++++
 3 files changed, 209 insertions(+), 13 deletions(-)
 create mode 100644 .github/workflows/benchmarks.yml
 create mode 100644 benchmarks/scripts/compare.sh

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 000000000..867e13132
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,63 @@
+name: Benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      dataset_name:
+        description: 'The name of the dataset used to benchmark (songs or wiki)'
+        required: false
+        default: 'songs'
+
+jobs:
+  benchmarks:
+    name: Run and upload benchmarks
+    runs-on: self-hosted
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+
+      # Set variables (step outputs are appended to the $GITHUB_OUTPUT file; the `set-output` command is deprecated)
+      - name: Set current branch name
+        shell: bash
+        run: echo "name=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT"
+        id: current_branch
+      - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
+        shell: bash
+        run: echo "name=$(echo "${GITHUB_REF#refs/heads/}" | tr '/' '_')" >> "$GITHUB_OUTPUT"
+        id: normalized_current_branch
+      - name: Set shorter commit SHA
+        shell: bash
+        run: echo "short=$(echo "$GITHUB_SHA" | cut -c1-8)" >> "$GITHUB_OUTPUT"
+        id: commit_sha
+      - name: Set file basename with format "dataset_branch_commitSHA"
+        shell: bash
+        run: echo "basename=${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }}" >> "$GITHUB_OUTPUT"
+        id: file
+
+      # Run benchmarks (the Criterion baseline is saved under the file basename computed above)
+      - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
+        run: |
+          cd benchmarks
+          cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }}
+
+      # Generate critcmp files
+      - name: Install critcmp
+        run: cargo install critcmp
+      - name: Export critcmp file
+        run: |
+          critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
+
+      # Upload benchmarks
+      - name: Upload to DO Spaces # DigitalOcean Spaces = S3
+        uses: BetaHuhn/do-spaces-action@v2
+        with:
+          access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
+          secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
+          space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
+          space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
+          source: ${{ steps.file.outputs.basename }}.json
+          out_dir: critcmp_results
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 8c91700e9..cde4062e5 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,30 +1,105 @@
 Benchmarks
 ==========
 
-For our benchmark we are using a small subset of the dataset `songs.csv`. It was generated with this command:
-```
+## TOC
+
+- [Datasets](#datasets)
+- [Run the benchmarks](#run-the-benchmarks)
+- [Comparison between benchmarks](#comparison-between-benchmarks)
+
+## Datasets
+
+The benchmarks are available for the following datasets:
+- `songs`
+- `wiki`
+
+### Songs
+
+`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
+
+It was generated with this command:
+
+```bash
 xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
 ```
-You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)
-And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
 
-We also use a subset of `wikipedia-articles.csv` that was generated with the following command:
-```
+_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._
+
+### Wiki
+
+`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz).
+
+It was generated with the following command:
+
+```bash
 xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
 ```
-You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).
 
------
+_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz)._
 
-- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h
-- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h
-- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h
+## Run the benchmarks
 
-By default the benchmarks will be downloaded and uncompressed automatically in the target directory.
-If you don't want to download the datasets everytime you updates something on the code you can specify a custom directory with the env variable `MILLI_BENCH_DATASETS_PATH`:
+### On our private server
+
+The Meili team has self-hosted its own GitHub runner to run benchmarks on our dedicated bare metal server.
+
+To trigger the benchmark workflow:
+- Go to the `Actions` tab of this repository.
+- Select the `Benchmarks` workflow on the left.
+- Click on `Run workflow` in the blue banner.
+- Select the branch on which you want to run the benchmarks and select the dataset you want (default: `songs`).
+- Finally, click on `Run workflow`.
+
+This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3).
+
+_[More about critcmp](https://github.com/BurntSushi/critcmp)._
+
+### On your machine
+
+To run all the benchmarks (~4h):
+
+```bash
+cargo bench
 ```
+
+To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
+
+```bash
+cargo bench --bench songs # or: cargo bench --bench wiki
+```
+
+By default, the benchmarks will be downloaded and uncompressed automatically in the target directory.
+If you don't want to download the datasets every time you update something on the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
+
+```bash
 mkdir ~/datasets
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
 touch build.rs
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
 ```
+
+## Comparison between benchmarks
+
+The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to generate comparison results between 2 benchmarks.
+
+We provide a script to download and display the comparison report.
+
+Requirements:
+- [`s3cmd`](https://github.com/s3tools/s3cmd) and being logged in to the DigitalOcean Space "milli-benchmarks". See the [DigitalOcean guide](https://docs.digitalocean.com/products/spaces/resources/s3cmd/)
+- [`critcmp`](https://github.com/BurntSushi/critcmp)
+
+List the available files in the DO Space:
+
+```bash
+s3cmd ls s3://milli-benchmarks/critcmp_results/
+```
+```bash
+2021-05-31 14:40 279890 s3://milli-benchmarks/critcmp_results/songs_main_09a4321.json
+2021-05-31 13:49 279576 s3://milli-benchmarks/critcmp_results/songs_geosearch_24ec456.json
+```
+
+Run the comparison script:
+
+```bash
+bash benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json
+```
diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh
new file mode 100644
index 000000000..868baeacf
--- /dev/null
+++ b/benchmarks/scripts/compare.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# Requirements:
+# - s3cmd and being logged in to the DO Space "milli-benchmarks". See: https://docs.digitalocean.com/products/spaces/resources/s3cmd/
+# - critcmp. See: https://github.com/BurntSushi/critcmp
+
+# Usage
+# $ bash compare.sh json_file1 json_file2
+# ex: bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json
+
+# Checking that critcmp is installed
+if ! command -v critcmp > /dev/null 2>&1; then
+    echo 'You must install critcmp to make this script working.'
+    echo '$ cargo install critcmp'
+    echo 'See: https://github.com/BurntSushi/critcmp'
+    exit 1
+fi
+
+# Checking that s3cmd is installed
+if ! command -v s3cmd > /dev/null 2>&1; then
+    echo 'You must install s3cmd to make this script working.'
+    echo 'See: https://github.com/s3tools/s3cmd'
+    exit 1
+fi
+
+# Checking that exactly two report names were given
+if [[ $# -ne 2 ]]; then
+    echo 'Need 2 arguments.'
+    echo 'Usage: '
+    echo ' $ bash compare.sh file_to_download1 file_to_download2'
+    echo 'Ex:'
+    echo ' $ bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json'
+    exit 1
+fi
+
+file1="$1"
+file2="$2"
+s3_path='s3://milli-benchmarks/critcmp_results'
+file1_s3_path="$s3_path/$file1"
+file2_s3_path="$s3_path/$file2"
+file1_local_path="/tmp/$file1"
+file2_local_path="/tmp/$file2"
+
+# Download each report unless it is already cached in /tmp. If a download fails, remove
+# whatever was written and stop, so critcmp never reads a missing or partial report.
+if [[ ! -f "$file1_local_path" ]]; then
+    s3cmd get "$file1_s3_path" "$file1_local_path" || { rm -f "$file1_local_path"; exit 1; }
+else
+    echo "$file1 already present in /tmp, no need to download."
+fi
+
+if [[ ! -f "$file2_local_path" ]]; then
+    s3cmd get "$file2_s3_path" "$file2_local_path" || { rm -f "$file2_local_path"; exit 1; }
+else
+    echo "$file2 already present in /tmp, no need to download."
+fi
+
+critcmp --color always "$file1_local_path" "$file2_local_path"