From 06c414a75388bd34f7dc3e768f433e0b5bafac23 Mon Sep 17 00:00:00 2001
From: tamo
Date: Tue, 25 May 2021 17:09:14 +0200
Subject: [PATCH] move the benchmarks to another crate so we can download the
 datasets automatically without adding overhead to the build of milli

---
 Cargo.toml                             |  2 +-
 benchmarks/Cargo.toml                  | 29 ++++++++++
 benchmarks/README.md                   | 30 ++++++++++
 {milli => benchmarks}/benches/songs.rs |  3 +-
 {milli => benchmarks}/benches/utils.rs | 21 ++-----
 {milli => benchmarks}/benches/wiki.rs  |  3 +-
 benchmarks/build.rs                    | 80 ++++++++++++++++++++++++++
 benchmarks/src/lib.rs                  |  5 ++
 milli/Cargo.toml                       |  9 ---
 milli/benches/README.md                | 27 ---------
 10 files changed, 154 insertions(+), 55 deletions(-)
 create mode 100644 benchmarks/Cargo.toml
 create mode 100644 benchmarks/README.md
 rename {milli => benchmarks}/benches/songs.rs (99%)
 rename {milli => benchmarks}/benches/utils.rs (81%)
 rename {milli => benchmarks}/benches/wiki.rs (98%)
 create mode 100644 benchmarks/build.rs
 create mode 100644 benchmarks/src/lib.rs
 delete mode 100644 milli/benches/README.md

diff --git a/Cargo.toml b/Cargo.toml
index a60c293e3..ff0b2582a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["milli", "http-ui", "infos", "helpers", "search"]
+members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
 default-members = ["milli"]
 
 [profile.release]
diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
new file mode 100644
index 000000000..f7b66fe3a
--- /dev/null
+++ b/benchmarks/Cargo.toml
@@ -0,0 +1,29 @@
+[package]
+name = "benchmarks"
+version = "0.1.0"
+edition = "2018"
+publish = false
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+milli = { path = "../milli" }
+
+[dev-dependencies]
+heed = "*" # we want to use the version milli uses
+criterion = "0.3.4"
+
+[build-dependencies]
+anyhow = "1.0"
+bytes = "1.0"
+flate2 = "1.0.20"
+convert_case = "0.4"
+reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
+
+[[bench]]
+name = "songs"
+harness = false
+
+[[bench]]
+name = "wiki"
+harness = false
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..8c91700e9
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,30 @@
+Benchmarks
+==========
+
+For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:
+```
+xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
+```
+You can download the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz),
+and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
+
+We also use a subset of `wikipedia-articles.csv`, generated with the following command:
+```
+xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
+```
+You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).
+
+-----
+
+- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h
+- You can also run the benchmarks on the `songs` dataset only with `cargo bench --bench songs`; it should take around 1h
+- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h
+
+By default the datasets are downloaded and uncompressed automatically into the target directory.
+If you don't want to download the datasets every time you update the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
+```
+mkdir ~/datasets
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+touch build.rs
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
+```
diff --git a/milli/benches/songs.rs b/benchmarks/benches/songs.rs
similarity index 99%
rename from milli/benches/songs.rs
rename to benchmarks/benches/songs.rs
index 430b73a40..dd52a0afc 100644
--- a/milli/benches/songs.rs
+++ b/benchmarks/benches/songs.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -33,7 +34,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-songs.csv",
+    dataset: datasets_paths::SMOL_SONGS,
     queries: &[
         "john ",  // 9097
         "david ", // 4794
diff --git a/milli/benches/utils.rs b/benchmarks/benches/utils.rs
similarity index 81%
rename from milli/benches/utils.rs
rename to benchmarks/benches/utils.rs
index f3f5e9bf6..e0feb9b0e 100644
--- a/milli/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -7,15 +7,6 @@ use milli::{
     FacetCondition, Index,
 };
 
-/// The name of the environment variable used to select the path
-/// of the directory containing the datasets
-const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
-
-/// The default path for the dataset if nothing is specified
-/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be
-/// executed with a pwd of `milli/milli`
-const DEFAULT_DATASETS_PATH: &str = "milli/benches";
-
 pub struct Conf<'a> {
     /// where we are going to create our database.mmdb directory
     /// each benchmark will first try to delete it and then recreate it
@@ -33,6 +24,8 @@ pub struct Conf<'a> {
     pub facet_condition: Option<&'a str>,
     /// enable or disable the optional words on the query
     pub optional_words: bool,
+    /// the primary key; if it is `None`, docids are auto-generated for every document
+    pub primary_key: Option<&'a str>,
 }
 
 impl Conf<'_> {
@@ -47,6 +40,7 @@ impl Conf<'_> {
         configure: Self::nop,
         facet_condition: None,
         optional_words: true,
+        primary_key: None,
     };
 }
 
@@ -86,13 +80,8 @@ pub fn base_setup(conf: &Conf) -> Index {
     let mut builder = update_builder.index_documents(&mut wtxn, &index);
     builder.update_format(UpdateFormat::Csv);
     builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-    // we called from cargo the current directory is supposed to be milli/milli
-    let base_dataset_path = std::env::vars()
-        .find(|var| var.0 == BASE_DATASETS_PATH_KEY)
-        .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
-    let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
-    let reader = File::open(&dataset_path)
-        .expect(&format!("could not find the dataset in: {}", &dataset_path));
+    let reader = File::open(conf.dataset)
+        .expect(&format!("could not find the dataset in: {}", conf.dataset));
     builder.execute(reader, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
 
diff --git a/milli/benches/wiki.rs b/benchmarks/benches/wiki.rs
similarity index 98%
rename from milli/benches/wiki.rs
rename to benchmarks/benches/wiki.rs
index 8c15f11ca..99ecff2ce 100644
--- a/milli/benches/wiki.rs
+++ b/benchmarks/benches/wiki.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -16,7 +17,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-wiki-articles.csv",
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
     queries: &[
         "mingus ",      // 46 candidates
         "miles davis ", // 159
diff --git a/benchmarks/build.rs b/benchmarks/build.rs
new file mode 100644
index 000000000..dc92a1a4c
--- /dev/null
+++ b/benchmarks/build.rs
@@ -0,0 +1,80 @@
+use std::path::{Path, PathBuf};
+use std::{env, fs};
+use std::{
+    fs::File,
+    io::{Cursor, Read, Seek, Write},
+};
+
+use bytes::Bytes;
+use convert_case::{Case, Casing};
+use flate2::read::GzDecoder;
+use reqwest::IntoUrl;
+
+const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
+
+const DATASET_SONGS: &str = "smol-songs";
+const DATASET_WIKI: &str = "smol-wiki-articles";
+
+/// The name of the environment variable used to select the path
+/// of the directory containing the datasets
+const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
+
+fn main() -> anyhow::Result<()> {
+    let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));
+
+    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
+    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
+    writeln!(
+        manifest_paths_file,
+        r#"//! This file is generated by the build script.
+//! Do not modify by hand, use the build.rs file.
+#![allow(dead_code)]
+"#
+    )?;
+    writeln!(manifest_paths_file)?;
+
+    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+        let out_path = out_dir.join(dataset);
+        let out_file = out_path.with_extension("csv");
+
+        writeln!(
+            &mut manifest_paths_file,
+            r#"pub const {}: &str = {:?};"#,
+            dataset.to_case(Case::ScreamingSnake),
+            out_file.display(),
+        )?;
+
+        if out_file.exists() {
+            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
+            continue;
+        }
+        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        eprintln!("downloading: {}", url);
+        let bytes = download_dataset(url.clone())?;
+        eprintln!("{} downloaded successfully", url);
+        eprintln!("uncompressing in {}", out_path.display());
+        uncompress_in_file(bytes, &out_file)?;
+    }
+
+    Ok(())
+}
+
+fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
+    let bytes = reqwest::blocking::Client::builder()
+        .timeout(None)
+        .build()?
+        .get(url)
+        .send()?
+        .bytes()?;
+    Ok(Cursor::new(bytes))
+}
+
+fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
+    let path = path.as_ref();
+    let mut gz = GzDecoder::new(bytes);
+    let mut dataset = Vec::new();
+    gz.read_to_end(&mut dataset)?;
+
+    fs::write(path, dataset)?;
+    Ok(())
+}
diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs
new file mode 100644
index 000000000..4281ec115
--- /dev/null
+++ b/benchmarks/src/lib.rs
@@ -0,0 +1,5 @@
+//! This library is only used to isolate the benchmarks
+//! from the original milli library.
+//!
+//! It does not provide anything useful to milli library users,
+//! only to milli contributors.
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 1c0f74613..2af6a9042 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -53,17 +53,8 @@ tinytemplate = "=1.1.0"
 
 [dev-dependencies]
 big_s = "1.0.2"
-criterion = "0.3.4"
 maplit = "1.0.2"
 rand = "0.8.3"
 
 [features]
 default = []
-
-[[bench]]
-name = "songs"
-harness = false
-
-[[bench]]
-name = "wiki"
-harness = false
diff --git a/milli/benches/README.md b/milli/benches/README.md
deleted file mode 100644
index b2c1aec15..000000000
--- a/milli/benches/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-Benchmarks
-==========
-
-For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command:
-```
-xsv sample --seed 42 song.csv -o smol-songs.csv
-```
-You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)
-And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
-
-You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz`
-You can run the following command from the root of this git repository
-```
-wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
-```
-
-- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h
-- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h
-- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h
-
-By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that:
-```
-MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
-```
-
-Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html)
-
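
Note on the generated module: the build script above writes a `datasets_paths.rs` file into `benchmarks/benches/`, which the `songs` and `wiki` benches pull in with `mod datasets_paths;`. As a rough illustration only (the actual paths are machine-specific, resolved from `OUT_DIR` or `MILLI_BENCH_DATASETS_PATH` at build time, and the `benchmarks-0123abcd` directory name below is made up), the generated file looks like this:

```rust
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

// Hypothetical values: the build script writes whichever directory it
// actually unpacked the datasets into (OUT_DIR by default).
pub const SMOL_SONGS: &str =
    "/home/user/milli/target/release/build/benchmarks-0123abcd/out/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str =
    "/home/user/milli/target/release/build/benchmarks-0123abcd/out/smol-wiki-articles.csv";
```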