move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli
This commit is contained in:
parent 3c84075d2d
commit 06c414a753
Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["milli", "http-ui", "infos", "helpers", "search"]
+members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
 default-members = ["milli"]
 
 [profile.release]
benchmarks/Cargo.toml (new file, 29 lines)
@@ -0,0 +1,29 @@
[package]
name = "benchmarks"
version = "0.1.0"
edition = "2018"
publish = false

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
milli = { path = "../milli" }

[dev-dependencies]
heed = "*" # we want to use the version milli uses
criterion = "0.3.4"

[build-dependencies]
anyhow = "1.0"
bytes = "1.0"
flate2 = "1.0.20"
convert_case = "0.4"
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }

[[bench]]
name = "songs"
harness = false

[[bench]]
name = "wiki"
harness = false
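The two `[[bench]]` targets above set `harness = false`, which tells cargo not to link the default libtest bench harness; criterion supplies its own `main` instead. A minimal sketch of what such a bench target looks like (the function body here is a hypothetical placeholder, not one of the real benches, which appear in the diffs below):

```rust
use criterion::{criterion_group, criterion_main, Criterion};

// Placeholder benchmark body; the real benches index a dataset and
// run search queries against milli.
fn bench_example(c: &mut Criterion) {
    c.bench_function("example", |b| b.iter(|| 2 + 2));
}

criterion_group!(benches, bench_example);
criterion_main!(benches); // expands to the `main` that `harness = false` requires
```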
benchmarks/README.md (new file, 30 lines)
@@ -0,0 +1,30 @@
Benchmarks
==========

For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:
```
xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
```
You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz),
and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).

We also use a subset of `wikipedia-articles.csv` that was generated with the following command:
```
xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
```
You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).

-----

- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h.
- You can also run the benchmarks on the `songs` dataset alone with `cargo bench --bench songs`; it should take around 1h.
- And on the `wiki` dataset alone with `cargo bench --bench wiki`; it should take around 3h.

By default the datasets are downloaded and uncompressed automatically into the target directory.
If you don't want to re-download the datasets every time you update something in the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
```
mkdir ~/datasets
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
touch build.rs
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
```
benchmarks/benches/songs.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -33,7 +34,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-songs.csv",
+    dataset: datasets_paths::SMOL_SONGS,
     queries: &[
         "john ",  // 9097
         "david ", // 4794
benchmarks/benches/utils.rs
@@ -7,15 +7,6 @@ use milli::{
     FacetCondition, Index,
 };
 
-/// The name of the environment variable used to select the path
-/// of the directory containing the datasets
-const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
-
-/// The default path for the dataset if nothing is specified
-/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be
-/// executed with a pwd of `milli/milli`
-const DEFAULT_DATASETS_PATH: &str = "milli/benches";
-
 pub struct Conf<'a> {
     /// where we are going to create our database.mmdb directory
     /// each benchmark will first try to delete it and then recreate it
@@ -33,6 +24,8 @@ pub struct Conf<'a> {
     pub facet_condition: Option<&'a str>,
     /// enable or disable the optional words on the query
     pub optional_words: bool,
+    /// primary key; if it is None we'll auto-generate docids for every document
+    pub primary_key: Option<&'a str>,
 }
 
 impl Conf<'_> {
@@ -47,6 +40,7 @@ impl Conf<'_> {
         configure: Self::nop,
         facet_condition: None,
         optional_words: true,
+        primary_key: None,
     };
 }
 
@@ -86,13 +80,8 @@ pub fn base_setup(conf: &Conf) -> Index {
     let mut builder = update_builder.index_documents(&mut wtxn, &index);
     builder.update_format(UpdateFormat::Csv);
     builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-    // when called from cargo, the current directory is supposed to be milli/milli
-    let base_dataset_path = std::env::vars()
-        .find(|var| var.0 == BASE_DATASETS_PATH_KEY)
-        .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
-    let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
-    let reader = File::open(&dataset_path)
-        .expect(&format!("could not find the dataset in: {}", &dataset_path));
+    let reader = File::open(conf.dataset)
+        .expect(&format!("could not find the dataset in: {}", conf.dataset));
     builder.execute(reader, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
 
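Taken together, these changes move all path resolution out of the benches: the build script (shown below) downloads each dataset and bakes its absolute path into a generated constant, so `base_setup` can pass `conf.dataset` straight to `File::open` with no env-var lookup at run time. A minimal, self-contained model of that pattern, assuming a hypothetical path and a stripped-down `Conf`:

```rust
use std::fs::File;

// In the real crate this module is generated by build.rs into benches/.
mod datasets_paths {
    pub const SMOL_SONGS: &str = "/tmp/datasets/smol-songs.csv"; // hypothetical path
}

// Stripped-down stand-in for the Conf struct from benches/utils.rs.
struct Conf<'a> {
    dataset: &'a str,
}

// The dataset path is fixed at compile time.
const BASE_CONF: Conf = Conf { dataset: datasets_paths::SMOL_SONGS };

fn main() {
    // The bench opens the dataset directly, exactly like base_setup above.
    let _reader = File::open(BASE_CONF.dataset).expect("could not find the dataset");
}
```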
benchmarks/benches/wiki.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -16,7 +17,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-wiki-articles.csv",
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
     queries: &[
         "mingus ",      // 46 candidates
         "miles davis ", // 159
benchmarks/build.rs (new file, 80 lines)
@@ -0,0 +1,80 @@
use std::path::{Path, PathBuf};
use std::{env, fs};
use std::{
    fs::File,
    io::{Cursor, Read, Seek, Write},
};

use bytes::Bytes;
use convert_case::{Case, Casing};
use flate2::read::GzDecoder;
use reqwest::IntoUrl;

const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";

const DATASET_SONGS: &str = "smol-songs";
const DATASET_WIKI: &str = "smol-wiki-articles";

/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";

fn main() -> anyhow::Result<()> {
    let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));

    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
    writeln!(
        manifest_paths_file,
        r#"//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]
"#
    )?;
    writeln!(manifest_paths_file)?;

    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
        let out_path = out_dir.join(dataset);
        let out_file = out_path.with_extension("csv");

        writeln!(
            &mut manifest_paths_file,
            r#"pub const {}: &str = {:?};"#,
            dataset.to_case(Case::ScreamingSnake),
            out_file.display(),
        )?;

        if out_file.exists() {
            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
            continue;
        }
        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
        eprintln!("downloading: {}", url);
        let bytes = download_dataset(url.clone())?;
        eprintln!("{} downloaded successfully", url);
        eprintln!("uncompressing in {}", out_path.display());
        uncompress_in_file(bytes, &out_file)?;
    }

    Ok(())
}

fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
    let bytes = reqwest::blocking::Client::builder()
        .timeout(None)
        .build()?
        .get(url)
        .send()?
        .bytes()?;
    Ok(Cursor::new(bytes))
}

fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
    let path = path.as_ref();
    let mut gz = GzDecoder::new(bytes);
    let mut dataset = Vec::new();
    gz.read_to_end(&mut dataset)?;

    fs::write(path, dataset)?;
    Ok(())
}
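For illustration, the `benches/datasets_paths.rs` file this build script emits would look roughly like the following; the exact paths are machine-specific, determined by `OUT_DIR` or by `MILLI_BENCH_DATASETS_PATH` when it is set (the paths shown here are hypothetical):

```rust
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

pub const SMOL_SONGS: &str = "/path/to/target/debug/build/benchmarks-<hash>/out/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str = "/path/to/target/debug/build/benchmarks-<hash>/out/smol-wiki-articles.csv";
```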
benchmarks/src/lib.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
//! This library is only used to isolate the benchmarks
//! from the original milli library.
//!
//! It does not include interesting functions for milli library
//! users, only for milli contributors.
milli/Cargo.toml
@@ -53,17 +53,8 @@ tinytemplate = "=1.1.0"
 
 [dev-dependencies]
 big_s = "1.0.2"
-criterion = "0.3.4"
 maplit = "1.0.2"
 rand = "0.8.3"
 
 [features]
 default = []
-
-[[bench]]
-name = "songs"
-harness = false
-
-[[bench]]
-name = "wiki"
-harness = false
milli/benches/README.md (deleted file, 27 lines)
@@ -1,27 +0,0 @@
Benchmarks
==========

For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:
```
xsv sample --seed 42 songs.csv -o smol-songs.csv
```
You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz),
and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).

You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz`.
You can run the following command from the root of this git repository:
```
wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
```

- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h.
- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`; it should take around 1h.
- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h.

By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH`, like this:
```
MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
```

Our benchmarking suite uses criterion, which allows a lot of configuration; see its documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html).