move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli

parent 3c84075d2d · commit 06c414a753
Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["milli", "http-ui", "infos", "helpers", "search"]
+members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
 default-members = ["milli"]
 
 [profile.release]

benchmarks/Cargo.toml (new file, 29 lines)
@@ -0,0 +1,29 @@
+[package]
+name = "benchmarks"
+version = "0.1.0"
+edition = "2018"
+publish = false
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+milli = { path = "../milli" }
+
+[dev-dependencies]
+heed = "*" # we want to use the version milli uses
+criterion = "0.3.4"
+
+[build-dependencies]
+anyhow = "1.0"
+bytes = "1.0"
+flate2 = "1.0.20"
+convert_case = "0.4"
+reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
+
+[[bench]]
+name = "songs"
+harness = false
+
+[[bench]]
+name = "wiki"
+harness = false
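A note on `harness = false`: it disables libtest's default benchmark harness for these targets so that criterion can provide its own `main`. A minimal sketch of such a criterion bench (illustrative only; the actual `songs` and `wiki` benches appear further down in this diff):

```rust
use criterion::{black_box, criterion_group, criterion_main, Criterion};

// A placeholder benchmark; the real benches in this commit index a dataset
// and time search queries instead.
fn bench_example(c: &mut Criterion) {
    c.bench_function("example", |b| b.iter(|| black_box(2) + black_box(2)));
}

criterion_group!(benches, bench_example);
criterion_main!(benches); // expands to `main`, which is why `harness = false` is needed
```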

benchmarks/README.md (new file, 30 lines)
@@ -0,0 +1,30 @@
+Benchmarks
+==========
+
+For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:
+```
+xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
+```
+You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz),
+and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
+
+We also use a subset of `wikipedia-articles.csv` that was generated with the following command:
+```
+xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
+```
+You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).
+
+-----
+
+- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h
+- You can also run the benchmarks on the `songs` dataset only with `cargo bench --bench songs`; it should take around 1h
+- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h
+
+By default the datasets are downloaded and uncompressed automatically into the target directory.
+If you don't want to download the datasets every time you update the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
+```
+mkdir ~/datasets
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+touch build.rs
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
+```

benchmarks/benches/songs.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -33,7 +34,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-songs.csv",
+    dataset: datasets_paths::SMOL_SONGS,
     queries: &[
         "john ",  // 9097
         "david ", // 4794

benchmarks/benches/utils.rs
@@ -7,15 +7,6 @@ use milli::{
     FacetCondition, Index,
 };
 
-/// The name of the environment variable used to select the path
-/// of the directory containing the datasets
-const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
-
-/// The default path for the dataset if nothing is specified
-/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be
-/// executed with a pwd of `milli/milli`
-const DEFAULT_DATASETS_PATH: &str = "milli/benches";
-
 pub struct Conf<'a> {
     /// where we are going to create our database.mmdb directory
     /// each benchmark will first try to delete it and then recreate it
@@ -33,6 +24,8 @@ pub struct Conf<'a> {
     pub facet_condition: Option<&'a str>,
     /// enable or disable the optional words on the query
     pub optional_words: bool,
+    /// the primary key; if None, docids are auto-generated for every document
+    pub primary_key: Option<&'a str>,
 }
 
 impl Conf<'_> {
@@ -47,6 +40,7 @@ impl Conf<'_> {
         configure: Self::nop,
         facet_condition: None,
         optional_words: true,
+        primary_key: None,
     };
 }
 
@@ -86,13 +80,8 @@ pub fn base_setup(conf: &Conf) -> Index {
     let mut builder = update_builder.index_documents(&mut wtxn, &index);
     builder.update_format(UpdateFormat::Csv);
    builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-    // we called from cargo the current directory is supposed to be milli/milli
-    let base_dataset_path = std::env::vars()
-        .find(|var| var.0 == BASE_DATASETS_PATH_KEY)
-        .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
-    let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
-    let reader = File::open(&dataset_path)
-        .expect(&format!("could not find the dataset in: {}", &dataset_path));
+    let reader = File::open(conf.dataset)
+        .expect(&format!("could not find the dataset in: {}", conf.dataset));
     builder.execute(reader, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
 
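Net effect: `base_setup` no longer resolves dataset paths itself, so `conf.dataset` must now carry a full path. The benches obtain one from the constants that the build script generates. A condensed sketch of the assumed wiring (the `Conf::BASE` associated const is inferred from the `impl Conf<'_>` hunk above):

```rust
// Condensed sketch of benchmarks/benches/songs.rs after this change
mod datasets_paths; // generated into benches/ by build.rs (see below)
mod utils;

use crate::utils::Conf;

const BASE_CONF: Conf = Conf {
    // an absolute path such as "<out dir>/smol-songs.csv", fixed at build time
    dataset: datasets_paths::SMOL_SONGS,
    ..Conf::BASE // assumed associated const holding the defaults shown above
};
```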

benchmarks/benches/wiki.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -16,7 +17,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-wiki-articles.csv",
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
     queries: &[
         "mingus ",      // 46 candidates
         "miles davis ", // 159

benchmarks/build.rs (new file, 80 lines)
@@ -0,0 +1,80 @@
+use std::path::{Path, PathBuf};
+use std::{env, fs};
+use std::{
+    fs::File,
+    io::{Cursor, Read, Seek, Write},
+};
+
+use bytes::Bytes;
+use convert_case::{Case, Casing};
+use flate2::read::GzDecoder;
+use reqwest::IntoUrl;
+
+const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
+
+const DATASET_SONGS: &str = "smol-songs";
+const DATASET_WIKI: &str = "smol-wiki-articles";
+
+/// The name of the environment variable used to select the path
+/// of the directory containing the datasets
+const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
+
+fn main() -> anyhow::Result<()> {
+    let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));
+
+    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
+    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
+    writeln!(
+        manifest_paths_file,
+        r#"//! This file is generated by the build script.
+//! Do not modify by hand, use the build.rs file.
+#![allow(dead_code)]
+"#
+    )?;
+    writeln!(manifest_paths_file)?;
+
+    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+        let out_path = out_dir.join(dataset);
+        let out_file = out_path.with_extension("csv");
+
+        writeln!(
+            &mut manifest_paths_file,
+            r#"pub const {}: &str = {:?};"#,
+            dataset.to_case(Case::ScreamingSnake),
+            out_file.display(),
+        )?;
+
+        if out_file.exists() {
+            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
+            continue;
+        }
+        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        eprintln!("downloading: {}", url);
+        let bytes = download_dataset(url.clone())?;
+        eprintln!("{} downloaded successfully", url);
+        eprintln!("uncompressing in {}", out_path.display());
+        uncompress_in_file(bytes, &out_file)?;
+    }
+
+    Ok(())
+}
+
+fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
+    let bytes = reqwest::blocking::Client::builder()
+        .timeout(None)
+        .build()?
+        .get(url)
+        .send()?
+        .bytes()?;
+    Ok(Cursor::new(bytes))
+}
+
+fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
+    let path = path.as_ref();
+    let mut gz = GzDecoder::new(bytes);
+    let mut dataset = Vec::new();
+    gz.read_to_end(&mut dataset)?;
+
+    fs::write(path, dataset)?;
+    Ok(())
+}
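For reference, the `datasets_paths.rs` file that this build script writes into `benches/` would look roughly like the following; the exact paths depend on `OUT_DIR` or `MILLI_BENCH_DATASETS_PATH`, so the values shown here are purely illustrative:

```rust
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

// Illustrative values; the real paths point into Cargo's OUT_DIR
// or into the directory named by MILLI_BENCH_DATASETS_PATH.
pub const SMOL_SONGS: &str = "/home/user/milli/target/debug/build/benchmarks-1234/out/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str = "/home/user/milli/target/debug/build/benchmarks-1234/out/smol-wiki-articles.csv";
```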

benchmarks/src/lib.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
+//! This library is only used to isolate the benchmarks
+//! from the original milli library.
+//!
+//! It does not include functions interesting for milli library
+//! users, only for milli contributors.

milli/Cargo.toml
@@ -53,17 +53,8 @@ tinytemplate = "=1.1.0"
 
 [dev-dependencies]
 big_s = "1.0.2"
-criterion = "0.3.4"
 maplit = "1.0.2"
 rand = "0.8.3"
 
 [features]
 default = []
-
-[[bench]]
-name = "songs"
-harness = false
-
-[[bench]]
-name = "wiki"
-harness = false

milli/benches/README.md (deleted, 27 lines)
@@ -1,27 +0,0 @@
-Benchmarks
-==========
-
-For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command:
-```
-xsv sample --seed 42 song.csv -o smol-songs.csv
-```
-You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)
-And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
-
-You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz`
-You can run the following command from the root of this git repository
-```
-wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
-```
-
-- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h
-- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h
-- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h
-
-By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that:
-```
-MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
-```
-
-Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html)
-