uses an env variable to find the datasets

2025-07-04 20:37:15 +02:00 · 2021-04-27 15:41:16 +02:00 · 2021-04-27 15:41:16 +02:00 · 3c84075d2d
commit 3c84075d2d
parent 4969abeaab
2 changed files with 27 additions and 2 deletions
--- a/milli/benches/README.md
+++ b/milli/benches/README.md
@ -13,3 +13,15 @@ You can run the following command from the root of this git repository
 ```
 wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
 ```
+
+- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h
+- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h
+- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h
+
+By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that:
+```
+MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
+```
+
+Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html)
+
--- a/milli/benches/utils.rs
+++ b/milli/benches/utils.rs
@ -7,6 +7,15 @@ use milli::{
    FacetCondition, Index,
 };

+/// The name of the environment variable used to select the path
+/// of the directory containing the datasets
+const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
+
+/// The default path for the dataset if nothing is specified
+/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be
+/// executed with a pwd of `milli/milli`
+const DEFAULT_DATASETS_PATH: &str = "milli/benches";
+
 pub struct Conf<'a> {
    /// where we are going to create our database.mmdb directory
    /// each benchmark will first try to delete it and then recreate it
@ -78,7 +87,10 @@ pub fn base_setup(conf: &Conf) -> Index {
    builder.update_format(UpdateFormat::Csv);
    builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
    // we called from cargo the current directory is supposed to be milli/milli
-    let dataset_path = format!("benches/{}", conf.dataset);
+    let base_dataset_path = std::env::vars()
+        .find(|var| var.0 == BASE_DATASETS_PATH_KEY)
+        .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
+    let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
    let reader = File::open(&dataset_path)
        .expect(&format!("could not find the dataset in: {}", &dataset_path));
    builder.execute(reader, |_, _| ()).unwrap();
@ -100,7 +112,8 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
                    let mut search = index.search(&rtxn);
                    search.query(query).optional_words(conf.optional_words);
                    if let Some(facet_condition) = conf.facet_condition {
-                        let facet_condition = FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap();
+                        let facet_condition =
+                            FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap();
                        search.facet_condition(facet_condition);
                    }
                    let _ids = search.execute().unwrap();