move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli

2025-07-03 11:57:07 +02:00 · 2021-05-25 17:09:14 +02:00 · 2021-05-25 17:09:14 +02:00 · 06c414a753
commit 06c414a753
parent 3c84075d2d
10 changed files with 154 additions and 55 deletions
--- a/benchmarks/build.rs
+++ b/benchmarks/build.rs
@ -0,0 +1,80 @@
+use std::path::{Path, PathBuf};
+use std::{env, fs};
+use std::{
+    fs::File,
+    io::{Cursor, Read, Seek, Write},
+};
+
+use bytes::Bytes;
+use convert_case::{Case, Casing};
+use flate2::read::GzDecoder;
+use reqwest::IntoUrl;
+
+/// Root URL under which each dataset is fetched as `<BASE_URL>/<dataset>.csv.gz`.
+const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
+
+/// Name of the songs dataset; used as the remote file stem, the local file
+/// stem, and (in screaming snake case) the name of the generated constant.
+const DATASET_SONGS: &str = "smol-songs";
+/// Name of the wiki articles dataset; used the same way as [`DATASET_SONGS`].
+const DATASET_WIKI: &str = "smol-wiki-articles";
+
+/// The name of the environment variable used to select the path
+/// of the directory containing the datasets
+const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
+
+/// Build-script entry point: generates `benches/datasets_paths.rs` and fetches
+/// every benchmark dataset that is not already present on disk.
+///
+/// The datasets directory is taken from the `MILLI_BENCH_DATASETS_PATH`
+/// environment variable, falling back to cargo's `OUT_DIR` when that variable
+/// cannot be read. For each dataset a `pub const <NAME>: &str` holding the path
+/// of its `.csv` file is appended to the generated file, then the dataset is
+/// downloaded and uncompressed unless the `.csv` file already exists.
+///
+/// # Errors
+///
+/// Fails when a required environment variable (`OUT_DIR` as a fallback,
+/// `CARGO_MANIFEST_DIR`) cannot be read, when the datasets directory or the
+/// generated file cannot be created or written, or when downloading or
+/// uncompressing a dataset fails.
+fn main() -> anyhow::Result<()> {
+    // Only consult `OUT_DIR` when the override is unavailable, so that an
+    // explicit datasets path keeps working even if `OUT_DIR` cannot be read.
+    let out_dir = match env::var(BASE_DATASETS_PATH_KEY) {
+        Ok(custom_dir) => PathBuf::from(custom_dir),
+        Err(_) => PathBuf::from(env::var("OUT_DIR")?),
+    };
+    // A user-supplied directory may not exist yet; create it up front instead
+    // of failing later when the first dataset is written.
+    fs::create_dir_all(&out_dir)?;
+
+    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
+    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
+    writeln!(
+        manifest_paths_file,
+        r#"//! This file is generated by the build script.
+//! Do not modify by hand, use the build.rs file.
+#![allow(dead_code)]
+"#
+    )?;
+    writeln!(manifest_paths_file)?;
+
+    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+        let out_path = out_dir.join(dataset);
+        let out_file = out_path.with_extension("csv");
+
+        // The constant is emitted even when the download is skipped, so the
+        // generated file always lists every dataset.
+        writeln!(
+            &mut manifest_paths_file,
+            r#"pub const {}: &str = {:?};"#,
+            dataset.to_case(Case::ScreamingSnake),
+            out_file.display(),
+        )?;
+
+        if out_file.exists() {
+            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
+            continue;
+        }
+        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        eprintln!("downloading: {}", url);
+        let bytes = download_dataset(url.as_str())?;
+        eprintln!("{} downloaded successfully", url);
+        // Report the file that is actually written, not the extension-less stem.
+        eprintln!("uncompressing in {}", out_file.display());
+        uncompress_in_file(bytes, &out_file)?;
+    }
+
+    Ok(())
+}
+
+/// Downloads the resource at `url` and returns its whole body as an in-memory
+/// cursor.
+///
+/// The request is sent with a blocking client whose timeout is disabled, so a
+/// slow transfer is never aborted by the client.
+///
+/// # Errors
+///
+/// Fails when the client cannot be built, when the request cannot be sent,
+/// when the server answers with a 4xx/5xx status, or when the body cannot be
+/// read.
+fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
+    let bytes = reqwest::blocking::Client::builder()
+        .timeout(None)
+        .build()?
+        .get(url)
+        .send()?
+        // Surface HTTP errors here rather than handing an error page to the
+        // gzip decoder, which would fail with an unrelated message.
+        .error_for_status()?
+        .bytes()?;
+    Ok(Cursor::new(bytes))
+}
+
+/// Decompresses the gzip stream `bytes` into the file at `path`.
+///
+/// The data is streamed into a sibling file named `path` with `.part`
+/// appended, which is renamed onto `path` only after decompression succeeded.
+/// This avoids holding the whole dataset in memory and never leaves a
+/// truncated file at `path`, which a later existence check would mistake for a
+/// complete dataset. The `Seek` bound is not used by this body; it is kept so
+/// the signature stays unchanged for callers.
+///
+/// # Errors
+///
+/// Fails when the temporary file cannot be created, when the stream is not
+/// valid gzip or cannot be read, or when writing or renaming fails. On failure
+/// the temporary file is removed on a best-effort basis.
+fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
+    let path = path.as_ref();
+    let mut gz = GzDecoder::new(bytes);
+
+    // Append to the full file name so the existing extension is preserved.
+    let mut part_name = path.as_os_str().to_owned();
+    part_name.push(".part");
+    let part_path = PathBuf::from(part_name);
+
+    // The temporary file handle is dropped at the end of the first closure,
+    // so it is closed before the rename takes place.
+    let outcome = File::create(&part_path)
+        .and_then(|mut part_file| std::io::copy(&mut gz, &mut part_file))
+        .and_then(|_| fs::rename(&part_path, path));
+
+    if let Err(err) = outcome {
+        // Best-effort cleanup; the original error is the one worth reporting.
+        let _ = fs::remove_file(&part_path);
+        return Err(err.into());
+    }
+    Ok(())
+}