use std::path::{Path, PathBuf}; use std::{env, fs}; use std::{ fs::File, io::{Cursor, Read, Seek, Write}, }; use bytes::Bytes; use convert_case::{Case, Casing}; use flate2::read::GzDecoder; use reqwest::IntoUrl; const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets"; const DATASET_SONGS: &str = "smol-songs"; const DATASET_WIKI: &str = "smol-wiki-articles"; /// The name of the environment variable used to select the path /// of the directory containing the datasets const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; fn main() -> anyhow::Result<()> { let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?)); let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches"); let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?; writeln!( manifest_paths_file, r#"//! This file is generated by the build script. //! Do not modify by hand, use the build.rs file. #![allow(dead_code)] "# )?; writeln!(manifest_paths_file)?; for dataset in &[DATASET_SONGS, DATASET_WIKI] { let out_path = out_dir.join(dataset); let out_file = out_path.with_extension("csv"); writeln!( &mut manifest_paths_file, r#"pub const {}: &str = {:?};"#, dataset.to_case(Case::ScreamingSnake), out_file.display(), )?; if out_file.exists() { eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset); continue; } let url = format!("{}/{}.csv.gz", BASE_URL, dataset); eprintln!("downloading: {}", url); let bytes = download_dataset(url.clone())?; eprintln!("{} downloaded successfully", url); eprintln!("uncompressing in {}", out_path.display()); uncompress_in_file(bytes, &out_file)?; } Ok(()) } fn download_dataset(url: U) -> anyhow::Result> { let bytes = reqwest::blocking::Client::builder() .timeout(None) .build()? .get(url) .send()? .bytes()?; Ok(Cursor::new(bytes)) } fn uncompress_in_file>(bytes: R, path: P) -> anyhow::Result<()> { let path = path.as_ref(); let mut gz = GzDecoder::new(bytes); let mut dataset = Vec::new(); gz.read_to_end(&mut dataset)?; fs::write(path, dataset)?; Ok(()) }