78 lines
2.5 KiB
Rust

use std::fs::File;
use std::io::{Cursor, Read, Seek, Write};
use std::path::{Path, PathBuf};
use std::{env, fs};
use bytes::Bytes;
use convert_case::{Case, Casing};
use flate2::read::GzDecoder;
use reqwest::IntoUrl;
const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";
const DATASET_SONGS: &str = "smol-songs";
const DATASET_WIKI: &str = "smol-wiki-articles";
/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
fn main() -> anyhow::Result<()> {
let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));
let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
write!(
manifest_paths_file,
r#"//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]
"#
)?;
writeln!(manifest_paths_file)?;
for dataset in &[DATASET_SONGS, DATASET_WIKI] {
let out_path = out_dir.join(dataset);
let out_file = out_path.with_extension("csv");
writeln!(
&mut manifest_paths_file,
r#"pub const {}: &str = {:?};"#,
dataset.to_case(Case::ScreamingSnake),
out_file.display(),
)?;
if out_file.exists() {
eprintln!(
"The dataset {} already exists on the file system and will not be downloaded again",
dataset
);
continue;
}
let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
eprintln!("downloading: {}", url);
let bytes = download_dataset(url.clone())?;
eprintln!("{} downloaded successfully", url);
eprintln!("uncompressing in {}", out_path.display());
uncompress_in_file(bytes, &out_file)?;
}
Ok(())
}
fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
let bytes =
reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?;
Ok(Cursor::new(bytes))
}
fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
let path = path.as_ref();
let mut gz = GzDecoder::new(bytes);
let mut dataset = Vec::new();
gz.read_to_end(&mut dataset)?;
fs::write(path, dataset)?;
Ok(())
}