From 06c414a75388bd34f7dc3e768f433e0b5bafac23 Mon Sep 17 00:00:00 2001
From: tamo
Date: Tue, 25 May 2021 17:09:14 +0200
Subject: [PATCH] move the benchmarks to another crate so we can download the
 datasets automatically without adding overhead to the build of milli

---
 Cargo.toml                             |  2 +-
 benchmarks/Cargo.toml                  | 29 ++++++++++
 benchmarks/README.md                   | 30 ++++++++++
 {milli => benchmarks}/benches/songs.rs |  3 +-
 {milli => benchmarks}/benches/utils.rs | 21 ++-----
 {milli => benchmarks}/benches/wiki.rs  |  3 +-
 benchmarks/build.rs                    | 80 ++++++++++++++++++++++++++
 benchmarks/src/lib.rs                  |  5 ++
 milli/Cargo.toml                       |  9 ---
 milli/benches/README.md                | 27 ---------
 10 files changed, 154 insertions(+), 55 deletions(-)
 create mode 100644 benchmarks/Cargo.toml
 create mode 100644 benchmarks/README.md
 rename {milli => benchmarks}/benches/songs.rs (99%)
 rename {milli => benchmarks}/benches/utils.rs (81%)
 rename {milli => benchmarks}/benches/wiki.rs (98%)
 create mode 100644 benchmarks/build.rs
 create mode 100644 benchmarks/src/lib.rs
 delete mode 100644 milli/benches/README.md

diff --git a/Cargo.toml b/Cargo.toml
index a60c293e3..ff0b2582a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["milli", "http-ui", "infos", "helpers", "search"]
+members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
 default-members = ["milli"]
 
 [profile.release]
diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
new file mode 100644
index 000000000..f7b66fe3a
--- /dev/null
+++ b/benchmarks/Cargo.toml
@@ -0,0 +1,29 @@
+[package]
+name = "benchmarks"
+version = "0.1.0"
+edition = "2018"
+publish = false
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+milli = { path = "../milli" }
+
+[dev-dependencies]
+heed = "*" # we want to use the version milli uses
+criterion = "0.3.4"
+
+[build-dependencies]
+anyhow = "1.0"
+bytes = "1.0"
+flate2 = "1.0.20"
+convert_case = "0.4"
+reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
+
+[[bench]]
+name = "songs"
+harness = false
+
+[[bench]]
+name = "wiki"
+harness = false
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..8c91700e9
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,30 @@
+Benchmarks
+==========
+
+For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:
+```
+xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
+```
+You can download the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz),
+and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
+
+We also use a subset of `wikipedia-articles.csv`, generated with the following command:
+```
+xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
+```
+You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).
+
+-----
+
+- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h
+- You can also run the benchmarks on the `songs` dataset only with `cargo bench --bench songs`; it should take around 1h
+- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h
+
+By default the datasets are downloaded and uncompressed automatically into the target directory.
+If you don't want to download the datasets every time you update the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
+```
+mkdir ~/datasets
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+touch build.rs
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
+```
diff --git a/milli/benches/songs.rs b/benchmarks/benches/songs.rs
similarity index 99%
rename from milli/benches/songs.rs
rename to benchmarks/benches/songs.rs
index 430b73a40..dd52a0afc 100644
--- a/milli/benches/songs.rs
+++ b/benchmarks/benches/songs.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -33,7 +34,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-songs.csv",
+    dataset: datasets_paths::SMOL_SONGS,
     queries: &[
         "john ",  // 9097
         "david ", // 4794
diff --git a/milli/benches/utils.rs b/benchmarks/benches/utils.rs
similarity index 81%
rename from milli/benches/utils.rs
rename to benchmarks/benches/utils.rs
index f3f5e9bf6..e0feb9b0e 100644
--- a/milli/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -7,15 +7,6 @@ use milli::{
     FacetCondition, Index,
 };
 
-/// The name of the environment variable used to select the path
-/// of the directory containing the datasets
-const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
-
-/// The default path for the dataset if nothing is specified
-/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be
-/// executed with a pwd of `milli/milli`
-const DEFAULT_DATASETS_PATH: &str = "milli/benches";
-
 pub struct Conf<'a> {
     /// where we are going to create our database.mmdb directory
     /// each benchmark will first try to delete it and then recreate it
@@ -33,6 +24,8 @@ pub struct Conf<'a> {
     pub facet_condition: Option<&'a str>,
     /// enable or disable the optional words on the query
     pub optional_words: bool,
+    /// the primary key; if it is `None`, docids are auto-generated for every document
+    pub primary_key: Option<&'a str>,
 }
 
 impl Conf<'_> {
@@ -47,6 +40,7 @@ impl Conf<'_> {
         configure: Self::nop,
         facet_condition: None,
         optional_words: true,
+        primary_key: None,
     };
 }
 
@@ -86,13 +80,8 @@ pub fn base_setup(conf: &Conf) -> Index {
     let mut builder = update_builder.index_documents(&mut wtxn, &index);
     builder.update_format(UpdateFormat::Csv);
     builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-    // we called from cargo the current directory is supposed to be milli/milli
-    let base_dataset_path = std::env::vars()
-        .find(|var| var.0 == BASE_DATASETS_PATH_KEY)
-        .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
-    let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
-    let reader = File::open(&dataset_path)
-        .expect(&format!("could not find the dataset in: {}", &dataset_path));
+    let reader = File::open(conf.dataset)
+        .expect(&format!("could not find the dataset in: {}", conf.dataset));
     builder.execute(reader, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
 
diff --git a/milli/benches/wiki.rs b/benchmarks/benches/wiki.rs
similarity index 98%
rename from milli/benches/wiki.rs
rename to benchmarks/benches/wiki.rs
index 8c15f11ca..99ecff2ce 100644
--- a/milli/benches/wiki.rs
+++ b/benchmarks/benches/wiki.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -16,7 +17,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-wiki-articles.csv",
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
     queries: &[
         "mingus ",      // 46 candidates
         "miles davis ", // 159
diff --git a/benchmarks/build.rs b/benchmarks/build.rs
new file mode 100644
index 000000000..dc92a1a4c
--- /dev/null
+++ b/benchmarks/build.rs
@@ -0,0 +1,80 @@
+use std::path::{Path, PathBuf};
+use std::{env, fs};
+use std::{
+    fs::File,
+    io::{Cursor, Read, Seek, Write},
+};
+
+use bytes::Bytes;
+use convert_case::{Case, Casing};
+use flate2::read::GzDecoder;
+use reqwest::IntoUrl;
+
+const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
+
+const DATASET_SONGS: &str = "smol-songs";
+const DATASET_WIKI: &str = "smol-wiki-articles";
+
+/// The name of the environment variable used to select the path
+/// of the directory containing the datasets
+const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
+
+fn main() -> anyhow::Result<()> {
+    let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));
+
+    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
+    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
+    writeln!(
+        manifest_paths_file,
+        r#"//! This file is generated by the build script.
+//! Do not modify by hand, use the build.rs file.
+#![allow(dead_code)]
+"#
+    )?;
+    writeln!(manifest_paths_file)?;
+
+    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+        let out_path = out_dir.join(dataset);
+        let out_file = out_path.with_extension("csv");
+
+        writeln!(
+            &mut manifest_paths_file,
+            r#"pub const {}: &str = {:?};"#,
+            dataset.to_case(Case::ScreamingSnake),
+            out_file.display(),
+        )?;
+
+        if out_file.exists() {
+            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
+            continue;
+        }
+        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        eprintln!("downloading: {}", url);
+        let bytes = download_dataset(url.clone())?;
+        eprintln!("{} downloaded successfully", url);
+        eprintln!("uncompressing in {}", out_path.display());
+        uncompress_in_file(bytes, &out_file)?;
+    }
+
+    Ok(())
+}
+
+fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
+    let bytes = reqwest::blocking::Client::builder()
+        .timeout(None)
+        .build()?
+        .get(url)
+        .send()?
+        .bytes()?;
+    Ok(Cursor::new(bytes))
+}
+
+fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
+    let path = path.as_ref();
+    let mut gz = GzDecoder::new(bytes);
+    let mut dataset = Vec::new();
+    gz.read_to_end(&mut dataset)?;
+
+    fs::write(path, dataset)?;
+    Ok(())
+}
diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs
new file mode 100644
index 000000000..4281ec115
--- /dev/null
+++ b/benchmarks/src/lib.rs
@@ -0,0 +1,5 @@
+//! This library is only used to isolate the benchmarks
+//! from the original milli library.
+//!
+//! It does not provide anything useful to milli library users,
+//! only to milli contributors.
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 1c0f74613..2af6a9042 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -53,17 +53,8 @@ tinytemplate = "=1.1.0"
 
 [dev-dependencies]
 big_s = "1.0.2"
-criterion = "0.3.4"
 maplit = "1.0.2"
 rand = "0.8.3"
 
 [features]
 default = []
-
-[[bench]]
-name = "songs"
-harness = false
-
-[[bench]]
-name = "wiki"
-harness = false
diff --git a/milli/benches/README.md b/milli/benches/README.md
deleted file mode 100644
index b2c1aec15..000000000
--- a/milli/benches/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-Benchmarks
-==========
-
-For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command:
-```
-xsv sample --seed 42 song.csv -o smol-songs.csv
-```
-You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)
-And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
-
-You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz`
-You can run the following command from the root of this git repository
-```
-wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
-```
-
-- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h
-- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h
-- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h
-
-By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that:
-```
-MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
-```
-
-Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html)
-
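
Note on the generated module: the build script above writes a `datasets_paths.rs` file into `benchmarks/benches/`, which the `songs` and `wiki` benches pull in with `mod datasets_paths;`. As a rough illustration only (the actual paths are machine-specific, resolved from `OUT_DIR` or `MILLI_BENCH_DATASETS_PATH` at build time, and the `benchmarks-0123abcd` directory name below is made up), the generated file looks like this:

```rust
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

// Hypothetical values: the build script writes whichever directory it
// actually unpacked the datasets into (OUT_DIR by default).
pub const SMOL_SONGS: &str =
    "/home/user/milli/target/release/build/benchmarks-0123abcd/out/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str =
    "/home/user/milli/target/release/build/benchmarks-0123abcd/out/smol-wiki-articles.csv";
```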