move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli

tamo 2021-05-25 17:09:14 +02:00 committed by Tamo
parent 3c84075d2d
commit 06c414a753
10 changed files with 154 additions and 55 deletions

benchmarks/Cargo.toml (new file)

@@ -0,0 +1,29 @@
[package]
name = "benchmarks"
version = "0.1.0"
edition = "2018"
publish = false

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
milli = { path = "../milli" }

[dev-dependencies]
heed = "*" # we want to use the version milli uses
criterion = "0.3.4"

[build-dependencies]
anyhow = "1.0"
bytes = "1.0"
flate2 = "1.0.20"
convert_case = "0.4"
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }

[[bench]]
name = "songs"
harness = false

[[bench]]
name = "wiki"
harness = false

benchmarks/README.md (new file)

@@ -0,0 +1,30 @@
Benchmarks
==========

For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:

```
xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
```

You can download the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz), and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).

We also use a subset of `wikipedia-articles.csv` that was generated with the following command:

```
xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
```

You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).

-----

- To run all the benchmarks we recommend using `cargo bench`; this should take around 4h
- You can also run the benchmarks on the `songs` dataset only with `cargo bench --bench songs`; it should take around 1h
- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h

By default the datasets will be downloaded and uncompressed automatically into the target directory.
If you don't want to download the datasets every time you update something in the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:

```
mkdir ~/datasets
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
touch build.rs
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
```

benchmarks/benches/songs.rs (new file)

@@ -0,0 +1,210 @@
mod datasets_paths;
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = [
"id", "title", "album", "artist", "genre", "country", "released", "duration",
]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "album", "artist"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_searchable_fields(searchable_fields);
let faceted_fields = [
("released-timestamp", "number"),
("duration-float", "number"),
("genre", "string"),
("country", "string"),
("artist", "string"),
]
.iter()
.map(|(a, b)| (a.to_string(), b.to_string()))
.collect();
builder.set_faceted_fields(faceted_fields);
}
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_SONGS,
queries: &[
"john ", // 9097
"david ", // 4794
"charles ", // 1957
"david bowie ", // 1200
"michael jackson ", // 600
"thelonious monk ", // 303
"charles mingus ", // 142
"marcus miller ", // 60
"tamo ", // 13
"Notstandskomitee ", // 4
],
configure: base_conf,
..Conf::BASE
};
fn bench_songs(c: &mut criterion::Criterion) {
let default_criterion: Vec<String> = milli::default_criteria()
.iter()
.map(|criteria| criteria.to_string())
.collect();
let default_criterion = default_criterion.iter().map(|s| s.as_str());
let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"black saint sinner lady ",
"les dangeureuses 1960 ",
"The Disneyland Sing-Along Chorus ",
"Under Great Northern Lights ",
"7000 Danses Un Jour Dans Notre Vie ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"mongus ",
"thelonius monk ",
"Disnaylande ",
"the white striper ",
"indochie ",
"indochien ",
"klub des loopers ",
"fear of the duck ",
"michel depech ",
"stromal ",
"dire straights ",
"Arethla Franklin ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop
"les liaisons dangeureuses 1793 ", // one word to pop
"The Disneyland Children's Sing-Alone song ", // two words to pop
"seven nation mummy ", // one word to pop
"7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop
"Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
"whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
],
criterion: Some(&["words"]),
..BASE_CONF
},
utils::Conf {
group_name: "asc",
criterion: Some(&["asc(released-timestamp)"]),
..BASE_CONF
},
utils::Conf {
group_name: "desc",
criterion: Some(&["desc(released-timestamp)"]),
..BASE_CONF
},
/* then we bench the asc and desc criteria on top of the default criteria */
utils::Conf {
group_name: "asc + default",
criterion: Some(&asc_default[..]),
..BASE_CONF
},
utils::Conf {
group_name: "desc + default",
criterion: Some(&desc_default[..]),
..BASE_CONF
},
/* we bench the filters with the default request */
utils::Conf {
group_name: "basic filter: <=",
facet_condition: Some("released-timestamp <= 946728000"), // year 2000
..BASE_CONF
},
utils::Conf {
group_name: "basic filter: TO",
facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
..BASE_CONF
},
utils::Conf {
group_name: "big filter",
facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criteria in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"s", // 500k+ results
"a", //
"b", //
"i", //
"x", // only 7k results
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_songs);
criterion_main!(benches);

benchmarks/benches/utils.rs (new file)

@@ -0,0 +1,114 @@
use std::fs::{create_dir_all, remove_dir_all, File};
use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::{
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
FacetCondition, Index,
};
pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory
/// each benchmark will first try to delete it and then recreate it
pub database_name: &'a str,
/// the dataset to be used; it must be an uncompressed CSV
pub dataset: &'a str,
pub group_name: &'a str,
pub queries: &'a [&'a str],
/// here you can change which criteria are used and in which order.
/// - if you specify something, all the base configuration will be thrown out
/// - if you don't specify anything (None), the default configuration will be kept
pub criterion: Option<&'a [&'a str]>,
/// the last chance to configure your database as you want
pub configure: fn(&mut Settings),
pub facet_condition: Option<&'a str>,
/// enable or disable the optional words on the query
pub optional_words: bool,
/// primary key; if it is None, we'll auto-generate docids for every document
pub primary_key: Option<&'a str>,
}
impl Conf<'_> {
fn nop(_builder: &mut Settings) {}
pub const BASE: Self = Conf {
database_name: "benches.mmdb",
dataset: "",
group_name: "",
queries: &[],
criterion: None,
configure: Self::nop,
facet_condition: None,
optional_words: true,
primary_key: None,
};
}
pub fn base_setup(conf: &Conf) -> Index {
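// wipe the database of a previous run first; a directory that does not exist
// yet is not an error, anything else is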
match remove_dir_all(&conf.database_name) {
Ok(_) => (),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
Err(e) => panic!("{}", e),
}
create_dir_all(&conf.database_name).unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
options.max_readers(10);
let index = Index::new(options, conf.database_name).unwrap();
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.settings(&mut wtxn, &index);
if let Some(criterion) = conf.criterion {
builder.reset_faceted_fields();
builder.reset_criteria();
builder.reset_stop_words();
let criterion = criterion.iter().map(|s| s.to_string()).collect();
builder.set_criteria(criterion);
}
(conf.configure)(&mut builder);
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(conf.dataset)
.expect(&format!("could not find the dataset in: {}", conf.dataset));
builder.execute(reader, |_, _| ()).unwrap();
wtxn.commit().unwrap();
index
}
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
for conf in confs {
let index = base_setup(conf);
let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name));
for &query in conf.queries {
group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
b.iter(|| {
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
search.query(query).optional_words(conf.optional_words);
if let Some(facet_condition) = conf.facet_condition {
let facet_condition =
FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap();
search.facet_condition(facet_condition);
}
let _ids = search.execute().unwrap();
});
});
}
group.finish();
}
}
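
As a sketch of how the pieces fit together: a new bench file only needs to fill in the `Conf` fields it cares about, fall back to `Conf::BASE` for the rest, and hand everything to `run_benches`. This minimal example is illustrative (the `bench_minimal` name is hypothetical; the dataset constant comes from the generated `datasets_paths.rs` described with the build script below):

```
mod datasets_paths;
mod utils;

use criterion::{criterion_group, criterion_main};
use utils::Conf;

fn bench_minimal(c: &mut criterion::Criterion) {
    // every field not listed here keeps its default value from Conf::BASE
    let confs = &[Conf {
        group_name: "minimal",
        dataset: datasets_paths::SMOL_SONGS,
        queries: &["david bowie "],
        ..Conf::BASE
    }];
    utils::run_benches(c, confs);
}

criterion_group!(benches, bench_minimal);
criterion_main!(benches);
```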

benchmarks/benches/wiki.rs (new file)

@@ -0,0 +1,133 @@
mod datasets_paths;
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = ["title", "body", "url"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
builder.set_searchable_fields(searchable_fields);
}
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_WIKI_ARTICLES,
queries: &[
"mingus ", // 46 candidates
"miles davis ", // 159
"rock and roll ", // 1007
"machine ", // 3448
"spain ", // 7002
"japan ", // 10.593
"france ", // 17.616
"film ", // 24.959
],
configure: base_conf,
..Conf::BASE
};
fn bench_wiki(c: &mut criterion::Criterion) {
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"herald sings ",
"april paris ",
"tea two ",
"diesel engine ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"migrosoft ",
"linax ",
"Disnaylande ",
"phytogropher ",
"nympalidea ",
"aritmetric ",
"the fronce ",
"sisan ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
"Kameya Tokujirō mingus monk ", // two words to pop, 55
"Ulrich Hensel meilisearch milli ", // two words to pop, 306
"Idaho Bellevue pizza ", // one word to pop, 800
"Abraham machin ", // one word to pop, 1141
],
criterion: Some(&["words"]),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criteria in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"t", // 453k results
"c", // 405k
"g", // 318k
"j", // 227k
"q", // 71k
"x", // 17k
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_wiki);
criterion_main!(benches);

benchmarks/build.rs (new file)

@@ -0,0 +1,80 @@
use std::path::{Path, PathBuf};
use std::{env, fs};
use std::{
fs::File,
io::{Cursor, Read, Seek, Write},
};
use bytes::Bytes;
use convert_case::{Case, Casing};
use flate2::read::GzDecoder;
use reqwest::IntoUrl;
const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
const DATASET_SONGS: &str = "smol-songs";
const DATASET_WIKI: &str = "smol-wiki-articles";
/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
fn main() -> anyhow::Result<()> {
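// write the datasets into the directory given through the MILLI_BENCH_DATASETS_PATH
// environment variable, or fall back to Cargo's OUT_DIR in the target directory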
let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));
let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
writeln!(
manifest_paths_file,
r#"//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]
"#
)?;
writeln!(manifest_paths_file)?;
for dataset in &[DATASET_SONGS, DATASET_WIKI] {
let out_path = out_dir.join(dataset);
let out_file = out_path.with_extension("csv");
writeln!(
&mut manifest_paths_file,
r#"pub const {}: &str = {:?};"#,
dataset.to_case(Case::ScreamingSnake),
out_file.display(),
)?;
if out_file.exists() {
eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
continue;
}
let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
eprintln!("downloading: {}", url);
let bytes = download_dataset(url.clone())?;
eprintln!("{} downloaded successfully", url);
eprintln!("uncompressing in {}", out_path.display());
uncompress_in_file(bytes, &out_file)?;
}
Ok(())
}
fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
let bytes = reqwest::blocking::Client::builder()
.timeout(None)
.build()?
.get(url)
.send()?
.bytes()?;
Ok(Cursor::new(bytes))
}
fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
let path = path.as_ref();
let mut gz = GzDecoder::new(bytes);
let mut dataset = Vec::new();
gz.read_to_end(&mut dataset)?;
fs::write(path, dataset)?;
Ok(())
}
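
For reference, the `benches/datasets_paths.rs` file written by this build script looks roughly like the sketch below; the constant names come from the dataset names converted to screaming snake case, while the exact paths depend on `OUT_DIR` or `MILLI_BENCH_DATASETS_PATH` (the ones shown here are illustrative):

```
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

pub const SMOL_SONGS: &str = "/path/to/datasets/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str = "/path/to/datasets/smol-wiki-articles.csv";
```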

benchmarks/src/lib.rs (new file)

@@ -0,0 +1,5 @@
//! This library is only used to isolate the benchmarks
//! from the original milli library.
//!
//! It does not include functions that are interesting for milli library
//! users, only for milli contributors.