287: Add benchmarks for indexing r=Kerollmops a=irevoire

closes #274 
I don't really know how much time this will take on our bench machine. I'm afraid the wiki dataset will take a really long time to bench (it takes 1h30 on my computer).

If you are ok with it, I would like to merge this first PR, since it introduces a first set of benchmarks, and then see how much time it actually takes on our setup.

Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
bors[bot] 2021-07-07 15:41:15 +00:00 committed by GitHub
commit 16698f714b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 336 additions and 12 deletions

View File

@ -4,7 +4,7 @@ on:
workflow_dispatch: workflow_dispatch:
inputs: inputs:
dataset_name: dataset_name:
description: 'The name of the dataset used to benchmark (songs or wiki)' description: 'The name of the dataset used to benchmark (songs, wiki or indexing)'
required: false required: false
default: 'songs' default: 'songs'

View File

@ -28,3 +28,7 @@ harness = false
[[bench]] [[bench]]
name = "wiki" name = "wiki"
harness = false harness = false
[[bench]]
name = "indexing"
harness = false

View File

@ -30,13 +30,13 @@ _[More about critcmp](https://github.com/BurntSushi/critcmp)._
### On your machine ### On your machine
To run all the benchmarks (~4h): To run all the benchmarks (~5h):
```bash ```bash
cargo bench cargo bench
``` ```
To run only the `songs` (~1h) or `wiki` (~3h) benchmark: To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:
```bash ```bash
cargo bench --bench <dataset name> cargo bench --bench <dataset name>
@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th
```bash ```bash
mkdir ~/datasets mkdir ~/datasets
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
touch build.rs touch build.rs
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
``` ```
@ -84,6 +84,7 @@ Run the comparison script:
The benchmarks are available for the following datasets: The benchmarks are available for the following datasets:
- `songs` - `songs`
- `wiki` - `wiki`
- `movies`
### Songs ### Songs
@ -107,5 +108,9 @@ It was generated with the following command:
xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
``` ```
_[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._ ### Movies
`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/)
_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._

View File

@ -0,0 +1,314 @@
mod datasets_paths;
use std::fs::{create_dir_all, remove_dir_all, File};
use std::path::Path;
use criterion::{criterion_group, criterion_main, Criterion};
use heed::EnvOpenOptions;
use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use milli::Index;
// Use jemalloc as the global allocator on Linux so the benchmark numbers
// reflect the allocator the project is normally run with.
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
/// Wipes `path` if it already exists, then (re)creates it as an empty directory.
///
/// A directory that does not exist yet is not an error; any other removal
/// failure aborts the benchmark run.
fn setup_dir(path: impl AsRef<Path>) {
    if let Err(error) = remove_dir_all(path.as_ref()) {
        // Only tolerate "not found"; anything else (permissions, I/O) is fatal.
        if error.kind() != std::io::ErrorKind::NotFound {
            panic!("{}", error);
        }
    }
    create_dir_all(path).unwrap();
}
/// Opens a fresh `Index` backed by an empty `benches.mmdb` directory.
///
/// The directory is wiped on every call so each benchmark starts from a
/// clean database.
fn setup_index() -> Index {
    let db_path = "benches.mmdb";
    setup_dir(&db_path);
    let mut env_options = EnvOpenOptions::new();
    // 100 GiB map size so indexing the large datasets never hits the limit.
    env_options.map_size(100 * 1024 * 1024 * 1024);
    env_options.max_readers(10);
    Index::new(env_options, db_path).unwrap()
}
/// Benchmarks indexing the `smol-songs` CSV dataset with the default
/// benchmark settings (displayed, searchable and filterable fields all set).
fn indexing_songs_default(c: &mut Criterion) {
    let index = setup_index();

    // Configure the index settings once, outside of the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    // Indexing is slow, so keep the Criterion sample count low.
    group.sample_size(10);
    group.bench_function("Indexing songs with default settings", |b| {
        b.iter_with_setup(
            // Setup (not measured): delete every document so each iteration
            // indexes from a clean state.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Measured: index the whole dataset.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // `unwrap_or_else` avoids building the panic message on the
                // happy path (clippy::expect_fun_call).
                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|_| {
                    panic!("could not find the dataset in: {}", datasets_paths::SMOL_SONGS)
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}
/// Benchmarks indexing the `smol-songs` dataset with only string facets
/// (`genre`, `country`, `artist`) — no numeric faceted fields.
fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
    let index = setup_index();

    // Configure the index settings once, outside of the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // Note: no `released-timestamp` / `duration-float` here, on purpose.
    let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    // Indexing is slow, so keep the Criterion sample count low.
    group.sample_size(10);
    group.bench_function("Indexing songs without faceted numbers", |b| {
        b.iter_with_setup(
            // Setup (not measured): delete every document so each iteration
            // indexes from a clean state.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Measured: index the whole dataset.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // `unwrap_or_else` avoids building the panic message on the
                // happy path (clippy::expect_fun_call).
                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|_| {
                    panic!("could not find the dataset in: {}", datasets_paths::SMOL_SONGS)
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}
/// Benchmarks indexing the `smol-songs` dataset with no faceted fields at all.
fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
    let index = setup_index();

    // Configure the index settings once, outside of the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    // No `set_filterable_fields` call: this variant measures indexing
    // without any facets.
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    // Indexing is slow, so keep the Criterion sample count low.
    group.sample_size(10);
    group.bench_function("Indexing songs without any facets", |b| {
        b.iter_with_setup(
            // Setup (not measured): delete every document so each iteration
            // indexes from a clean state.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Measured: index the whole dataset.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // `unwrap_or_else` avoids building the panic message on the
                // happy path (clippy::expect_fun_call).
                let reader = File::open(datasets_paths::SMOL_SONGS).unwrap_or_else(|_| {
                    panic!("could not find the dataset in: {}", datasets_paths::SMOL_SONGS)
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}
/// Benchmarks indexing the `smol-wiki-articles` CSV dataset (no facets).
fn indexing_wiki(c: &mut Criterion) {
    let index = setup_index();

    // Configure the index settings once, outside of the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // there is NO faceted fields at all
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    // Indexing is slow, so keep the Criterion sample count low.
    group.sample_size(10);
    group.bench_function("Indexing wiki", |b| {
        b.iter_with_setup(
            // Setup (not measured): delete every document so each iteration
            // indexes from a clean state.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Measured: index the whole dataset.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // Bug fix: the error message used to reference SMOL_SONGS
                // (copy-paste from the songs benchmark) while opening the
                // wiki dataset; it now reports the correct path. Also uses
                // `unwrap_or_else` to avoid building the panic message on
                // the happy path (clippy::expect_fun_call).
                let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).unwrap_or_else(|_| {
                    panic!("could not find the dataset in: {}", datasets_paths::SMOL_WIKI_ARTICLES)
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}
/// Benchmarks indexing the small `movies` JSON dataset with default settings.
fn indexing_movies_default(c: &mut Criterion) {
    let index = setup_index();

    // Configure the index settings once, outside of the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "poster", "overview", "release_date", "genres"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // Bug fix: the filterable field was typo'd as `released_date`, which does
    // not match the dataset field `release_date` used in the displayed fields
    // above, so the facet was configured on a non-existent field.
    let faceted_fields = ["release_date", "genres"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    // Indexing is slow, so keep the Criterion sample count low.
    group.sample_size(10);
    group.bench_function("Indexing movies with default settings", |b| {
        b.iter_with_setup(
            // Setup (not measured): delete every document so each iteration
            // indexes from a clean state.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
            },
            // Measured: index the whole dataset.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
                builder.update_format(UpdateFormat::Json);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                // `unwrap_or_else` avoids building the panic message on the
                // happy path (clippy::expect_fun_call).
                let reader = File::open(datasets_paths::MOVIES).unwrap_or_else(|_| {
                    panic!("could not find the dataset in: {}", datasets_paths::MOVIES)
                });
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}
// Register every indexing benchmark in a single Criterion group and
// generate the benchmark harness entry point.
criterion_group!(
    benches,
    indexing_songs_default,
    indexing_songs_without_faceted_numbers,
    indexing_songs_without_faceted_fields,
    indexing_wiki,
    indexing_movies_default
);
criterion_main!(benches);

View File

@ -10,8 +10,9 @@ use reqwest::IntoUrl;
const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets"; const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";
const DATASET_SONGS: &str = "smol-songs"; const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
const DATASET_WIKI: &str = "smol-wiki-articles"; const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
const DATASET_MOVIES: (&str, &str) = ("movies", "json");
/// The name of the environment variable used to select the path /// The name of the environment variable used to select the path
/// of the directory containing the datasets /// of the directory containing the datasets
@ -31,9 +32,9 @@ fn main() -> anyhow::Result<()> {
)?; )?;
writeln!(manifest_paths_file)?; writeln!(manifest_paths_file)?;
for dataset in &[DATASET_SONGS, DATASET_WIKI] { for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
let out_path = out_dir.join(dataset); let out_path = out_dir.join(dataset);
let out_file = out_path.with_extension("csv"); let out_file = out_path.with_extension(extension);
writeln!( writeln!(
&mut manifest_paths_file, &mut manifest_paths_file,
@ -45,15 +46,15 @@ fn main() -> anyhow::Result<()> {
if out_file.exists() { if out_file.exists() {
eprintln!( eprintln!(
"The dataset {} already exists on the file system and will not be downloaded again", "The dataset {} already exists on the file system and will not be downloaded again",
dataset out_path.display(),
); );
continue; continue;
} }
let url = format!("{}/{}.csv.gz", BASE_URL, dataset); let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension);
eprintln!("downloading: {}", url); eprintln!("downloading: {}", url);
let bytes = download_dataset(url.clone())?; let bytes = download_dataset(url.clone())?;
eprintln!("{} downloaded successfully", url); eprintln!("{} downloaded successfully", url);
eprintln!("uncompressing in {}", out_path.display()); eprintln!("uncompressing in {}", out_file.display());
uncompress_in_file(bytes, &out_file)?; uncompress_in_file(bytes, &out_file)?;
} }