mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 21:50:07 +01:00
Merge #287
287: Add benchmarks for indexing r=Kerollmops a=irevoire closes #274 I don't really know how much time this will take on our bench machine. I'm afraid the wiki dataset will take a really long time to bench (it takes 1h30 on my computer). If you are ok with it, I would like to merge this first PR since it introduces a first set of benchmarks and see how much time it takes in reality on our setup. Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
commit
16698f714b
2
.github/workflows/benchmarks.yml
vendored
2
.github/workflows/benchmarks.yml
vendored
@ -4,7 +4,7 @@ on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dataset_name:
|
||||
description: 'The name of the dataset used to benchmark (songs or wiki)'
|
||||
description: 'The name of the dataset used to benchmark (songs, wiki or indexing)'
|
||||
required: false
|
||||
default: 'songs'
|
||||
|
||||
|
@ -28,3 +28,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "wiki"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "indexing"
|
||||
harness = false
|
||||
|
@ -30,13 +30,13 @@ _[More about critcmp](https://github.com/BurntSushi/critcmp)._
|
||||
|
||||
### On your machine
|
||||
|
||||
To run all the benchmarks (~4h):
|
||||
To run all the benchmarks (~5h):
|
||||
|
||||
```bash
|
||||
cargo bench
|
||||
```
|
||||
|
||||
To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
|
||||
To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:
|
||||
|
||||
```bash
|
||||
cargo bench --bench <dataset name>
|
||||
@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th
|
||||
|
||||
```bash
|
||||
mkdir ~/datasets
|
||||
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
|
||||
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
|
||||
touch build.rs
|
||||
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
|
||||
```
|
||||
@ -84,6 +84,7 @@ Run the comparison script:
|
||||
The benchmarks are available for the following datasets:
|
||||
- `songs`
|
||||
- `wiki`
|
||||
- `movies`
|
||||
|
||||
### Songs
|
||||
|
||||
@ -107,5 +108,9 @@ It was generated with the following command:
|
||||
xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
|
||||
```
|
||||
|
||||
_[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._
|
||||
### Movies
|
||||
|
||||
`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/)
|
||||
|
||||
_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._
|
||||
|
||||
|
314
benchmarks/benches/indexing.rs
Normal file
314
benchmarks/benches/indexing.rs
Normal file
@ -0,0 +1,314 @@
|
||||
mod datasets_paths;
|
||||
|
||||
use std::fs::{create_dir_all, remove_dir_all, File};
|
||||
use std::path::Path;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use heed::EnvOpenOptions;
|
||||
use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
|
||||
use milli::Index;
|
||||
|
||||
// Use jemalloc as the global allocator on Linux so the benchmarks run with
// the same allocator behavior as the production Linux builds.
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
|
||||
/// Guarantees that `path` exists as an empty directory: any previous
/// directory at that location is removed first, then it is recreated.
///
/// Panics on any I/O error other than the directory not existing yet.
fn setup_dir(path: impl AsRef<Path>) {
    if let Err(err) = remove_dir_all(path.as_ref()) {
        // A missing directory is fine (first run); anything else is fatal.
        if err.kind() != std::io::ErrorKind::NotFound {
            panic!("{}", err);
        }
    }
    create_dir_all(path).unwrap();
}
|
||||
|
||||
fn setup_index() -> Index {
|
||||
let path = "benches.mmdb";
|
||||
setup_dir(&path);
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
options.max_readers(10);
|
||||
Index::new(options, path).unwrap()
|
||||
}
|
||||
|
||||
/// Benchmarks indexing the `smol-songs` CSV dataset with the full default
/// bench settings: displayed + searchable fields and both numeric and
/// string filterable (faceted) fields.
fn indexing_songs_default(c: &mut Criterion) {
    let index = setup_index();

    // Configure the index settings once, outside the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // Mix of numeric (`released-timestamp`, `duration-float`) and string facets.
    let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    // Borrow the index so the `move` closures below capture only a reference,
    // keeping `index` usable for the final close at the end of the function.
    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    // Indexing is slow; 10 samples keeps the total bench time reasonable.
    group.sample_size(10);
    group.bench_function("Indexing songs with default settings", |b| {
        b.iter_with_setup(
            // Setup (not measured): delete every document so each iteration
            // indexes into an empty index.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            // Measured: index the whole CSV dataset from scratch.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}
|
||||
|
||||
/// Same benchmark as `indexing_songs_default`, but the numeric facets
/// (`released-timestamp`, `duration-float`) are dropped — only string
/// facets remain — to isolate the cost of numeric facet indexing.
fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
    let index = setup_index();

    // One-time settings configuration, outside the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // String facets only — no numeric fields here.
    let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    // Reference capture so the `move` closures don't consume `index`.
    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs without faceted numbers", |b| {
        b.iter_with_setup(
            // Setup (not measured): clear all documents between iterations.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            // Measured: full CSV indexing run.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });
    index.prepare_for_closing().wait();
}
|
||||
|
||||
/// Same benchmark as `indexing_songs_default`, but with no filterable
/// (faceted) fields at all, to measure the baseline cost of indexing
/// without any facet databases.
fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
    let index = setup_index();

    // One-time settings configuration, outside the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    // Note: deliberately no `set_filterable_fields` call here.
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    // Reference capture so the `move` closures don't consume `index`.
    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs without any facets", |b| {
        b.iter_with_setup(
            // Setup (not measured): clear all documents between iterations.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            // Measured: full CSV indexing run.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });
    index.prepare_for_closing().wait();
}
|
||||
|
||||
fn indexing_wiki(c: &mut Criterion) {
|
||||
let index = setup_index();
|
||||
|
||||
let update_builder = UpdateBuilder::new(0);
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = update_builder.settings(&mut wtxn, &index);
|
||||
|
||||
builder.set_primary_key("id".to_owned());
|
||||
let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_displayed_fields(displayed_fields);
|
||||
|
||||
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
|
||||
// there is NO faceted fields at all
|
||||
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let index_ref = &index;
|
||||
|
||||
let mut group = c.benchmark_group("indexing");
|
||||
group.sample_size(10);
|
||||
group.bench_function("Indexing wiki", |b| {
|
||||
b.iter_with_setup(
|
||||
move || {
|
||||
let update_builder = UpdateBuilder::new(0);
|
||||
let mut wtxn = index_ref.write_txn().unwrap();
|
||||
let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
()
|
||||
},
|
||||
move |_| {
|
||||
let update_builder = UpdateBuilder::new(0);
|
||||
let mut wtxn = index_ref.write_txn().unwrap();
|
||||
let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
|
||||
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
||||
let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!(
|
||||
"could not find the dataset in: {}",
|
||||
datasets_paths::SMOL_SONGS
|
||||
));
|
||||
builder.execute(reader, |_, _| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
},
|
||||
)
|
||||
});
|
||||
index.prepare_for_closing().wait();
|
||||
}
|
||||
|
||||
/// Benchmarks indexing the small `movies` JSON dataset with default
/// bench settings (displayed/searchable fields plus two facets).
fn indexing_movies_default(c: &mut Criterion) {
    let index = setup_index();

    // One-time settings configuration, outside the measured section.
    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "poster", "overview", "release_date", "genres"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // NOTE(review): displayed fields use `release_date` but the filterable
    // field below is spelled `released_date` — likely a typo that silently
    // indexes no facet values; verify against the movies dataset schema.
    let faceted_fields = ["released_date", "genres"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);

    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    // Reference capture so the `move` closures don't consume `index`.
    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing movies with default settings", |b| {
        b.iter_with_setup(
            // Setup (not measured): clear all documents between iterations.
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            // Measured: full JSON indexing run.
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Json);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::MOVIES)
                    .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}
|
||||
|
||||
// Register every indexing benchmark in a single Criterion group and
// generate the harness entry point (the crate is built with `harness = false`).
criterion_group!(
    benches,
    indexing_songs_default,
    indexing_songs_without_faceted_numbers,
    indexing_songs_without_faceted_fields,
    indexing_wiki,
    indexing_movies_default
);
criterion_main!(benches);
|
@ -10,8 +10,9 @@ use reqwest::IntoUrl;
|
||||
|
||||
const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";
|
||||
|
||||
const DATASET_SONGS: &str = "smol-songs";
|
||||
const DATASET_WIKI: &str = "smol-wiki-articles";
|
||||
const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
|
||||
const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
|
||||
const DATASET_MOVIES: (&str, &str) = ("movies", "json");
|
||||
|
||||
/// The name of the environment variable used to select the path
|
||||
/// of the directory containing the datasets
|
||||
@ -31,9 +32,9 @@ fn main() -> anyhow::Result<()> {
|
||||
)?;
|
||||
writeln!(manifest_paths_file)?;
|
||||
|
||||
for dataset in &[DATASET_SONGS, DATASET_WIKI] {
|
||||
for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
|
||||
let out_path = out_dir.join(dataset);
|
||||
let out_file = out_path.with_extension("csv");
|
||||
let out_file = out_path.with_extension(extension);
|
||||
|
||||
writeln!(
|
||||
&mut manifest_paths_file,
|
||||
@ -45,15 +46,15 @@ fn main() -> anyhow::Result<()> {
|
||||
if out_file.exists() {
|
||||
eprintln!(
|
||||
"The dataset {} already exists on the file system and will not be downloaded again",
|
||||
dataset
|
||||
out_path.display(),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
|
||||
let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension);
|
||||
eprintln!("downloading: {}", url);
|
||||
let bytes = download_dataset(url.clone())?;
|
||||
eprintln!("{} downloaded successfully", url);
|
||||
eprintln!("uncompressing in {}", out_path.display());
|
||||
eprintln!("uncompressing in {}", out_file.display());
|
||||
uncompress_in_file(bytes, &out_file)?;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user