Merge #287

287: Add benchmarks for indexing r=Kerollmops a=irevoire

Closes #274.

I don't really know how much time this will take on our bench machine. I'm afraid the wiki dataset will take a really long time to bench (it takes 1h30 on my computer). If you are ok with it, I would like to merge this first PR, since it introduces a first set of benchmarks, and then see how much time it actually takes on our setup.

Co-authored-by: Tamo <tamo@meilisearch.com>

This commit is contained in: commit 16698f714b
.github/workflows/benchmarks.yml

@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
     inputs:
       dataset_name:
-        description: 'The name of the dataset used to benchmark (songs or wiki)'
+        description: 'The name of the dataset used to benchmark (songs, wiki or indexing)'
         required: false
         default: 'songs'
 
benchmarks/Cargo.toml

@@ -28,3 +28,7 @@ harness = false
 [[bench]]
 name = "wiki"
 harness = false
+
+[[bench]]
+name = "indexing"
+harness = false
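The new `[[bench]]` entry mirrors the existing `songs` and `wiki` ones: `harness = false` tells Cargo not to link libtest's built-in bench harness, so the bench target has to supply its own `main`, which Criterion generates through `criterion_main!`. As a minimal, self-contained sketch (not code from this PR), such a bench target looks roughly like this:

// Minimal sketch of a Criterion bench target declared with `harness = false`:
// the crate provides its own `main` via `criterion_main!` instead of relying
// on libtest's default bench harness. Names here are illustrative.
use criterion::{black_box, criterion_group, criterion_main, Criterion};

fn trivial_benchmark(c: &mut Criterion) {
    c.bench_function("push 1000 integers", |b| {
        b.iter(|| {
            let mut v = Vec::new();
            for i in 0..1000 {
                v.push(black_box(i));
            }
            v
        })
    });
}

criterion_group!(benches, trivial_benchmark);
criterion_main!(benches); // expands to the `main` required by `harness = false`

With a skeleton like this in place, `cargo bench --bench indexing` builds and runs only that target.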
benchmarks/README.md

@@ -30,13 +30,13 @@ _[More about critcmp](https://github.com/BurntSushi/critcmp)._
 
 ### On your machine
 
-To run all the benchmarks (~4h):
+To run all the benchmarks (~5h):
 
 ```bash
 cargo bench
 ```
 
-To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
+To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:
 
 ```bash
 cargo bench --bench <dataset name>

@@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th
 
 ```bash
 mkdir ~/datasets
-MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
 touch build.rs
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
 ```

@@ -84,6 +84,7 @@ Run the comparison script:
 The benchmarks are available for the following datasets:
 - `songs`
 - `wiki`
+- `movies`
 
 ### Songs
 

@@ -107,5 +108,9 @@ It was generated with the following command:
 xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
 ```
 
 _[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._
+
+### Movies
+
+`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/)
+
+_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._
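The `MILLI_BENCH_DATASETS_PATH` hunk above works because the crate's build script, not the benches themselves, downloads the datasets and skips anything already present in that directory (the actual logic is the `build.rs` diff at the end of this page). A hedged sketch of that caching idea, with illustrative names apart from the environment variable:

// Hedged sketch of the dataset-caching idea behind MILLI_BENCH_DATASETS_PATH;
// the real implementation lives in the crate's build.rs shown further down.
use std::env;
use std::path::PathBuf;

fn datasets_dir() -> PathBuf {
    // Re-run the build script when the variable changes, so switching
    // directories re-resolves the dataset paths.
    println!("cargo:rerun-if-env-changed=MILLI_BENCH_DATASETS_PATH");
    match env::var("MILLI_BENCH_DATASETS_PATH") {
        // A user-provided directory is reused across runs: datasets already
        // present there are not downloaded again.
        Ok(path) => PathBuf::from(path),
        // Otherwise fall back to Cargo's per-build output directory.
        Err(_) => PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set by Cargo")),
    }
}

fn main() {
    let dir = datasets_dir();
    let dataset = dir.join("smol-songs.csv"); // illustrative file name
    if dataset.exists() {
        eprintln!("{} already exists, skipping download", dataset.display());
    } else {
        // download and uncompress the dataset here (omitted in this sketch)
    }
}

Under this scheme, `touch build.rs` only forces recompilation; files already sitting in the datasets directory short-circuit the download, which is what the README comments describe.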
benchmarks/benches/indexing.rs (new file, 314 lines)

@@ -0,0 +1,314 @@
mod datasets_paths;

use std::fs::{create_dir_all, remove_dir_all, File};
use std::path::Path;

use criterion::{criterion_group, criterion_main, Criterion};
use heed::EnvOpenOptions;
use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use milli::Index;

#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

fn setup_dir(path: impl AsRef<Path>) {
    match remove_dir_all(path.as_ref()) {
        Ok(_) => (),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
        Err(e) => panic!("{}", e),
    }
    create_dir_all(path).unwrap();
}

fn setup_index() -> Index {
    let path = "benches.mmdb";
    setup_dir(&path);
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    options.max_readers(10);
    Index::new(options, path).unwrap()
}

fn indexing_songs_default(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs with default settings", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs without faceted numbers", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });
    index.prepare_for_closing().wait();
}

fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs without any facets", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });
    index.prepare_for_closing().wait();
}

fn indexing_wiki(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // there are NO faceted fields at all

    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing wiki", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_WIKI_ARTICLES
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });
    index.prepare_for_closing().wait();
}

fn indexing_movies_default(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "poster", "overview", "release_date", "genres"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["released_date", "genres"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);

    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing movies with default settings", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Json);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::MOVIES)
                    .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

criterion_group!(
    benches,
    indexing_songs_default,
    indexing_songs_without_faceted_numbers,
    indexing_songs_without_faceted_fields,
    indexing_wiki,
    indexing_movies_default
);
criterion_main!(benches);
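Each bench function above follows the same Criterion pattern: the first closure handed to `iter_with_setup` wipes the index and is not timed, the second closure re-indexes the whole dataset and is timed, and `sample_size(10)` keeps the number of expensive iterations low. A self-contained sketch of that pattern with a toy workload instead of milli (everything here is illustrative, not PR code):

// Illustrative sketch of the Criterion pattern used in indexing.rs:
// `iter_with_setup` runs the first closure before every timed iteration and
// only measures the second one, so the setup work never pollutes the result.
use criterion::{criterion_group, criterion_main, Criterion};

fn bench_sort(c: &mut Criterion) {
    let mut group = c.benchmark_group("sorting");
    // Expensive routines warrant a small sample size, as in the indexing benches.
    group.sample_size(10);
    group.bench_function("sort 100k reversed integers", |b| {
        b.iter_with_setup(
            // setup: build the input; this closure is NOT measured
            || (0..100_000u32).rev().collect::<Vec<_>>(),
            // routine: only this closure is measured
            |mut v| {
                v.sort_unstable();
                v
            },
        )
    });
    group.finish();
}

criterion_group!(benches, bench_sort);
criterion_main!(benches);

Whatever the setup closure returns is passed by value into the measured closure, so preparation cost, like deleting all documents above, never leaks into the reported indexing times.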
benchmarks/build.rs

@@ -10,8 +10,9 @@ use reqwest::IntoUrl;
 
 const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";
 
-const DATASET_SONGS: &str = "smol-songs";
-const DATASET_WIKI: &str = "smol-wiki-articles";
+const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
+const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
+const DATASET_MOVIES: (&str, &str) = ("movies", "json");
 
 /// The name of the environment variable used to select the path
 /// of the directory containing the datasets

@@ -31,9 +32,9 @@ fn main() -> anyhow::Result<()> {
     )?;
     writeln!(manifest_paths_file)?;
 
-    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+    for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
         let out_path = out_dir.join(dataset);
-        let out_file = out_path.with_extension("csv");
+        let out_file = out_path.with_extension(extension);
 
         writeln!(
             &mut manifest_paths_file,

@@ -45,15 +46,15 @@ fn main() -> anyhow::Result<()> {
         if out_file.exists() {
             eprintln!(
                 "The dataset {} already exists on the file system and will not be downloaded again",
-                dataset
+                out_path.display(),
             );
             continue;
         }
-        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension);
         eprintln!("downloading: {}", url);
         let bytes = download_dataset(url.clone())?;
         eprintln!("{} downloaded successfully", url);
-        eprintln!("uncompressing in {}", out_path.display());
+        eprintln!("uncompressing in {}", out_file.display());
         uncompress_in_file(bytes, &out_file)?;
     }
 