Merge #287

287: Add benchmarks for indexing r=Kerollmops a=irevoire

Closes #274.

I don't really know how much time this will take on our bench machine. I'm afraid the wiki dataset will take a really long time to bench (it takes 1h30 on my computer). If you are ok with it, I would like to merge this first PR, since it introduces a first set of benchmarks, and then see how much time it actually takes on our setup.

Co-authored-by: Tamo <tamo@meilisearch.com>

This commit is contained in: commit 16698f714b
.github/workflows/benchmarks.yml

@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
     inputs:
       dataset_name:
-        description: 'The name of the dataset used to benchmark (songs or wiki)'
+        description: 'The name of the dataset used to benchmark (songs, wiki or indexing)'
         required: false
         default: 'songs'
 
benchmarks/Cargo.toml

@@ -28,3 +28,7 @@ harness = false
 [[bench]]
 name = "wiki"
 harness = false
+
+[[bench]]
+name = "indexing"
+harness = false
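The new `[[bench]]` entry mirrors the existing `songs` and `wiki` ones: `harness = false` tells Cargo not to link libtest's built-in bench harness, so the bench target has to supply its own `main`, which Criterion generates through `criterion_main!`. As a minimal, self-contained sketch (not code from this PR), such a bench target looks roughly like this:

// Minimal sketch of a Criterion bench target declared with `harness = false`:
// the crate provides its own `main` via `criterion_main!` instead of relying
// on libtest's default bench harness. Names here are illustrative.
use criterion::{black_box, criterion_group, criterion_main, Criterion};

fn trivial_benchmark(c: &mut Criterion) {
    c.bench_function("push 1000 integers", |b| {
        b.iter(|| {
            let mut v = Vec::new();
            for i in 0..1000 {
                v.push(black_box(i));
            }
            v
        })
    });
}

criterion_group!(benches, trivial_benchmark);
criterion_main!(benches); // expands to the `main` required by `harness = false`

With a skeleton like this in place, `cargo bench --bench indexing` builds and runs only that target.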
benchmarks/README.md

@@ -30,13 +30,13 @@ _[More about critcmp](https://github.com/BurntSushi/critcmp)._
 
 ### On your machine
 
-To run all the benchmarks (~4h):
+To run all the benchmarks (~5h):
 
 ```bash
 cargo bench
 ```
 
-To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
+To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:
 
 ```bash
 cargo bench --bench <dataset name>

@@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th
 
 ```bash
 mkdir ~/datasets
-MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
 touch build.rs
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
 ```

@@ -84,6 +84,7 @@ Run the comparison script:
 The benchmarks are available for the following datasets:
 - `songs`
 - `wiki`
+- `movies`
 
 ### Songs
 

@@ -107,5 +108,9 @@ It was generated with the following command:
 xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
 ```
 
 _[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._
+
+### Movies
+
+`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/)
+
+_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._
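The `MILLI_BENCH_DATASETS_PATH` hunk above works because the crate's build script, not the benches themselves, downloads the datasets and skips anything already present in that directory (the actual logic is the `build.rs` diff at the end of this page). A hedged sketch of that caching idea, with illustrative names apart from the environment variable:

// Hedged sketch of the dataset-caching idea behind MILLI_BENCH_DATASETS_PATH;
// the real implementation lives in the crate's build.rs shown further down.
use std::env;
use std::path::PathBuf;

fn datasets_dir() -> PathBuf {
    // Re-run the build script when the variable changes, so switching
    // directories re-resolves the dataset paths.
    println!("cargo:rerun-if-env-changed=MILLI_BENCH_DATASETS_PATH");
    match env::var("MILLI_BENCH_DATASETS_PATH") {
        // A user-provided directory is reused across runs: datasets already
        // present there are not downloaded again.
        Ok(path) => PathBuf::from(path),
        // Otherwise fall back to Cargo's per-build output directory.
        Err(_) => PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set by Cargo")),
    }
}

fn main() {
    let dir = datasets_dir();
    let dataset = dir.join("smol-songs.csv"); // illustrative file name
    if dataset.exists() {
        eprintln!("{} already exists, skipping download", dataset.display());
    } else {
        // download and uncompress the dataset here (omitted in this sketch)
    }
}

Under this scheme, `touch build.rs` only forces recompilation; files already sitting in the datasets directory short-circuit the download, which is what the README comments describe.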
benchmarks/benches/indexing.rs (new file, 314 lines)

@@ -0,0 +1,314 @@
mod datasets_paths;

use std::fs::{create_dir_all, remove_dir_all, File};
use std::path::Path;

use criterion::{criterion_group, criterion_main, Criterion};
use heed::EnvOpenOptions;
use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
use milli::Index;

#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

fn setup_dir(path: impl AsRef<Path>) {
    match remove_dir_all(path.as_ref()) {
        Ok(_) => (),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
        Err(e) => panic!("{}", e),
    }
    create_dir_all(path).unwrap();
}

fn setup_index() -> Index {
    let path = "benches.mmdb";
    setup_dir(&path);
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    options.max_readers(10);
    Index::new(options, path).unwrap()
}

fn indexing_songs_default(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs with default settings", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs without faceted numbers", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });
    index.prepare_for_closing().wait();
}

fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing songs without any facets", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_SONGS
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });
    index.prepare_for_closing().wait();
}

fn indexing_wiki(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    // there are NO faceted fields at all

    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing wiki", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Csv);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!(
                    "could not find the dataset in: {}",
                    datasets_paths::SMOL_WIKI_ARTICLES
                ));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });
    index.prepare_for_closing().wait();
}

fn indexing_movies_default(c: &mut Criterion) {
    let index = setup_index();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    builder.set_primary_key("id".to_owned());
    let displayed_fields = ["title", "poster", "overview", "release_date", "genres"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    builder.set_displayed_fields(displayed_fields);

    let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);

    let faceted_fields = ["released_date", "genres"].iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(faceted_fields);

    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let index_ref = &index;

    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("Indexing movies with default settings", |b| {
        b.iter_with_setup(
            move || {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
                builder.execute().unwrap();
                wtxn.commit().unwrap();
                ()
            },
            move |_| {
                let update_builder = UpdateBuilder::new(0);
                let mut wtxn = index_ref.write_txn().unwrap();
                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);

                builder.update_format(UpdateFormat::Json);
                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
                let reader = File::open(datasets_paths::MOVIES)
                    .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES));
                builder.execute(reader, |_, _| ()).unwrap();
                wtxn.commit().unwrap();
            },
        )
    });

    index.prepare_for_closing().wait();
}

criterion_group!(
    benches,
    indexing_songs_default,
    indexing_songs_without_faceted_numbers,
    indexing_songs_without_faceted_fields,
    indexing_wiki,
    indexing_movies_default
);
criterion_main!(benches);
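Each bench function above follows the same Criterion pattern: the first closure handed to `iter_with_setup` wipes the index and is not timed, the second closure re-indexes the whole dataset and is timed, and `sample_size(10)` keeps the number of expensive iterations low. A self-contained sketch of that pattern with a toy workload instead of milli (everything here is illustrative, not PR code):

// Illustrative sketch of the Criterion pattern used in indexing.rs:
// `iter_with_setup` runs the first closure before every timed iteration and
// only measures the second one, so the setup work never pollutes the result.
use criterion::{criterion_group, criterion_main, Criterion};

fn bench_sort(c: &mut Criterion) {
    let mut group = c.benchmark_group("sorting");
    // Expensive routines warrant a small sample size, as in the indexing benches.
    group.sample_size(10);
    group.bench_function("sort 100k reversed integers", |b| {
        b.iter_with_setup(
            // setup: build the input; this closure is NOT measured
            || (0..100_000u32).rev().collect::<Vec<_>>(),
            // routine: only this closure is measured
            |mut v| {
                v.sort_unstable();
                v
            },
        )
    });
    group.finish();
}

criterion_group!(benches, bench_sort);
criterion_main!(benches);

Whatever the setup closure returns is passed by value into the measured closure, so preparation cost, like deleting all documents above, never leaks into the reported indexing times.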
benchmarks/build.rs

@@ -10,8 +10,9 @@ use reqwest::IntoUrl;
 
 const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";
 
-const DATASET_SONGS: &str = "smol-songs";
-const DATASET_WIKI: &str = "smol-wiki-articles";
+const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
+const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
+const DATASET_MOVIES: (&str, &str) = ("movies", "json");
 
 /// The name of the environment variable used to select the path
 /// of the directory containing the datasets

@@ -31,9 +32,9 @@ fn main() -> anyhow::Result<()> {
     )?;
     writeln!(manifest_paths_file)?;
 
-    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+    for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
         let out_path = out_dir.join(dataset);
-        let out_file = out_path.with_extension("csv");
+        let out_file = out_path.with_extension(extension);
 
         writeln!(
             &mut manifest_paths_file,

@@ -45,15 +46,15 @@ fn main() -> anyhow::Result<()> {
         if out_file.exists() {
             eprintln!(
                 "The dataset {} already exists on the file system and will not be downloaded again",
-                dataset
+                out_path.display(),
             );
             continue;
         }
-        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension);
         eprintln!("downloading: {}", url);
         let bytes = download_dataset(url.clone())?;
         eprintln!("{} downloaded successfully", url);
-        eprintln!("uncompressing in {}", out_path.display());
+        eprintln!("uncompressing in {}", out_file.display());
         uncompress_in_file(bytes, &out_file)?;
     }
 