Add a new songs benchmark to test multi batch indexing

2025-07-03 11:57:07 +02:00 · 2022-02-21 16:30:13 +01:00 · 2022-02-21 16:30:13 +01:00 · ab5247dc64
commit ab5247dc64
parent acd9535588
2 changed files with 86 additions and 1 deletions
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@ -83,6 +83,77 @@ fn indexing_songs_default(c: &mut Criterion) {
    });
 }

+fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
+    let mut group = c.benchmark_group("indexing");
+    group.sample_size(10);
+    group.bench_function("Indexing songs in three batches with default settings", |b| {
+        b.iter_with_setup(
+            move || {
+                let index = setup_index();
+
+                let config = IndexerConfig::default();
+                let mut wtxn = index.write_txn().unwrap();
+                let mut builder = Settings::new(&mut wtxn, &index, &config);
+
+                builder.set_primary_key("id".to_owned());
+                let displayed_fields =
+                    ["title", "album", "artist", "genre", "country", "released", "duration"]
+                        .iter()
+                        .map(|s| s.to_string())
+                        .collect();
+                builder.set_displayed_fields(displayed_fields);
+
+                let searchable_fields =
+                    ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
+                builder.set_searchable_fields(searchable_fields);
+
+                let faceted_fields =
+                    ["released-timestamp", "duration-float", "genre", "country", "artist"]
+                        .iter()
+                        .map(|s| s.to_string())
+                        .collect();
+                builder.set_filterable_fields(faceted_fields);
+                builder.execute(|_| ()).unwrap();
+
+                // We index only one half of the dataset in the setup part
+                // as we don't care about the time it take.
+                let config = IndexerConfig::default();
+                let indexing_config = IndexDocumentsConfig::default();
+                let mut builder =
+                    IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+                let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv");
+                builder.add_documents(documents).unwrap();
+                builder.execute().unwrap();
+
+                wtxn.commit().unwrap();
+
+                index
+            },
+            move |index| {
+                let config = IndexerConfig::default();
+                let indexing_config = IndexDocumentsConfig::default();
+                let mut wtxn = index.write_txn().unwrap();
+                let mut builder =
+                    IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+                let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv");
+                builder.add_documents(documents).unwrap();
+                builder.execute().unwrap();
+
+                let indexing_config = IndexDocumentsConfig::default();
+                let mut builder =
+                    IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+                let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv");
+                builder.add_documents(documents).unwrap();
+                builder.execute().unwrap();
+
+                wtxn.commit().unwrap();
+
+                index.prepare_for_closing().wait();
+            },
+        )
+    });
+}
+
 fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
@ -332,6 +403,7 @@ criterion_group!(
    indexing_songs_default,
    indexing_songs_without_faceted_numbers,
    indexing_songs_without_faceted_fields,
+    indexing_songs_in_three_batches_default,
    indexing_wiki,
    indexing_movies_default,
    indexing_geo
--- a/benchmarks/build.rs
+++ b/benchmarks/build.rs
@ -11,10 +11,23 @@ use reqwest::IntoUrl;
 const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";

 const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
+const DATASET_SONGS_1_2: (&str, &str) = ("smol-songs-1_2", "csv");
+const DATASET_SONGS_3_4: (&str, &str) = ("smol-songs-3_4", "csv");
+const DATASET_SONGS_4_4: (&str, &str) = ("smol-songs-4_4", "csv");
 const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
 const DATASET_MOVIES: (&str, &str) = ("movies", "json");
 const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl");

+const ALL_DATASETS: &[(&str, &str)] = &[
+    DATASET_SONGS,
+    DATASET_SONGS_1_2,
+    DATASET_SONGS_3_4,
+    DATASET_SONGS_4_4,
+    DATASET_WIKI,
+    DATASET_MOVIES,
+    DATASET_GEO,
+];
+
 /// The name of the environment variable used to select the path
 /// of the directory containing the datasets
 const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
@ -33,7 +46,7 @@ fn main() -> anyhow::Result<()> {
    )?;
    writeln!(manifest_paths_file)?;

-    for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES, DATASET_GEO] {
+    for (dataset, extension) in ALL_DATASETS {
        let out_path = out_dir.join(dataset);
        let out_file = out_path.with_extension(extension);