move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli

tamo 2021-05-25 17:09:14 +02:00 committed by Tamo
parent 3c84075d2d
commit 06c414a753
10 changed files with 154 additions and 55 deletions

benchmarks/Cargo.toml (new file)

@@ -0,0 +1,29 @@
[package]
name = "benchmarks"
version = "0.1.0"
edition = "2018"
publish = false

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
milli = { path = "../milli" }

[dev-dependencies]
heed = "*" # we want to use the version milli uses
criterion = "0.3.4"

[build-dependencies]
anyhow = "1.0"
bytes = "1.0"
flate2 = "1.0.20"
convert_case = "0.4"
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }

[[bench]]
name = "songs"
harness = false

[[bench]]
name = "wiki"
harness = false

benchmarks/README.md (new file)

@@ -0,0 +1,30 @@
Benchmarks
==========

For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:

```
xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
```

You can download the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz), and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).

We also use a subset of `wikipedia-articles.csv` that was generated with the following command:

```
xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
```

You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).

-----

- To run all the benchmarks we recommend using `cargo bench`; this should take around 4h
- You can also run the benchmarks on the `songs` dataset only with `cargo bench --bench songs`; it should take around 1h
- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h

By default the datasets will be downloaded and uncompressed automatically into the target directory.
If you don't want to download the datasets every time you update something in the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:

```
mkdir ~/datasets
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
touch build.rs
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
```

benchmarks/benches/songs.rs (new file)

@@ -0,0 +1,210 @@
mod datasets_paths;
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = [
"id", "title", "album", "artist", "genre", "country", "released", "duration",
]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "album", "artist"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_searchable_fields(searchable_fields);
let faceted_fields = [
("released-timestamp", "number"),
("duration-float", "number"),
("genre", "string"),
("country", "string"),
("artist", "string"),
]
.iter()
.map(|(a, b)| (a.to_string(), b.to_string()))
.collect();
builder.set_faceted_fields(faceted_fields);
}
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_SONGS,
queries: &[
"john ", // 9097
"david ", // 4794
"charles ", // 1957
"david bowie ", // 1200
"michael jackson ", // 600
"thelonious monk ", // 303
"charles mingus ", // 142
"marcus miller ", // 60
"tamo ", // 13
"Notstandskomitee ", // 4
],
configure: base_conf,
..Conf::BASE
};
fn bench_songs(c: &mut criterion::Criterion) {
let default_criterion: Vec<String> = milli::default_criteria()
.iter()
.map(|criteria| criteria.to_string())
.collect();
let default_criterion = default_criterion.iter().map(|s| s.as_str());
let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"black saint sinner lady ",
"les dangeureuses 1960 ",
"The Disneyland Sing-Along Chorus ",
"Under Great Northern Lights ",
"7000 Danses Un Jour Dans Notre Vie ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"mongus ",
"thelonius monk ",
"Disnaylande ",
"the white striper ",
"indochie ",
"indochien ",
"klub des loopers ",
"fear of the duck ",
"michel depech ",
"stromal ",
"dire straights ",
"Arethla Franklin ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop
"les liaisons dangeureuses 1793 ", // one word to pop
"The Disneyland Children's Sing-Alone song ", // two words to pop
"seven nation mummy ", // one word to pop
"7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop
"Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
"whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
],
criterion: Some(&["words"]),
..BASE_CONF
},
utils::Conf {
group_name: "asc",
criterion: Some(&["asc(released-timestamp)"]),
..BASE_CONF
},
utils::Conf {
group_name: "desc",
criterion: Some(&["desc(released-timestamp)"]),
..BASE_CONF
},
/* then we bench the asc and desc criteria on top of the default criteria */
utils::Conf {
group_name: "asc + default",
criterion: Some(&asc_default[..]),
..BASE_CONF
},
utils::Conf {
group_name: "desc + default",
criterion: Some(&desc_default[..]),
..BASE_CONF
},
/* we bench the filters with the default request */
utils::Conf {
group_name: "basic filter: <=",
facet_condition: Some("released-timestamp <= 946728000"), // year 2000
..BASE_CONF
},
utils::Conf {
group_name: "basic filter: TO",
facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
..BASE_CONF
},
utils::Conf {
group_name: "big filter",
facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criteria in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"s", // 500k+ results
"a", //
"b", //
"i", //
"x", // only 7k results
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_songs);
criterion_main!(benches);

benchmarks/benches/utils.rs (new file)

@@ -0,0 +1,114 @@
use std::fs::{create_dir_all, remove_dir_all, File};
use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::{
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
FacetCondition, Index,
};
pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory
/// each benchmark will first try to delete it and then recreate it
pub database_name: &'a str,
/// the dataset to be used; it must be an uncompressed CSV
pub dataset: &'a str,
pub group_name: &'a str,
pub queries: &'a [&'a str],
/// here you can change which criteria are used and in which order.
/// - if you specify something, all the base configuration will be thrown out
/// - if you don't specify anything (None), the default configuration will be kept
pub criterion: Option<&'a [&'a str]>,
/// the last chance to configure your database as you want
pub configure: fn(&mut Settings),
pub facet_condition: Option<&'a str>,
/// enable or disable the optional words on the query
pub optional_words: bool,
/// primary key; if it is None, we'll auto-generate docids for every document
pub primary_key: Option<&'a str>,
}
impl Conf<'_> {
fn nop(_builder: &mut Settings) {}
pub const BASE: Self = Conf {
database_name: "benches.mmdb",
dataset: "",
group_name: "",
queries: &[],
criterion: None,
configure: Self::nop,
facet_condition: None,
optional_words: true,
primary_key: None,
};
}
pub fn base_setup(conf: &Conf) -> Index {
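// wipe the database of a previous run first; a directory that does not exist
// yet is not an error, anything else is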
match remove_dir_all(&conf.database_name) {
Ok(_) => (),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
Err(e) => panic!("{}", e),
}
create_dir_all(&conf.database_name).unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
options.max_readers(10);
let index = Index::new(options, conf.database_name).unwrap();
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.settings(&mut wtxn, &index);
if let Some(criterion) = conf.criterion {
builder.reset_faceted_fields();
builder.reset_criteria();
builder.reset_stop_words();
let criterion = criterion.iter().map(|s| s.to_string()).collect();
builder.set_criteria(criterion);
}
(conf.configure)(&mut builder);
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
let reader = File::open(conf.dataset)
.expect(&format!("could not find the dataset in: {}", conf.dataset));
builder.execute(reader, |_, _| ()).unwrap();
wtxn.commit().unwrap();
index
}
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
for conf in confs {
let index = base_setup(conf);
let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name));
for &query in conf.queries {
group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
b.iter(|| {
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
search.query(query).optional_words(conf.optional_words);
if let Some(facet_condition) = conf.facet_condition {
let facet_condition =
FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap();
search.facet_condition(facet_condition);
}
let _ids = search.execute().unwrap();
});
});
}
group.finish();
}
}
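
As a sketch of how the pieces fit together: a new bench file only needs to fill in the `Conf` fields it cares about, fall back to `Conf::BASE` for the rest, and hand everything to `run_benches`. This minimal example is illustrative (the `bench_minimal` name is hypothetical; the dataset constant comes from the generated `datasets_paths.rs` described with the build script below):

```
mod datasets_paths;
mod utils;

use criterion::{criterion_group, criterion_main};
use utils::Conf;

fn bench_minimal(c: &mut criterion::Criterion) {
    // every field not listed here keeps its default value from Conf::BASE
    let confs = &[Conf {
        group_name: "minimal",
        dataset: datasets_paths::SMOL_SONGS,
        queries: &["david bowie "],
        ..Conf::BASE
    }];
    utils::run_benches(c, confs);
}

criterion_group!(benches, bench_minimal);
criterion_main!(benches);
```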

benchmarks/benches/wiki.rs (new file)

@@ -0,0 +1,133 @@
mod datasets_paths;
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = ["title", "body", "url"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
builder.set_searchable_fields(searchable_fields);
}
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_WIKI_ARTICLES,
queries: &[
"mingus ", // 46 candidates
"miles davis ", // 159
"rock and roll ", // 1007
"machine ", // 3448
"spain ", // 7002
"japan ", // 10.593
"france ", // 17.616
"film ", // 24.959
],
configure: base_conf,
..Conf::BASE
};
fn bench_wiki(c: &mut criterion::Criterion) {
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"herald sings ",
"april paris ",
"tea two ",
"diesel engine ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"migrosoft ",
"linax ",
"Disnaylande ",
"phytogropher ",
"nympalidea ",
"aritmetric ",
"the fronce ",
"sisan ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
"Kameya Tokujirō mingus monk ", // two words to pop, 55
"Ulrich Hensel meilisearch milli ", // two words to pop, 306
"Idaho Bellevue pizza ", // one word to pop, 800
"Abraham machin ", // one word to pop, 1141
],
criterion: Some(&["words"]),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criteria in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"t", // 453k results
"c", // 405k
"g", // 318k
"j", // 227k
"q", // 71k
"x", // 17k
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_wiki);
criterion_main!(benches);

benchmarks/build.rs (new file)

@@ -0,0 +1,80 @@
use std::path::{Path, PathBuf};
use std::{env, fs};
use std::{
fs::File,
io::{Cursor, Read, Seek, Write},
};
use bytes::Bytes;
use convert_case::{Case, Casing};
use flate2::read::GzDecoder;
use reqwest::IntoUrl;
const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
const DATASET_SONGS: &str = "smol-songs";
const DATASET_WIKI: &str = "smol-wiki-articles";
/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
fn main() -> anyhow::Result<()> {
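// write the datasets into the directory given through the MILLI_BENCH_DATASETS_PATH
// environment variable, or fall back to Cargo's OUT_DIR in the target directory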
let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));
let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
writeln!(
manifest_paths_file,
r#"//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]
"#
)?;
writeln!(manifest_paths_file)?;
for dataset in &[DATASET_SONGS, DATASET_WIKI] {
let out_path = out_dir.join(dataset);
let out_file = out_path.with_extension("csv");
writeln!(
&mut manifest_paths_file,
r#"pub const {}: &str = {:?};"#,
dataset.to_case(Case::ScreamingSnake),
out_file.display(),
)?;
if out_file.exists() {
eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
continue;
}
let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
eprintln!("downloading: {}", url);
let bytes = download_dataset(url.clone())?;
eprintln!("{} downloaded successfully", url);
eprintln!("uncompressing in {}", out_path.display());
uncompress_in_file(bytes, &out_file)?;
}
Ok(())
}
fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
let bytes = reqwest::blocking::Client::builder()
.timeout(None)
.build()?
.get(url)
.send()?
.bytes()?;
Ok(Cursor::new(bytes))
}
fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
let path = path.as_ref();
let mut gz = GzDecoder::new(bytes);
let mut dataset = Vec::new();
gz.read_to_end(&mut dataset)?;
fs::write(path, dataset)?;
Ok(())
}
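
For reference, the `benches/datasets_paths.rs` file written by this build script looks roughly like the sketch below; the constant names come from the dataset names converted to screaming snake case, while the exact paths depend on `OUT_DIR` or `MILLI_BENCH_DATASETS_PATH` (the ones shown here are illustrative):

```
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

pub const SMOL_SONGS: &str = "/path/to/datasets/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str = "/path/to/datasets/smol-wiki-articles.csv";
```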

benchmarks/src/lib.rs (new file)

@@ -0,0 +1,5 @@
//! This library is only used to isolate the benchmarks
//! from the original milli library.
//!
//! It does not include functions that are interesting for milli library
//! users, only for milli contributors.