move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli
This commit is contained in:
parent 3c84075d2d
commit 06c414a753
Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["milli", "http-ui", "infos", "helpers", "search"]
+members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
 default-members = ["milli"]
 
 [profile.release]
benchmarks/Cargo.toml (new file, 29 lines)
@@ -0,0 +1,29 @@
[package]
name = "benchmarks"
version = "0.1.0"
edition = "2018"
publish = false

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
milli = { path = "../milli" }

[dev-dependencies]
heed = "*" # we want to use the version milli uses
criterion = "0.3.4"

[build-dependencies]
anyhow = "1.0"
bytes = "1.0"
flate2 = "1.0.20"
convert_case = "0.4"
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }

[[bench]]
name = "songs"
harness = false

[[bench]]
name = "wiki"
harness = false
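The two `[[bench]]` targets above set `harness = false`, which tells cargo not to link the default libtest bench harness; criterion supplies its own `main` instead. A minimal sketch of what such a bench target looks like (the function body here is a hypothetical placeholder, not one of the real benches, which appear in the diffs below):

```rust
use criterion::{criterion_group, criterion_main, Criterion};

// Placeholder benchmark body; the real benches index a dataset and
// run search queries against milli.
fn bench_example(c: &mut Criterion) {
    c.bench_function("example", |b| b.iter(|| 2 + 2));
}

criterion_group!(benches, bench_example);
criterion_main!(benches); // expands to the `main` that `harness = false` requires
```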
benchmarks/README.md (new file, 30 lines)
@@ -0,0 +1,30 @@
Benchmarks
==========

For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:
```
xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
```
You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz),
and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).

We also use a subset of `wikipedia-articles.csv` that was generated with the following command:
```
xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
```
You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).

-----

- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h.
- You can also run the benchmarks on the `songs` dataset alone with `cargo bench --bench songs`; it should take around 1h.
- And on the `wiki` dataset alone with `cargo bench --bench wiki`; it should take around 3h.

By default the datasets are downloaded and uncompressed automatically into the target directory.
If you don't want to re-download the datasets every time you update something in the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
```
mkdir ~/datasets
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
touch build.rs
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
```
benchmarks/benches/songs.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -33,7 +34,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-songs.csv",
+    dataset: datasets_paths::SMOL_SONGS,
     queries: &[
         "john ",  // 9097
         "david ", // 4794
benchmarks/benches/utils.rs
@@ -7,15 +7,6 @@ use milli::{
     FacetCondition, Index,
 };
 
-/// The name of the environment variable used to select the path
-/// of the directory containing the datasets
-const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
-
-/// The default path for the dataset if nothing is specified
-/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be
-/// executed with a pwd of `milli/milli`
-const DEFAULT_DATASETS_PATH: &str = "milli/benches";
-
 pub struct Conf<'a> {
     /// where we are going to create our database.mmdb directory
     /// each benchmark will first try to delete it and then recreate it
@@ -33,6 +24,8 @@ pub struct Conf<'a> {
     pub facet_condition: Option<&'a str>,
     /// enable or disable the optional words on the query
     pub optional_words: bool,
+    /// primary key; if it is None we'll auto-generate docids for every document
+    pub primary_key: Option<&'a str>,
 }
 
 impl Conf<'_> {
@@ -47,6 +40,7 @@ impl Conf<'_> {
         configure: Self::nop,
         facet_condition: None,
         optional_words: true,
+        primary_key: None,
     };
 }
 
@@ -86,13 +80,8 @@ pub fn base_setup(conf: &Conf) -> Index {
     let mut builder = update_builder.index_documents(&mut wtxn, &index);
     builder.update_format(UpdateFormat::Csv);
     builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-    // when called from cargo, the current directory is supposed to be milli/milli
-    let base_dataset_path = std::env::vars()
-        .find(|var| var.0 == BASE_DATASETS_PATH_KEY)
-        .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
-    let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
-    let reader = File::open(&dataset_path)
-        .expect(&format!("could not find the dataset in: {}", &dataset_path));
+    let reader = File::open(conf.dataset)
+        .expect(&format!("could not find the dataset in: {}", conf.dataset));
     builder.execute(reader, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
 
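Taken together, these changes move all path resolution out of the benches: the build script (shown below) downloads each dataset and bakes its absolute path into a generated constant, so `base_setup` can pass `conf.dataset` straight to `File::open` with no env-var lookup at run time. A minimal, self-contained model of that pattern, assuming a hypothetical path and a stripped-down `Conf`:

```rust
use std::fs::File;

// In the real crate this module is generated by build.rs into benches/.
mod datasets_paths {
    pub const SMOL_SONGS: &str = "/tmp/datasets/smol-songs.csv"; // hypothetical path
}

// Stripped-down stand-in for the Conf struct from benches/utils.rs.
struct Conf<'a> {
    dataset: &'a str,
}

// The dataset path is fixed at compile time.
const BASE_CONF: Conf = Conf { dataset: datasets_paths::SMOL_SONGS };

fn main() {
    // The bench opens the dataset directly, exactly like base_setup above.
    let _reader = File::open(BASE_CONF.dataset).expect("could not find the dataset");
}
```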
benchmarks/benches/wiki.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -16,7 +17,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-wiki-articles.csv",
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
     queries: &[
         "mingus ",      // 46 candidates
         "miles davis ", // 159
benchmarks/build.rs (new file, 80 lines)
@@ -0,0 +1,80 @@
use std::path::{Path, PathBuf};
use std::{env, fs};
use std::{
    fs::File,
    io::{Cursor, Read, Seek, Write},
};

use bytes::Bytes;
use convert_case::{Case, Casing};
use flate2::read::GzDecoder;
use reqwest::IntoUrl;

const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";

const DATASET_SONGS: &str = "smol-songs";
const DATASET_WIKI: &str = "smol-wiki-articles";

/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";

fn main() -> anyhow::Result<()> {
    let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));

    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
    writeln!(
        manifest_paths_file,
        r#"//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]
"#
    )?;
    writeln!(manifest_paths_file)?;

    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
        let out_path = out_dir.join(dataset);
        let out_file = out_path.with_extension("csv");

        writeln!(
            &mut manifest_paths_file,
            r#"pub const {}: &str = {:?};"#,
            dataset.to_case(Case::ScreamingSnake),
            out_file.display(),
        )?;

        if out_file.exists() {
            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
            continue;
        }
        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
        eprintln!("downloading: {}", url);
        let bytes = download_dataset(url.clone())?;
        eprintln!("{} downloaded successfully", url);
        eprintln!("uncompressing in {}", out_path.display());
        uncompress_in_file(bytes, &out_file)?;
    }

    Ok(())
}

fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
    let bytes = reqwest::blocking::Client::builder()
        .timeout(None)
        .build()?
        .get(url)
        .send()?
        .bytes()?;
    Ok(Cursor::new(bytes))
}

fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
    let path = path.as_ref();
    let mut gz = GzDecoder::new(bytes);
    let mut dataset = Vec::new();
    gz.read_to_end(&mut dataset)?;

    fs::write(path, dataset)?;
    Ok(())
}
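For illustration, the `benches/datasets_paths.rs` file this build script emits would look roughly like the following; the exact paths are machine-specific, determined by `OUT_DIR` or by `MILLI_BENCH_DATASETS_PATH` when it is set (the paths shown here are hypothetical):

```rust
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

pub const SMOL_SONGS: &str = "/path/to/target/debug/build/benchmarks-<hash>/out/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str = "/path/to/target/debug/build/benchmarks-<hash>/out/smol-wiki-articles.csv";
```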
benchmarks/src/lib.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
//! This library is only used to isolate the benchmarks
//! from the original milli library.
//!
//! It does not include interesting functions for milli library
//! users, only for milli contributors.
milli/Cargo.toml
@@ -53,17 +53,8 @@ tinytemplate = "=1.1.0"
 
 [dev-dependencies]
 big_s = "1.0.2"
-criterion = "0.3.4"
 maplit = "1.0.2"
 rand = "0.8.3"
 
 [features]
 default = []
-
-[[bench]]
-name = "songs"
-harness = false
-
-[[bench]]
-name = "wiki"
-harness = false
milli/benches/README.md (deleted file, 27 lines)
@@ -1,27 +0,0 @@
Benchmarks
==========

For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:
```
xsv sample --seed 42 songs.csv -o smol-songs.csv
```
You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz),
and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).

You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz`.
You can run the following command from the root of this git repository:
```
wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
```

- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h.
- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`; it should take around 1h.
- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h.

By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH`, like this:
```
MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
```

Our benchmarking suite uses criterion, which allows a lot of configuration; see its documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html).