move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli

parent 3c84075d2d · commit 06c414a753
Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["milli", "http-ui", "infos", "helpers", "search"]
+members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
 default-members = ["milli"]
 
 [profile.release]

benchmarks/Cargo.toml (new file, 29 lines)
@@ -0,0 +1,29 @@
+[package]
+name = "benchmarks"
+version = "0.1.0"
+edition = "2018"
+publish = false
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+milli = { path = "../milli" }
+
+[dev-dependencies]
+heed = "*" # we want to use the version milli uses
+criterion = "0.3.4"
+
+[build-dependencies]
+anyhow = "1.0"
+bytes = "1.0"
+flate2 = "1.0.20"
+convert_case = "0.4"
+reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
+
+[[bench]]
+name = "songs"
+harness = false
+
+[[bench]]
+name = "wiki"
+harness = false
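A note on `harness = false`: it disables libtest's default benchmark harness for these targets so that criterion can provide its own `main`. A minimal sketch of such a criterion bench (illustrative only; the actual `songs` and `wiki` benches appear further down in this diff):

```rust
use criterion::{black_box, criterion_group, criterion_main, Criterion};

// A placeholder benchmark; the real benches in this commit index a dataset
// and time search queries instead.
fn bench_example(c: &mut Criterion) {
    c.bench_function("example", |b| b.iter(|| black_box(2) + black_box(2)));
}

criterion_group!(benches, bench_example);
criterion_main!(benches); // expands to `main`, which is why `harness = false` is needed
```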

benchmarks/README.md (new file, 30 lines)
@@ -0,0 +1,30 @@
+Benchmarks
+==========
+
+For our benchmarks we use a small subset of the dataset `songs.csv`. It was generated with this command:
+```
+xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
+```
+You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz),
+and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
+
+We also use a subset of `wikipedia-articles.csv` that was generated with the following command:
+```
+xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
+```
+You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz).
+
+-----
+
+- To run all the benchmarks, we recommend using `cargo bench`; this should take around 4h
+- You can also run the benchmarks on the `songs` dataset only with `cargo bench --bench songs`; it should take around 1h
+- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h
+
+By default the datasets are downloaded and uncompressed automatically into the target directory.
+If you don't want to download the datasets every time you update the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
+```
+mkdir ~/datasets
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+touch build.rs
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
+```

benchmarks/benches/songs.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -33,7 +34,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-songs.csv",
+    dataset: datasets_paths::SMOL_SONGS,
     queries: &[
         "john ",  // 9097
         "david ", // 4794

benchmarks/benches/utils.rs
@@ -7,15 +7,6 @@ use milli::{
     FacetCondition, Index,
 };
 
-/// The name of the environment variable used to select the path
-/// of the directory containing the datasets
-const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
-
-/// The default path for the dataset if nothing is specified
-/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be
-/// executed with a pwd of `milli/milli`
-const DEFAULT_DATASETS_PATH: &str = "milli/benches";
-
 pub struct Conf<'a> {
     /// where we are going to create our database.mmdb directory
     /// each benchmark will first try to delete it and then recreate it
@@ -33,6 +24,8 @@ pub struct Conf<'a> {
     pub facet_condition: Option<&'a str>,
     /// enable or disable the optional words on the query
     pub optional_words: bool,
+    /// the primary key; if None, docids are auto-generated for every document
+    pub primary_key: Option<&'a str>,
 }
 
 impl Conf<'_> {
@@ -47,6 +40,7 @@ impl Conf<'_> {
         configure: Self::nop,
         facet_condition: None,
         optional_words: true,
+        primary_key: None,
     };
 }
 
@@ -86,13 +80,8 @@ pub fn base_setup(conf: &Conf) -> Index {
     let mut builder = update_builder.index_documents(&mut wtxn, &index);
     builder.update_format(UpdateFormat::Csv);
    builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-    // we called from cargo the current directory is supposed to be milli/milli
-    let base_dataset_path = std::env::vars()
-        .find(|var| var.0 == BASE_DATASETS_PATH_KEY)
-        .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
-    let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
-    let reader = File::open(&dataset_path)
-        .expect(&format!("could not find the dataset in: {}", &dataset_path));
+    let reader = File::open(conf.dataset)
+        .expect(&format!("could not find the dataset in: {}", conf.dataset));
     builder.execute(reader, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
 
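Net effect: `base_setup` no longer resolves dataset paths itself, so `conf.dataset` must now carry a full path. The benches obtain one from the constants that the build script generates. A condensed sketch of the assumed wiring (the `Conf::BASE` associated const is inferred from the `impl Conf<'_>` hunk above):

```rust
// Condensed sketch of benchmarks/benches/songs.rs after this change
mod datasets_paths; // generated into benches/ by build.rs (see below)
mod utils;

use crate::utils::Conf;

const BASE_CONF: Conf = Conf {
    // an absolute path such as "<out dir>/smol-songs.csv", fixed at build time
    dataset: datasets_paths::SMOL_SONGS,
    ..Conf::BASE // assumed associated const holding the defaults shown above
};
```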

benchmarks/benches/wiki.rs
@@ -1,3 +1,4 @@
+mod datasets_paths;
 mod utils;
 
 use criterion::{criterion_group, criterion_main};
@@ -16,7 +17,7 @@ fn base_conf(builder: &mut Settings) {
 }
 
 const BASE_CONF: Conf = Conf {
-    dataset: "smol-wiki-articles.csv",
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
     queries: &[
         "mingus ",      // 46 candidates
         "miles davis ", // 159

benchmarks/build.rs (new file, 80 lines)
@@ -0,0 +1,80 @@
+use std::path::{Path, PathBuf};
+use std::{env, fs};
+use std::{
+    fs::File,
+    io::{Cursor, Read, Seek, Write},
+};
+
+use bytes::Bytes;
+use convert_case::{Case, Casing};
+use flate2::read::GzDecoder;
+use reqwest::IntoUrl;
+
+const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
+
+const DATASET_SONGS: &str = "smol-songs";
+const DATASET_WIKI: &str = "smol-wiki-articles";
+
+/// The name of the environment variable used to select the path
+/// of the directory containing the datasets
+const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
+
+fn main() -> anyhow::Result<()> {
+    let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?));
+
+    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
+    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
+    writeln!(
+        manifest_paths_file,
+        r#"//! This file is generated by the build script.
+//! Do not modify by hand, use the build.rs file.
+#![allow(dead_code)]
+"#
+    )?;
+    writeln!(manifest_paths_file)?;
+
+    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+        let out_path = out_dir.join(dataset);
+        let out_file = out_path.with_extension("csv");
+
+        writeln!(
+            &mut manifest_paths_file,
+            r#"pub const {}: &str = {:?};"#,
+            dataset.to_case(Case::ScreamingSnake),
+            out_file.display(),
+        )?;
+
+        if out_file.exists() {
+            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
+            continue;
+        }
+        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        eprintln!("downloading: {}", url);
+        let bytes = download_dataset(url.clone())?;
+        eprintln!("{} downloaded successfully", url);
+        eprintln!("uncompressing in {}", out_path.display());
+        uncompress_in_file(bytes, &out_file)?;
+    }
+
+    Ok(())
+}
+
+fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
+    let bytes = reqwest::blocking::Client::builder()
+        .timeout(None)
+        .build()?
+        .get(url)
+        .send()?
+        .bytes()?;
+    Ok(Cursor::new(bytes))
+}
+
+fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
+    let path = path.as_ref();
+    let mut gz = GzDecoder::new(bytes);
+    let mut dataset = Vec::new();
+    gz.read_to_end(&mut dataset)?;
+
+    fs::write(path, dataset)?;
+    Ok(())
+}
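For reference, the `datasets_paths.rs` file that this build script writes into `benches/` would look roughly like the following; the exact paths depend on `OUT_DIR` or `MILLI_BENCH_DATASETS_PATH`, so the values shown here are purely illustrative:

```rust
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

// Illustrative values; the real paths point into Cargo's OUT_DIR
// or into the directory named by MILLI_BENCH_DATASETS_PATH.
pub const SMOL_SONGS: &str = "/home/user/milli/target/debug/build/benchmarks-1234/out/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str = "/home/user/milli/target/debug/build/benchmarks-1234/out/smol-wiki-articles.csv";
```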

benchmarks/src/lib.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
+//! This library is only used to isolate the benchmarks
+//! from the original milli library.
+//!
+//! It does not include functions interesting for milli library
+//! users, only for milli contributors.

milli/Cargo.toml
@@ -53,17 +53,8 @@ tinytemplate = "=1.1.0"
 
 [dev-dependencies]
 big_s = "1.0.2"
-criterion = "0.3.4"
 maplit = "1.0.2"
 rand = "0.8.3"
 
 [features]
 default = []
-
-[[bench]]
-name = "songs"
-harness = false
-
-[[bench]]
-name = "wiki"
-harness = false

milli/benches/README.md (deleted, 27 lines)
@@ -1,27 +0,0 @@
-Benchmarks
-==========
-
-For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command:
-```
-xsv sample --seed 42 song.csv -o smol-songs.csv
-```
-You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)
-And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
-
-You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz`
-You can run the following command from the root of this git repository
-```
-wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
-```
-
-- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h
-- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h
-- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h
-
-By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that:
-```
-MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
-```
-
-Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html)
-