mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
Merge #357
357: Add benchmarks for the geosearch r=Kerollmops a=irevoire closes #336 Should I merge this PR in #322 and then we merge everything in `main` or should we wait for #322 to be merged and then merge this one in `main` later? Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: Irevoire <tamo@meilisearch.com>
This commit is contained in:
commit
700318dc62
2
.github/workflows/benchmarks.yml
vendored
2
.github/workflows/benchmarks.yml
vendored
@ -4,7 +4,7 @@ on:
|
|||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
inputs:
|
inputs:
|
||||||
dataset_name:
|
dataset_name:
|
||||||
description: 'The name of the dataset used to benchmark (search_songs, search_wiki or indexing)'
|
description: 'The name of the dataset used to benchmark (search_songs, search_wiki, search_geo or indexing)'
|
||||||
required: false
|
required: false
|
||||||
default: 'search_songs'
|
default: 'search_songs'
|
||||||
|
|
||||||
|
@ -29,6 +29,10 @@ harness = false
|
|||||||
name = "search_wiki"
|
name = "search_wiki"
|
||||||
harness = false
|
harness = false
|
||||||
|
|
||||||
|
[[bench]]
|
||||||
|
name = "search_geo"
|
||||||
|
harness = false
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "indexing"
|
name = "indexing"
|
||||||
harness = false
|
harness = false
|
||||||
|
@ -36,7 +36,7 @@ To run all the benchmarks (~5h):
|
|||||||
cargo bench
|
cargo bench
|
||||||
```
|
```
|
||||||
|
|
||||||
To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:
|
To run only the `search_songs` (~1h), `search_wiki` (~3h), `search_geo` (~20m) or `indexing` (~2h) benchmark:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cargo bench --bench <dataset name>
|
cargo bench --bench <dataset name>
|
||||||
@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir ~/datasets
|
mkdir ~/datasets
|
||||||
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
|
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench search_songs # the four datasets are downloaded
|
||||||
touch build.rs
|
touch build.rs
|
||||||
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
|
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench search_songs # the code is compiled again but the datasets are not downloaded
|
||||||
```
|
```
|
||||||
@ -87,14 +87,15 @@ Run the comparison script:
|
|||||||
|
|
||||||
## Datasets
|
## Datasets
|
||||||
|
|
||||||
The benchmarks are available for the following datasets:
|
The benchmarks use the following datasets:
|
||||||
- `songs`
|
- `smol-songs`
|
||||||
- `wiki`
|
- `smol-wiki`
|
||||||
- `movies`
|
- `movies`
|
||||||
|
- `smol-all-countries`
|
||||||
|
|
||||||
### Songs
|
### Songs
|
||||||
|
|
||||||
`songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz).
|
`smol-songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz).
|
||||||
|
|
||||||
It was generated with this command:
|
It was generated with this command:
|
||||||
|
|
||||||
@ -102,11 +103,11 @@ It was generated with this command:
|
|||||||
xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
|
xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
|
||||||
```
|
```
|
||||||
|
|
||||||
_[Download the generated `songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._
|
_[Download the generated `smol-songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._
|
||||||
|
|
||||||
### Wiki
|
### Wiki
|
||||||
|
|
||||||
`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz).
|
`smol-wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz).
|
||||||
|
|
||||||
It was generated with the following command:
|
It was generated with the following command:
|
||||||
|
|
||||||
@ -114,9 +115,24 @@ It was generated with the following command:
|
|||||||
xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
|
xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
|
||||||
```
|
```
|
||||||
|
|
||||||
|
_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._
|
||||||
|
|
||||||
### Movies
|
### Movies
|
||||||
|
|
||||||
`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/).
|
`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/).
|
||||||
|
|
||||||
_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._
|
_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._
|
||||||
|
|
||||||
|
|
||||||
|
### All Countries
|
||||||
|
|
||||||
|
`smol-all-countries` is a subset of the [`all-countries.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/all-countries.csv.gz).
|
||||||
|
It has been converted to jsonlines and then edited so it matches our format for the `_geo` field.
|
||||||
|
|
||||||
|
It was generated with the following command:
|
||||||
|
```bash
|
||||||
|
bat all-countries.csv.gz | gunzip | xsv sample --seed 42 1000000 | csv2json-lite | sd '"latitude":"(.*?)","longitude":"(.*?)"' '"_geo": { "lat": $1, "lng": $2 }' | sd '\[|\]|,$' '' | gzip > smol-all-countries.jsonl.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
_[Download the `smol-all-countries` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-all-countries.jsonl.gz)._
|
||||||
|
|
||||||
|
@ -277,12 +277,69 @@ fn indexing_movies_default(c: &mut Criterion) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn indexing_geo(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("indexing");
|
||||||
|
group.sample_size(10);
|
||||||
|
group.bench_function("Indexing geo_point", |b| {
|
||||||
|
b.iter_with_setup(
|
||||||
|
move || {
|
||||||
|
let index = setup_index();
|
||||||
|
|
||||||
|
let update_builder = UpdateBuilder::new(0);
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update_builder.settings(&mut wtxn, &index);
|
||||||
|
|
||||||
|
builder.set_primary_key("geonameid".to_owned());
|
||||||
|
let displayed_fields =
|
||||||
|
["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"]
|
||||||
|
.iter()
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
.collect();
|
||||||
|
builder.set_displayed_fields(displayed_fields);
|
||||||
|
|
||||||
|
let searchable_fields =
|
||||||
|
["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||||
|
builder.set_searchable_fields(searchable_fields);
|
||||||
|
|
||||||
|
let filterable_fields =
|
||||||
|
["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||||
|
builder.set_filterable_fields(filterable_fields);
|
||||||
|
|
||||||
|
let sortable_fields =
|
||||||
|
["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||||
|
builder.set_sortable_fields(sortable_fields);
|
||||||
|
|
||||||
|
builder.execute(|_, _| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
index
|
||||||
|
},
|
||||||
|
move |index| {
|
||||||
|
let update_builder = UpdateBuilder::new(0);
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update_builder.index_documents(&mut wtxn, &index);
|
||||||
|
|
||||||
|
builder.update_format(UpdateFormat::JsonStream);
|
||||||
|
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
|
let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!(
|
||||||
|
"could not find the dataset in: {}",
|
||||||
|
datasets_paths::SMOL_ALL_COUNTRIES
|
||||||
|
));
|
||||||
|
builder.execute(reader, |_, _| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
index.prepare_for_closing().wait();
|
||||||
|
},
|
||||||
|
)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
criterion_group!(
|
criterion_group!(
|
||||||
benches,
|
benches,
|
||||||
indexing_songs_default,
|
indexing_songs_default,
|
||||||
indexing_songs_without_faceted_numbers,
|
indexing_songs_without_faceted_numbers,
|
||||||
indexing_songs_without_faceted_fields,
|
indexing_songs_without_faceted_fields,
|
||||||
indexing_wiki,
|
indexing_wiki,
|
||||||
indexing_movies_default
|
indexing_movies_default,
|
||||||
|
indexing_geo
|
||||||
);
|
);
|
||||||
criterion_main!(benches);
|
criterion_main!(benches);
|
||||||
|
123
benchmarks/benches/search_geo.rs
Normal file
123
benchmarks/benches/search_geo.rs
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
mod datasets_paths;
|
||||||
|
mod utils;
|
||||||
|
|
||||||
|
use criterion::{criterion_group, criterion_main};
|
||||||
|
use milli::update::{Settings, UpdateFormat};
|
||||||
|
use utils::Conf;
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
#[global_allocator]
|
||||||
|
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||||
|
|
||||||
|
fn base_conf(builder: &mut Settings) {
|
||||||
|
let displayed_fields =
|
||||||
|
["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"]
|
||||||
|
.iter()
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
.collect();
|
||||||
|
builder.set_displayed_fields(displayed_fields);
|
||||||
|
|
||||||
|
let searchable_fields =
|
||||||
|
["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||||
|
builder.set_searchable_fields(searchable_fields);
|
||||||
|
|
||||||
|
let filterable_fields =
|
||||||
|
["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||||
|
builder.set_filterable_fields(filterable_fields);
|
||||||
|
|
||||||
|
let sortable_fields =
|
||||||
|
["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||||
|
builder.set_sortable_fields(sortable_fields);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[rustfmt::skip]
|
||||||
|
const BASE_CONF: Conf = Conf {
|
||||||
|
dataset: datasets_paths::SMOL_ALL_COUNTRIES,
|
||||||
|
dataset_format: UpdateFormat::JsonStream,
|
||||||
|
queries: &[
|
||||||
|
"",
|
||||||
|
],
|
||||||
|
configure: base_conf,
|
||||||
|
primary_key: Some("geonameid"),
|
||||||
|
..Conf::BASE
|
||||||
|
};
|
||||||
|
|
||||||
|
fn bench_geo(c: &mut criterion::Criterion) {
|
||||||
|
#[rustfmt::skip]
|
||||||
|
let confs = &[
|
||||||
|
// A basic placeholder with no geo
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "placeholder with no geo",
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
// Medium aglomeration: probably the most common usecase
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "asc sort from Lille",
|
||||||
|
sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):asc"]),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "desc sort from Lille",
|
||||||
|
sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):desc"]),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
// Big agglomeration: a lot of documents close to our point
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "asc sort from Tokyo",
|
||||||
|
sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):asc"]),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "desc sort from Tokyo",
|
||||||
|
sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):desc"]),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
// The furthest point from any civilization
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "asc sort from Point Nemo",
|
||||||
|
sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):asc"]),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "desc sort from Point Nemo",
|
||||||
|
sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):desc"]),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
// Filters
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "filter of 100km from Lille",
|
||||||
|
filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 100000)"),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "filter of 1km from Lille",
|
||||||
|
filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 1000)"),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "filter of 100km from Tokyo",
|
||||||
|
filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 100000)"),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "filter of 1km from Tokyo",
|
||||||
|
filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 1000)"),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "filter of 100km from Point Nemo",
|
||||||
|
filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 100000)"),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
utils::Conf {
|
||||||
|
group_name: "filter of 1km from Point Nemo",
|
||||||
|
filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 1000)"),
|
||||||
|
..BASE_CONF
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
utils::run_benches(c, confs);
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group!(benches, bench_geo);
|
||||||
|
criterion_main!(benches);
|
@ -12,6 +12,8 @@ pub struct Conf<'a> {
|
|||||||
pub database_name: &'a str,
|
pub database_name: &'a str,
|
||||||
/// the dataset to be used, it must be an uncompressed csv
|
/// the dataset to be used, it must be an uncompressed file in the format given by `dataset_format`
|
||||||
pub dataset: &'a str,
|
pub dataset: &'a str,
|
||||||
|
/// The format of the dataset
|
||||||
|
pub dataset_format: UpdateFormat,
|
||||||
pub group_name: &'a str,
|
pub group_name: &'a str,
|
||||||
pub queries: &'a [&'a str],
|
pub queries: &'a [&'a str],
|
||||||
/// here you can change which criterion are used and in which order.
|
/// here you can change which criterion are used and in which order.
|
||||||
@ -21,6 +23,7 @@ pub struct Conf<'a> {
|
|||||||
/// the last chance to configure your database as you want
|
/// the last chance to configure your database as you want
|
||||||
pub configure: fn(&mut Settings),
|
pub configure: fn(&mut Settings),
|
||||||
pub filter: Option<&'a str>,
|
pub filter: Option<&'a str>,
|
||||||
|
pub sort: Option<Vec<&'a str>>,
|
||||||
/// enable or disable the optional words on the query
|
/// enable or disable the optional words on the query
|
||||||
pub optional_words: bool,
|
pub optional_words: bool,
|
||||||
/// primary key, if there is None we'll auto-generate docids for every documents
|
/// primary key, if there is None we'll auto-generate docids for every documents
|
||||||
@ -30,12 +33,14 @@ pub struct Conf<'a> {
|
|||||||
impl Conf<'_> {
|
impl Conf<'_> {
|
||||||
pub const BASE: Self = Conf {
|
pub const BASE: Self = Conf {
|
||||||
database_name: "benches.mmdb",
|
database_name: "benches.mmdb",
|
||||||
|
dataset_format: UpdateFormat::Csv,
|
||||||
dataset: "",
|
dataset: "",
|
||||||
group_name: "",
|
group_name: "",
|
||||||
queries: &[],
|
queries: &[],
|
||||||
criterion: None,
|
criterion: None,
|
||||||
configure: |_| (),
|
configure: |_| (),
|
||||||
filter: None,
|
filter: None,
|
||||||
|
sort: None,
|
||||||
optional_words: true,
|
optional_words: true,
|
||||||
primary_key: None,
|
primary_key: None,
|
||||||
};
|
};
|
||||||
@ -82,7 +87,7 @@ pub fn base_setup(conf: &Conf) -> Index {
|
|||||||
if let None = conf.primary_key {
|
if let None = conf.primary_key {
|
||||||
builder.enable_autogenerate_docids();
|
builder.enable_autogenerate_docids();
|
||||||
}
|
}
|
||||||
builder.update_format(UpdateFormat::Csv);
|
builder.update_format(conf.dataset_format);
|
||||||
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
let reader = File::open(conf.dataset)
|
let reader = File::open(conf.dataset)
|
||||||
.expect(&format!("could not find the dataset in: {}", conf.dataset));
|
.expect(&format!("could not find the dataset in: {}", conf.dataset));
|
||||||
@ -110,6 +115,10 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
|
|||||||
let filter = FilterCondition::from_str(&rtxn, &index, filter).unwrap();
|
let filter = FilterCondition::from_str(&rtxn, &index, filter).unwrap();
|
||||||
search.filter(filter);
|
search.filter(filter);
|
||||||
}
|
}
|
||||||
|
if let Some(sort) = &conf.sort {
|
||||||
|
let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect();
|
||||||
|
search.sort_criteria(sort);
|
||||||
|
}
|
||||||
let _ids = search.execute().unwrap();
|
let _ids = search.execute().unwrap();
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -13,6 +13,7 @@ const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/dat
|
|||||||
const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
|
const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
|
||||||
const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
|
const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
|
||||||
const DATASET_MOVIES: (&str, &str) = ("movies", "json");
|
const DATASET_MOVIES: (&str, &str) = ("movies", "json");
|
||||||
|
const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl");
|
||||||
|
|
||||||
/// The name of the environment variable used to select the path
|
/// The name of the environment variable used to select the path
|
||||||
/// of the directory containing the datasets
|
/// of the directory containing the datasets
|
||||||
@ -32,7 +33,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
)?;
|
)?;
|
||||||
writeln!(manifest_paths_file)?;
|
writeln!(manifest_paths_file)?;
|
||||||
|
|
||||||
for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
|
for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES, DATASET_GEO] {
|
||||||
let out_path = out_dir.join(dataset);
|
let out_path = out_dir.join(dataset);
|
||||||
let out_file = out_path.with_extension(extension);
|
let out_file = out_path.with_extension(extension);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user