From 5e683ba472e00421594658a2a5c90c73f26b8514 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Mon, 13 Sep 2021 18:08:28 +0200
Subject: [PATCH] add benchmarks for the geosearch

---
 .github/workflows/benchmarks.yml |   2 +-
 benchmarks/Cargo.toml            |   4 +
 benchmarks/README.md             |  32 ++++++--
 benchmarks/benches/indexing.rs   |  59 ++++++++++++++-
 benchmarks/benches/search_geo.rs | 123 +++++++++++++++++++++++++++++++
 benchmarks/benches/utils.rs      |  11 ++-
 benchmarks/build.rs              |   3 +-
 7 files changed, 222 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/benches/search_geo.rs

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index c64c6a64b..7a9fbb5de 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
     inputs:
       dataset_name:
-        description: 'The name of the dataset used to benchmark (search_songs, search_wiki or indexing)'
+        description: 'The name of the dataset used to benchmark (search_songs, search_wiki, search_geo or indexing)'
         required: false
         default: 'search_songs'
 
diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index 9e380b9a8..b598f2f6f 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -29,6 +29,10 @@ harness = false
 name = "search_wiki"
 harness = false
 
+[[bench]]
+name = "search_geo"
+harness = false
+
 [[bench]]
 name = "indexing"
 harness = false
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 16838e488..7a387dfdd 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -36,7 +36,7 @@ To run all the benchmarks (~5h):
 cargo bench
 ```
 
-To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:
+To run only the `search_songs` (~1h), `search_wiki` (~3h), `search_geo` (~20m) or `indexing` (~2h) benchmark:
 
 ```bash
 cargo bench --bench <dataset name>
@@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th
 ```bash
 mkdir ~/datasets
-MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench search_songs # the four datasets are downloaded
 touch build.rs
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
 ```
 
@@ -81,14 +81,15 @@ Run the comparison script:
 
 ## Datasets
 
-The benchmarks are available for the following datasets:
-- `songs`
-- `wiki`
+The benchmarks use the following datasets:
+- `smol-songs`
+- `smol-wiki`
 - `movies`
+- `smol-all-countries`
 
 ### Songs
 
-`songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz).
+`smol-songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz).
 
 It was generated with this command:
 
@@ -96,11 +97,11 @@ It was generated with this command:
 ```bash
 xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
 ```
 
-_[Download the generated `songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._
+_[Download the generated `smol-songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._
 
 ### Wiki
 
-`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz).
+`smol-wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz).
 
 It was generated with the following command:
@@ -108,9 +109,24 @@ It was generated with the following command:
 ```bash
 xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
 ```
 
+_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._
+
 ### Movies
 
 `movies` is a really small dataset we uses as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/)
 
 _[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._
+
+### All Countries
+
+`smol-all-countries` is a subset of the [`all-countries.csv` dataset]().
+It has been converted to jsonlines and then edited so it matches our format for the `_geo` field.
+
+It was generated with the following command:
+```bash
+bat all-countries.csv.gz | gunzip | xsv sample --seed 42 1000000 | csv2json-lite | sd '"latitude":"(.*?)","longitude":"(.*?)"' '"_geo": { "lat": $1, "lng": $2 }' | sd '\[|\]|,$' '' | gzip > smol-all-countries.jsonl.gz
+```
+
+_[Download the `smol-all-countries` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-all-countries.jsonl.gz)._
+
diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs
index bd056ea23..30532aef8 100644
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@@ -277,12 +277,69 @@ fn indexing_movies_default(c: &mut Criterion) {
     });
 }
 
+fn indexing_geo(c: &mut Criterion) {
+    let mut group = c.benchmark_group("indexing");
+    group.sample_size(10);
+    group.bench_function("Indexing geo_point", |b| {
+        b.iter_with_setup(
+            move || {
+                let index = setup_index();
+
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index.write_txn().unwrap();
+                let mut builder = update_builder.settings(&mut wtxn, &index);
+
+                builder.set_primary_key("geonameid".to_owned());
+                let displayed_fields =
+                    ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"]
+                        .iter()
+                        .map(|s| s.to_string())
+                        .collect();
+                builder.set_displayed_fields(displayed_fields);
+
+                let searchable_fields =
+                    ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect();
+                builder.set_searchable_fields(searchable_fields);
+
+                let filterable_fields =
+                    ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
+                builder.set_filterable_fields(filterable_fields);
+
+                let sortable_fields =
+                    ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
+                builder.set_sortable_fields(sortable_fields);
+
+                builder.execute(|_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+                index
+            },
+            move |index| {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index.write_txn().unwrap();
+                let mut builder = update_builder.index_documents(&mut wtxn, &index);
+
+                builder.update_format(UpdateFormat::JsonStream);
+                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+                let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!(
+                    "could not find the dataset in: {}",
+                    datasets_paths::SMOL_ALL_COUNTRIES
+                ));
+                builder.execute(reader, |_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+
+                index.prepare_for_closing().wait();
+            },
+        )
+    });
+}
+
 criterion_group!(
     benches,
     indexing_songs_default,
     indexing_songs_without_faceted_numbers,
     indexing_songs_without_faceted_fields,
     indexing_wiki,
-    indexing_movies_default
+    indexing_movies_default,
+    indexing_geo
 );
 criterion_main!(benches);
diff --git a/benchmarks/benches/search_geo.rs b/benchmarks/benches/search_geo.rs
new file mode 100644
index 000000000..1432f691b
--- /dev/null
+++ b/benchmarks/benches/search_geo.rs
@@ -0,0 +1,123 @@
+mod datasets_paths;
+mod utils;
+
+use criterion::{criterion_group, criterion_main};
+use milli::update::{Settings, UpdateFormat};
+use utils::Conf;
+
+#[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
+fn base_conf(builder: &mut Settings) {
+    let displayed_fields =
+        ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"]
+            .iter()
+            .map(|s| s.to_string())
+            .collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields =
+        ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+
+    let filterable_fields =
+        ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
+    builder.set_filterable_fields(filterable_fields);
+
+    let sortable_fields =
+        ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
+    builder.set_sortable_fields(sortable_fields);
+}
+
+#[rustfmt::skip]
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_ALL_COUNTRIES,
+    dataset_format: UpdateFormat::JsonStream,
+    queries: &[
+        "",
+    ],
+    configure: base_conf,
+    primary_key: Some("geonameid"),
+    ..Conf::BASE
+};
+
+fn bench_geo(c: &mut criterion::Criterion) {
+    #[rustfmt::skip]
+    let confs = &[
+        // A basic placeholder with no geo
+        utils::Conf {
+            group_name: "placeholder with no geo",
+            ..BASE_CONF
+        },
+        // Medium agglomeration: probably the most common use case
+        utils::Conf {
+            group_name: "asc sort from Lille",
+            sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):asc"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc sort from Lille",
+            sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):desc"]),
+            ..BASE_CONF
+        },
+        // Big agglomeration: a lot of documents close to our point
+        utils::Conf {
+            group_name: "asc sort from Tokyo",
+            sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):asc"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc sort from Tokyo",
+            sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):desc"]),
+            ..BASE_CONF
+        },
+        // The furthest point from any civilization
+        utils::Conf {
+            group_name: "asc sort from Point Nemo",
+            sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):asc"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc sort from Point Nemo",
+            sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):desc"]),
+            ..BASE_CONF
+        },
+        // Filters
+        utils::Conf {
+            group_name: "filter of 100km from Lille",
+            filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 100000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 1km from Lille",
+            filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 1000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 100km from Tokyo",
+            filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 100000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 1km from Tokyo",
+            filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 1000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 100km from Point Nemo",
+            filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 100000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 1km from Point Nemo",
+            filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 1000)"),
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_geo);
+criterion_main!(benches);
diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs
index 5318527f4..72eac59d9 100644
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -12,6 +12,8 @@ pub struct Conf<'a> {
     pub database_name: &'a str,
     /// the dataset to be used, it must be an uncompressed csv
     pub dataset: &'a str,
+    /// The format of the dataset
+    pub dataset_format: UpdateFormat,
     pub group_name: &'a str,
     pub queries: &'a [&'a str],
     /// here you can change which criterion are used and in which order.
@@ -21,6 +23,7 @@ pub struct Conf<'a> {
     /// the last chance to configure your database as you want
     pub configure: fn(&mut Settings),
     pub filter: Option<&'a str>,
+    pub sort: Option<Vec<&'a str>>,
     /// enable or disable the optional words on the query
     pub optional_words: bool,
     /// primary key, if there is None we'll auto-generate docids for every documents
@@ -30,12 +33,14 @@ impl Conf<'_> {
     pub const BASE: Self = Conf {
         database_name: "benches.mmdb",
+        dataset_format: UpdateFormat::Csv,
         dataset: "",
         group_name: "",
         queries: &[],
         criterion: None,
         configure: |_| (),
         filter: None,
+        sort: None,
         optional_words: true,
         primary_key: None,
     };
 
@@ -82,7 +87,7 @@ pub fn base_setup(conf: &Conf) -> Index {
     if let None = conf.primary_key {
         builder.enable_autogenerate_docids();
     }
-    builder.update_format(UpdateFormat::Csv);
+    builder.update_format(conf.dataset_format);
     builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
     let reader = File::open(conf.dataset)
         .expect(&format!("could not find the dataset in: {}", conf.dataset));
@@ -110,6 +115,10 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
                         let filter = FilterCondition::from_str(&rtxn, &index, filter).unwrap();
                         search.filter(filter);
                     }
+                    if let Some(sort) = &conf.sort {
+                        let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect();
+                        search.sort_criteria(sort);
+                    }
                     let _ids = search.execute().unwrap();
                 });
             });
diff --git a/benchmarks/build.rs b/benchmarks/build.rs
index 47a14f25b..2495930bb 100644
--- a/benchmarks/build.rs
+++ b/benchmarks/build.rs
@@ -13,6 +13,7 @@ const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/dat
 const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
 const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
 const DATASET_MOVIES: (&str, &str) = ("movies", "json");
+const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl");
 
 /// The name of the environment variable used to select the path
 /// of the directory containing the datasets
@@ -32,7 +33,7 @@ fn main() -> anyhow::Result<()> {
     )?;
     writeln!(manifest_paths_file)?;
 
-    for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
+    for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES, DATASET_GEO] {
         let out_path = out_dir.join(dataset);
         let out_file = out_path.with_extension(extension);
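
For reference, a minimal sketch of how to exercise the new suite locally once the patch is applied, reusing a cached copy of the datasets as the README above describes (the dataset files themselves are fetched by `build.rs` during the first compilation):

```bash
# keep the datasets outside the target directory so they survive `cargo clean`
mkdir -p ~/datasets

# run only the geosearch benchmarks; `search_geo` is the bench name declared in Cargo.toml
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench search_geo
```

On CI, the same suite can be selected by passing `search_geo` as the `dataset_name` input of the manually triggered benchmarks workflow.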