From 5a675bcb827300632262223b0c9d16525de17320 Mon Sep 17 00:00:00 2001
From: Mubelotix <simon@meilisearch.com>
Date: Wed, 2 Jul 2025 11:50:32 +0200
Subject: [PATCH] Add benchmarks

---
 crates/benchmarks/Cargo.toml       |   5 ++
 crates/benchmarks/benches/sort.rs  | 108 +++++++++++++++++++++++++++++
 crates/benchmarks/benches/utils.rs |  98 +++++++++++++++++++++-----
 3 files changed, 195 insertions(+), 16 deletions(-)
 create mode 100644 crates/benchmarks/benches/sort.rs

diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml
index 9dccc444b..68ed5aff4 100644
--- a/crates/benchmarks/Cargo.toml
+++ b/crates/benchmarks/Cargo.toml
@@ -51,3 +51,8 @@ harness = false
 [[bench]]
 name = "indexing"
 harness = false
+
+[[bench]]
+name = "sort"
+harness = false
+
diff --git a/crates/benchmarks/benches/sort.rs b/crates/benchmarks/benches/sort.rs
new file mode 100644
index 000000000..0dd392cb2
--- /dev/null
+++ b/crates/benchmarks/benches/sort.rs
@@ -0,0 +1,108 @@
+//! This benchmark module is used to compare the performance of sorting documents in /search VS /documents
+//!
+//! The tests/benchmarks were designed in the context of a query returning only 20 documents.
+
+mod datasets_paths;
+mod utils;
+
+use criterion::{criterion_group, criterion_main};
+use milli::update::Settings;
+use utils::Conf;
+
+#[cfg(not(windows))]
+#[global_allocator]
+static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+/// Applies the displayed and sortable field settings shared by every sort benchmark.
+fn base_conf(builder: &mut Settings) {
+    // Field names handed to `set_displayed_fields`.
+    const DISPLAYED: [&str; 6] =
+        ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"];
+    // Field names handed to `set_sortable_fields`.
+    const SORTABLE: [&str; 6] =
+        ["_geo", "name", "population", "elevation", "timezone", "modification-date"];
+
+    let displayed = DISPLAYED.iter().map(ToString::to_string).collect();
+    builder.set_displayed_fields(displayed);
+
+    let sortable = SORTABLE.iter().map(ToString::to_string).collect();
+    builder.set_sortable_fields(sortable);
+}
+
+#[rustfmt::skip]
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_ALL_COUNTRIES,
+    dataset_format: "jsonl",
+    configure: base_conf,
+    primary_key: Some("geonameid"),
+    queries: &[""], // Empty query: `run_benches` labels it "placeholder"
+    offsets: &[
+        Some((0, 20)), // First page of 20: the most common query in the real world
+        Some((0, 500)), // A single large page that ranges over many documents
+        Some((980, 20)), // Worst real-world query (presumably the default 1000-hit cap, TODO confirm)
+        Some((800_000, 20)) // The worst query: a page of 20 starting at offset 800_000
+    ],
+    get_documents: true, // Also bench fetching documents without running a search
+    ..Conf::BASE
+};
+
+fn bench_sort(c: &mut criterion::Criterion) {
+    #[rustfmt::skip]
+    let confs = &[
+        // utils::Conf {
+        //     group_name: "without sort",
+        //     sort: None,
+        //     ..BASE_CONF
+        // },
+
+        // utils::Conf {
+        //     group_name: "sort on many different values",
+        //     sort: Some(vec!["name:asc"]),
+        //     ..BASE_CONF
+        // },
+
+        // utils::Conf {
+        //     group_name: "sort on many similar values",
+        //     sort: Some(vec!["timezone:desc"]),
+        //     ..BASE_CONF
+        // },
+
+        // utils::Conf {
+        //     group_name: "sort on many similar then different values",
+        //     sort: Some(vec!["timezone:desc", "name:asc"]),
+        //     ..BASE_CONF
+        // },
+
+        // utils::Conf {
+        //     group_name: "sort on many different then similar values",
+        //     sort: Some(vec!["name:asc", "timezone:desc"]),
+        //     ..BASE_CONF
+        // },
+
+        utils::Conf {
+            group_name: "geo sort",
+            sample_size: Some(10),
+            sort: Some(vec!["_geoPoint(45.4777599, 9.1967508):asc"]),
+            ..BASE_CONF
+        },
+
+        utils::Conf {
+            group_name: "sort on many similar values then geo sort",
+            sample_size: Some(10),
+            sort: Some(vec!["timezone:desc", "_geoPoint(45.4777599, 9.1967508):asc"]),
+            ..BASE_CONF
+        },
+
+        utils::Conf {
+            group_name: "sort on many different values then geo sort",
+            sample_size: Some(10),
+            sort: Some(vec!["name:desc", "_geoPoint(45.4777599, 9.1967508):asc"]),
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_sort);
+criterion_main!(benches);
diff --git a/crates/benchmarks/benches/utils.rs b/crates/benchmarks/benches/utils.rs
index aaa2d50a0..93fa7506f 100644
--- a/crates/benchmarks/benches/utils.rs
+++ b/crates/benchmarks/benches/utils.rs
@@ -9,6 +9,7 @@ use anyhow::Context;
 use bumpalo::Bump;
 use criterion::BenchmarkId;
 use memmap2::Mmap;
+use milli::documents::sort::recursive_sort;
 use milli::heed::EnvOpenOptions;
 use milli::progress::Progress;
 use milli::update::new::indexer;
@@ -35,6 +36,12 @@ pub struct Conf<'a> {
     pub configure: fn(&mut Settings),
     pub filter: Option<&'a str>,
     pub sort: Option<Vec<&'a str>>,
+    /// `(offset, limit)` pairs to bench, one run per entry; `None` sets no offset or limit
+    pub offsets: &'a [Option<(usize, usize)>],
+    /// enable to also bench getting (and sorting, if `sort` is set) documents without querying
+    pub get_documents: bool,
+    /// configure the benchmark sample size
+    pub sample_size: Option<usize>,
     /// enable or disable the optional words on the query
     pub optional_words: bool,
     /// primary key, if there is None we'll auto-generate docids for every documents
@@ -52,6 +59,9 @@ impl Conf<'_> {
         configure: |_| (),
         filter: None,
         sort: None,
+        offsets: &[None],
+        get_documents: false,
+        sample_size: None,
         optional_words: true,
         primary_key: None,
     };
@@ -144,25 +154,81 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
         let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap();
         let name = format!("{}: {}", file_name, conf.group_name);
         let mut group = c.benchmark_group(&name);
+        if let Some(sample_size) = conf.sample_size {
+            group.sample_size(sample_size);
+        }
 
         for &query in conf.queries {
-            group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
-                b.iter(|| {
-                    let rtxn = index.read_txn().unwrap();
-                    let mut search = index.search(&rtxn);
-                    search.query(query).terms_matching_strategy(TermsMatchingStrategy::default());
-                    if let Some(filter) = conf.filter {
-                        let filter = Filter::from_str(filter).unwrap().unwrap();
-                        search.filter(filter);
-                    }
-                    if let Some(sort) = &conf.sort {
-                        let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect();
-                        search.sort_criteria(sort);
-                    }
-                    let _ids = search.execute().unwrap();
-                });
-            });
+            for offset in conf.offsets {
+                let parameter = match (query.is_empty(), offset) {
+                    (true, None) => String::from("placeholder"),
+                    (true, Some((offset, limit))) => format!("placeholder[{offset}:{limit}]"),
+                    (false, None) => query.to_string(),
+                    (false, Some((offset, limit))) => format!("{query}[{offset}:{limit}]"),
+                };
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(parameter),
+                    &query,
+                    |b, &query| {
+                        b.iter(|| {
+                            let rtxn = index.read_txn().unwrap();
+                            let mut search = index.search(&rtxn);
+                            search
+                                .query(query)
+                                .terms_matching_strategy(TermsMatchingStrategy::default());
+                            if let Some(filter) = conf.filter {
+                                let filter = Filter::from_str(filter).unwrap().unwrap();
+                                search.filter(filter);
+                            }
+                            if let Some(sort) = &conf.sort {
+                                let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect();
+                                search.sort_criteria(sort);
+                            }
+                            if let Some((offset, limit)) = offset {
+                                search.offset(*offset).limit(*limit);
+                            }
+
+                            let _ids = search.execute().unwrap();
+                        });
+                    },
+                );
+            }
         }
+
+        if conf.get_documents {
+            for offset in conf.offsets {
+                let parameter = match offset {
+                    None => String::from("get_documents"),
+                    Some((offset, limit)) => format!("get_documents[{offset}:{limit}]"),
+                };
+                group.bench_with_input(BenchmarkId::from_parameter(parameter), &(), |b, &()| {
+                    b.iter(|| {
+                        let rtxn = index.read_txn().unwrap();
+                        if let Some(sort) = &conf.sort {
+                            let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect();
+                            let all_docs = index.documents_ids(&rtxn).unwrap();
+                            let facet_sort =
+                                recursive_sort(&index, &rtxn, sort, &all_docs).unwrap();
+                            let iter = facet_sort.iter().unwrap();
+                            if let Some((offset, limit)) = offset {
+                                let _results = iter.skip(*offset).take(*limit).collect::<Vec<_>>();
+                            } else {
+                                let _results = iter.collect::<Vec<_>>();
+                            }
+                        } else {
+                            let all_docs = index.documents_ids(&rtxn).unwrap();
+                            if let Some((offset, limit)) = offset {
+                                let _results =
+                                    all_docs.iter().skip(*offset).take(*limit).collect::<Vec<_>>();
+                            } else {
+                                let _results = all_docs.iter().collect::<Vec<_>>();
+                            }
+                        }
+                    });
+                });
+            }
+        }
+
         group.finish();
 
         index.prepare_for_closing().wait();