From 5a675bcb827300632262223b0c9d16525de17320 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 2 Jul 2025 11:50:32 +0200 Subject: [PATCH] Add benchmarks --- crates/benchmarks/Cargo.toml | 5 ++ crates/benchmarks/benches/sort.rs | 108 +++++++++++++++++++++++++++++ crates/benchmarks/benches/utils.rs | 98 +++++++++++++++++++++----- 3 files changed, 195 insertions(+), 16 deletions(-) create mode 100644 crates/benchmarks/benches/sort.rs diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index 9dccc444b..68ed5aff4 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -51,3 +51,8 @@ harness = false [[bench]] name = "indexing" harness = false + +[[bench]] +name = "sort" +harness = false + diff --git a/crates/benchmarks/benches/sort.rs b/crates/benchmarks/benches/sort.rs new file mode 100644 index 000000000..0dd392cb2 --- /dev/null +++ b/crates/benchmarks/benches/sort.rs @@ -0,0 +1,108 @@ +//! This benchmark module is used to compare the performance of sorting documents in /search VS /documents +//! +//! The tests/benchmarks were designed in the context of a query returning only 20 documents. 
+ +mod datasets_paths; +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +#[cfg(not(windows))] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = + ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let sortable_fields = + ["_geo", "name", "population", "elevation", "timezone", "modification-date"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_sortable_fields(sortable_fields); +} + +#[rustfmt::skip] +const BASE_CONF: Conf = Conf { + dataset: datasets_paths::SMOL_ALL_COUNTRIES, + dataset_format: "jsonl", + configure: base_conf, + primary_key: Some("geonameid"), + queries: &[""], + offsets: &[ + Some((0, 20)), // The most common query in the real world + Some((0, 500)), // A query that ranges over many documents + Some((980, 20)), // The worst query that could happen in the real world + Some((800_000, 20)) // The worst query + ], + get_documents: true, + ..Conf::BASE +}; + +fn bench_sort(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + // utils::Conf { + // group_name: "without sort", + // sort: None, + // ..BASE_CONF + // }, + + // utils::Conf { + // group_name: "sort on many different values", + // sort: Some(vec!["name:asc"]), + // ..BASE_CONF + // }, + + // utils::Conf { + // group_name: "sort on many similar values", + // sort: Some(vec!["timezone:desc"]), + // ..BASE_CONF + // }, + + // utils::Conf { + // group_name: "sort on many similar then different values", + // sort: Some(vec!["timezone:desc", "name:asc"]), + // ..BASE_CONF + // }, + + // utils::Conf { + // group_name: "sort on many different then similar values", + // sort: Some(vec!["name:asc", "timezone:desc"]), + // ..BASE_CONF + // }, + + utils::Conf { + group_name: "geo sort", + 
sample_size: Some(10), + sort: Some(vec!["_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many similar values then geo sort", + sample_size: Some(10), + sort: Some(vec!["timezone:desc", "_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many different values then geo sort", + sample_size: Some(10), + sort: Some(vec!["name:desc", "_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_sort); +criterion_main!(benches); diff --git a/crates/benchmarks/benches/utils.rs b/crates/benchmarks/benches/utils.rs index aaa2d50a0..93fa7506f 100644 --- a/crates/benchmarks/benches/utils.rs +++ b/crates/benchmarks/benches/utils.rs @@ -9,6 +9,7 @@ use anyhow::Context; use bumpalo::Bump; use criterion::BenchmarkId; use memmap2::Mmap; +use milli::documents::sort::recursive_sort; use milli::heed::EnvOpenOptions; use milli::progress::Progress; use milli::update::new::indexer; @@ -35,6 +36,12 @@ pub struct Conf<'a> { pub configure: fn(&mut Settings), pub filter: Option<&'a str>, pub sort: Option>, + /// (offset, limit) windows to bench; `None` benches without setting an offset or limit + pub offsets: &'a [Option<(usize, usize)>], + /// additionally bench listing documents without a search query (sorted when `sort` is set) + pub get_documents: bool, + /// criterion sample size for the group; `None` keeps criterion's default + pub sample_size: Option, /// enable or disable the optional words on the query pub optional_words: bool, /// primary key, if there is None we'll auto-generate docids for every documents @@ -52,6 +59,9 @@ impl Conf<'_> { configure: |_| (), filter: None, sort: None, + offsets: &[None], + get_documents: false, + sample_size: None, optional_words: true, primary_key: None, }; @@ -144,25 +154,81 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap(); let name = format!("{}: {}", file_name, 
conf.group_name); let mut group = c.benchmark_group(&name); + if let Some(sample_size) = conf.sample_size { + group.sample_size(sample_size); + } for &query in conf.queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let mut search = index.search(&rtxn); - search.query(query).terms_matching_strategy(TermsMatchingStrategy::default()); - if let Some(filter) = conf.filter { - let filter = Filter::from_str(filter).unwrap().unwrap(); - search.filter(filter); - } - if let Some(sort) = &conf.sort { - let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); - search.sort_criteria(sort); - } - let _ids = search.execute().unwrap(); - }); - }); + for offset in conf.offsets { + let parameter = match (query.is_empty(), offset) { + (true, None) => String::from("placeholder"), + (true, Some((offset, limit))) => format!("placeholder[{offset}:{limit}]"), + (false, None) => query.to_string(), + (false, Some((offset, limit))) => format!("{query}[{offset}:{limit}]"), + }; + group.bench_with_input( + BenchmarkId::from_parameter(parameter), + &query, + |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + search + .query(query) + .terms_matching_strategy(TermsMatchingStrategy::default()); + if let Some(filter) = conf.filter { + let filter = Filter::from_str(filter).unwrap().unwrap(); + search.filter(filter); + } + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + search.sort_criteria(sort); + } + if let Some((offset, limit)) = offset { + search.offset(*offset).limit(*limit); + } + + let _ids = search.execute().unwrap(); + }); + }, + ); + } } + + if conf.get_documents { + for offset in conf.offsets { + let parameter = match offset { + None => String::from("get_documents"), + Some((offset, limit)) => format!("get_documents[{offset}:{limit}]"), + }; + 
group.bench_with_input(BenchmarkId::from_parameter(parameter), &(), |b, &()| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + let all_docs = index.documents_ids(&rtxn).unwrap(); + let facet_sort = + recursive_sort(&index, &rtxn, sort, &all_docs).unwrap(); + let iter = facet_sort.iter().unwrap(); + if let Some((offset, limit)) = offset { + let _results = iter.skip(*offset).take(*limit).collect::>(); + } else { + let _results = iter.collect::>(); + } + } else { + let all_docs = index.documents_ids(&rtxn).unwrap(); + if let Some((offset, limit)) = offset { + let _results = + all_docs.iter().skip(*offset).take(*limit).collect::>(); + } else { + let _results = all_docs.iter().collect::>(); + } + } + }); + }); + } + } + group.finish(); index.prepare_for_closing().wait();