From 4fdbfd6048531c0cc2666062f8fdf7325480d5a8 Mon Sep 17 00:00:00 2001 From: tamo Date: Thu, 1 Apr 2021 18:54:14 +0200 Subject: [PATCH 01/33] push a first version of the benchmark for the typo --- milli/Cargo.toml | 2 +- milli/benches/README.md | 8 ++++++ milli/benches/{search.rs => typo.rs} | 33 ++++++++++++---------- milli/benches/utils.rs | 41 ++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 milli/benches/README.md rename milli/benches/{search.rs => typo.rs} (52%) create mode 100644 milli/benches/utils.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3b25bb268..175c15679 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -61,5 +61,5 @@ rand = "0.8.3" default = [] [[bench]] -name = "search" +name = "typo" harness = false diff --git a/milli/benches/README.md b/milli/benches/README.md new file mode 100644 index 000000000..c02af0084 --- /dev/null +++ b/milli/benches/README.md @@ -0,0 +1,8 @@ +Benchmarks +========== + +For our benchmark we are using a small subset of the dataset songs.csv. 
It was generated with this command: +``` +xsv sample --seed 42 song.csv -o smol_songs.csv +``` +The original songs.csv datasets is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz) diff --git a/milli/benches/search.rs b/milli/benches/typo.rs similarity index 52% rename from milli/benches/search.rs rename to milli/benches/typo.rs index a201e241c..9fbce8038 100644 --- a/milli/benches/search.rs +++ b/milli/benches/typo.rs @@ -1,22 +1,27 @@ -use std::time::Duration; +mod utils; -use heed::EnvOpenOptions; -use milli::Index; +use std::time::Duration; use criterion::{criterion_group, criterion_main, BenchmarkId}; -fn bench_search(c: &mut criterion::Criterion) { - let database = "books-4cpu.mmdb"; +fn bench_typo(c: &mut criterion::Criterion) { + let index = utils::base_setup(Some(vec!["typo".to_string()])); + let queries = [ - "minogue kylie", - "minogue kylie live", + "mongus ", + "thelonius monk ", + "Disnaylande ", + "the white striper ", + "indochie ", + "indochien ", + "klub des loopers ", + "fear of the duck ", + "michel depech ", + "stromal ", + "dire straights ", + "Arethla Franklin ", ]; - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - options.max_readers(10); - let index = Index::new(options, database).unwrap(); - - let mut group = c.benchmark_group("search"); + let mut group = c.benchmark_group("typo"); group.sample_size(10); group.measurement_time(Duration::from_secs(12)); @@ -32,5 +37,5 @@ fn bench_search(c: &mut criterion::Criterion) { group.finish(); } -criterion_group!(benches, bench_search); +criterion_group!(benches, bench_typo); criterion_main!(benches); diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs new file mode 100644 index 000000000..23c47ea76 --- /dev/null +++ b/milli/benches/utils.rs @@ -0,0 +1,41 @@ +use std::{fs::{File, create_dir_all}}; + +use heed::EnvOpenOptions; +use milli::{Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; + +pub fn 
base_setup(criteria: Option>) -> Index { + let database = "songs.mmdb"; + create_dir_all(&database).unwrap(); + + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.max_readers(10); + let index = Index::new(options, database).unwrap(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + if let Some(criteria) = criteria { + builder.reset_faceted_fields(); + builder.reset_criteria(); + builder.reset_stop_words(); + + builder.set_criteria(criteria); + } + + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, &index); + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + // we called from cargo the current directory is supposed to be milli/milli + let reader = File::open("benches/smol_songs.csv").unwrap(); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + index +} From e425f70ef9ef14ea3242ec5f9e9f18d09d92ea55 Mon Sep 17 00:00:00 2001 From: tamo Date: Thu, 1 Apr 2021 19:27:12 +0200 Subject: [PATCH 02/33] let criterion decide how much iteration it wants to do in 10s --- milli/benches/typo.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/benches/typo.rs b/milli/benches/typo.rs index 9fbce8038..1bbe1aecb 100644 --- a/milli/benches/typo.rs +++ b/milli/benches/typo.rs @@ -22,8 +22,7 @@ fn bench_typo(c: &mut criterion::Criterion) { ]; let mut group = c.benchmark_group("typo"); - group.sample_size(10); - group.measurement_time(Duration::from_secs(12)); + group.measurement_time(Duration::from_secs(10)); for query in &queries { group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { From 
15cce89a45d7032abc1e9e622ce0ce2b200e5273 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 6 Apr 2021 16:06:49 +0200 Subject: [PATCH 03/33] update the README with instructions to get the download the dataset --- milli/benches/README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/milli/benches/README.md b/milli/benches/README.md index c02af0084..9b53fc0d1 100644 --- a/milli/benches/README.md +++ b/milli/benches/README.md @@ -3,6 +3,13 @@ Benchmarks For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command: ``` -xsv sample --seed 42 song.csv -o smol_songs.csv +xsv sample --seed 42 song.csv -o smol-songs.csv +``` +You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz) +And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). + +You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz` +You can run the following command from the root of this git repository +``` +wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz ``` -The original songs.csv datasets is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz) From 49e4cc3daf85bd4a86325988e23f68de2f64b700 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 6 Apr 2021 19:17:24 +0200 Subject: [PATCH 04/33] add the words criterion to the bench --- milli/Cargo.toml | 4 ++++ milli/benches/words.rs | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 milli/benches/words.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 175c15679..5184d028b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -63,3 +63,7 @@ default = [] [[bench]] name = "typo" harness = false + +[[bench]] +name = "words" +harness = false diff --git a/milli/benches/words.rs b/milli/benches/words.rs new file mode 100644 index 
000000000..92ca0a784 --- /dev/null +++ b/milli/benches/words.rs @@ -0,0 +1,35 @@ +mod utils; + +use std::time::Duration; +use criterion::{criterion_group, criterion_main, BenchmarkId}; + +fn bench_words(c: &mut criterion::Criterion) { + let index = utils::base_setup(Some(vec!["words".to_string()])); + + let queries = [ + "the black saint and the sinner lady and the good doggo ", // four words to pop + "les liaisons dangeureuses 1793 ", // one word to pop + "The Disneyland Children's Sing-Alone song ", // two words to pop + "seven nation mummy ", // one word to pop + "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop + "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop + "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 + ]; + + let mut group = c.benchmark_group("words"); + group.measurement_time(Duration::from_secs(10)); + + for query in &queries { + group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap(); + }); + }); + } + + group.finish(); +} + +criterion_group!(benches, bench_words); +criterion_main!(benches); From aee49bb3cd20a88c0d62c735268086d0033e1ed5 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 7 Apr 2021 11:04:53 +0200 Subject: [PATCH 05/33] add the proximity criterion --- milli/Cargo.toml | 4 ++++ milli/benches/proximity.rs | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 milli/benches/proximity.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 5184d028b..156518e19 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -67,3 +67,7 @@ harness = false [[bench]] name = "words" harness = false + +[[bench]] +name = "proximity" +harness = false diff --git a/milli/benches/proximity.rs b/milli/benches/proximity.rs new file mode 100644 
index 000000000..5b687855f --- /dev/null +++ b/milli/benches/proximity.rs @@ -0,0 +1,33 @@ +mod utils; + +use std::time::Duration; +use criterion::{criterion_group, criterion_main, BenchmarkId}; + +fn bench_proximity(c: &mut criterion::Criterion) { + let index = utils::base_setup(Some(vec!["words".to_string()])); + + let queries = [ + "black saint sinner lady ", + "les dangeureuses 1960 ", + "The Disneyland Sing-Alone song ", + "Under Great Northern Lights ", + "7000 Danses Un Jour Dans Notre Vie", + ]; + + let mut group = c.benchmark_group("proximity"); + group.measurement_time(Duration::from_secs(10)); + + for query in &queries { + group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let _documents_ids = index.search(&rtxn).query(*query).optional_words(false).execute().unwrap(); + }); + }); + } + + group.finish(); +} + +criterion_group!(benches, bench_proximity); +criterion_main!(benches); From a2bff68c1a16e780e572d2e0aa3b304abd6c47c2 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 7 Apr 2021 11:05:10 +0200 Subject: [PATCH 06/33] remove the optional words for the typo criterion --- milli/benches/typo.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/benches/typo.rs b/milli/benches/typo.rs index 1bbe1aecb..184f1e5df 100644 --- a/milli/benches/typo.rs +++ b/milli/benches/typo.rs @@ -28,7 +28,7 @@ fn bench_typo(c: &mut criterion::Criterion) { group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { b.iter(|| { let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap(); + let _documents_ids = index.search(&rtxn).query(*query).optional_words(false).execute().unwrap(); }); }); } From 3def42abd8e14ccc28b9d6e8cb622ec37034ea52 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 7 Apr 2021 11:50:38 +0200 Subject: [PATCH 07/33] merge all the criterion only benchmarks in one file --- 
milli/Cargo.toml | 10 +------ milli/benches/criterion.rs | 58 ++++++++++++++++++++++++++++++++++++++ milli/benches/proximity.rs | 33 ---------------------- milli/benches/typo.rs | 40 -------------------------- milli/benches/utils.rs | 36 ++++++++++++++++++++--- milli/benches/words.rs | 35 ----------------------- 6 files changed, 91 insertions(+), 121 deletions(-) create mode 100644 milli/benches/criterion.rs delete mode 100644 milli/benches/proximity.rs delete mode 100644 milli/benches/typo.rs delete mode 100644 milli/benches/words.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 156518e19..399b04428 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -61,13 +61,5 @@ rand = "0.8.3" default = [] [[bench]] -name = "typo" -harness = false - -[[bench]] -name = "words" -harness = false - -[[bench]] -name = "proximity" +name = "criterion" harness = false diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs new file mode 100644 index 000000000..3f0b6d6b7 --- /dev/null +++ b/milli/benches/criterion.rs @@ -0,0 +1,58 @@ +mod utils; + +use criterion::{criterion_group, criterion_main}; + +fn bench_criterion(c: &mut criterion::Criterion) { + let confs = &[ + utils::Conf { + group_name: "proximity", + queries: &[ + "black saint sinner lady ", + "les dangeureuses 1960 ", + "The Disneyland Sing-Along Chorus ", + "Under Great Northern Lights ", + "7000 Danses Un Jour Dans Notre Vie", + ], + criterion: Some(&["proximity"]), + optional_words: false, + }, + utils::Conf { + group_name: "typo", + queries: &[ + "mongus ", + "thelonius monk ", + "Disnaylande ", + "the white striper ", + "indochie ", + "indochien ", + "klub des loopers ", + "fear of the duck ", + "michel depech ", + "stromal ", + "dire straights ", + "Arethla Franklin ", + ], + criterion: Some(&["typo"]), + optional_words: false, + }, + utils::Conf { + group_name: "words", + queries: &[ + "the black saint and the sinner lady and the good doggo ", // four words to pop + "les liaisons 
dangeureuses 1793 ", // one word to pop + "The Disneyland Children's Sing-Alone song ", // two words to pop + "seven nation mummy ", // one word to pop + "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop + "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop + "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 + ], + criterion: Some(&["words"]), + optional_words: true, + } + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_criterion); +criterion_main!(benches); diff --git a/milli/benches/proximity.rs b/milli/benches/proximity.rs deleted file mode 100644 index 5b687855f..000000000 --- a/milli/benches/proximity.rs +++ /dev/null @@ -1,33 +0,0 @@ -mod utils; - -use std::time::Duration; -use criterion::{criterion_group, criterion_main, BenchmarkId}; - -fn bench_proximity(c: &mut criterion::Criterion) { - let index = utils::base_setup(Some(vec!["words".to_string()])); - - let queries = [ - "black saint sinner lady ", - "les dangeureuses 1960 ", - "The Disneyland Sing-Alone song ", - "Under Great Northern Lights ", - "7000 Danses Un Jour Dans Notre Vie", - ]; - - let mut group = c.benchmark_group("proximity"); - group.measurement_time(Duration::from_secs(10)); - - for query in &queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).optional_words(false).execute().unwrap(); - }); - }); - } - - group.finish(); -} - -criterion_group!(benches, bench_proximity); -criterion_main!(benches); diff --git a/milli/benches/typo.rs b/milli/benches/typo.rs deleted file mode 100644 index 184f1e5df..000000000 --- a/milli/benches/typo.rs +++ /dev/null @@ -1,40 +0,0 @@ -mod utils; - -use std::time::Duration; -use criterion::{criterion_group, criterion_main, BenchmarkId}; - -fn bench_typo(c: &mut 
criterion::Criterion) { - let index = utils::base_setup(Some(vec!["typo".to_string()])); - - let queries = [ - "mongus ", - "thelonius monk ", - "Disnaylande ", - "the white striper ", - "indochie ", - "indochien ", - "klub des loopers ", - "fear of the duck ", - "michel depech ", - "stromal ", - "dire straights ", - "Arethla Franklin ", - ]; - - let mut group = c.benchmark_group("typo"); - group.measurement_time(Duration::from_secs(10)); - - for query in &queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).optional_words(false).execute().unwrap(); - }); - }); - } - - group.finish(); -} - -criterion_group!(benches, bench_typo); -criterion_main!(benches); diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 23c47ea76..c608a3ef3 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -1,9 +1,17 @@ -use std::{fs::{File, create_dir_all}}; +use std::{fs::{File, create_dir_all}, time::Duration}; use heed::EnvOpenOptions; +use criterion::BenchmarkId; use milli::{Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; -pub fn base_setup(criteria: Option>) -> Index { +pub struct Conf<'a> { + pub group_name: &'a str, + pub queries: &'a[&'a str], + pub criterion: Option<&'a [&'a str]>, + pub optional_words: bool, +} + +pub fn base_setup(criterion: Option>) -> Index { let database = "songs.mmdb"; create_dir_all(&database).unwrap(); @@ -16,12 +24,12 @@ pub fn base_setup(criteria: Option>) -> Index { let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); - if let Some(criteria) = criteria { + if let Some(criterion) = criterion { builder.reset_faceted_fields(); builder.reset_criteria(); builder.reset_stop_words(); - builder.set_criteria(criteria); + builder.set_criteria(criterion); } builder.execute(|_, _| ()).unwrap(); @@ -39,3 +47,23 @@ pub 
fn base_setup(criteria: Option>) -> Index { index } + +pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { + for conf in confs { + let criterion = conf.criterion.map(|s| s.iter().map(|s| s.to_string()).collect()); + let index = base_setup(criterion); + + let mut group = c.benchmark_group(conf.group_name); + group.measurement_time(Duration::from_secs(10)); + + for &query in conf.queries { + group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let _documents_ids = index.search(&rtxn).query(query).optional_words(conf.optional_words).execute().unwrap(); + }); + }); + } + group.finish(); + } +} diff --git a/milli/benches/words.rs b/milli/benches/words.rs deleted file mode 100644 index 92ca0a784..000000000 --- a/milli/benches/words.rs +++ /dev/null @@ -1,35 +0,0 @@ -mod utils; - -use std::time::Duration; -use criterion::{criterion_group, criterion_main, BenchmarkId}; - -fn bench_words(c: &mut criterion::Criterion) { - let index = utils::base_setup(Some(vec!["words".to_string()])); - - let queries = [ - "the black saint and the sinner lady and the good doggo ", // four words to pop - "les liaisons dangeureuses 1793 ", // one word to pop - "The Disneyland Children's Sing-Alone song ", // two words to pop - "seven nation mummy ", // one word to pop - "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop - "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop - "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 - ]; - - let mut group = c.benchmark_group("words"); - group.measurement_time(Duration::from_secs(10)); - - for query in &queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap(); - }); - }); 
- } - - group.finish(); -} - -criterion_group!(benches, bench_words); -criterion_main!(benches); From ea0c6d8c401a3ee37c14a62878e4b1641e08d726 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 10:44:27 +0200 Subject: [PATCH 08/33] add a bunch of queries and start the introduction of the filters and the new dataset --- milli/benches/criterion.rs | 48 ++++++++++++++++++++++++++++++-- milli/benches/normal_search.rs | 51 ++++++++++++++++++++++++++++++++++ milli/benches/utils.rs | 43 ++++++++++++++++++++++------ 3 files changed, 132 insertions(+), 10 deletions(-) create mode 100644 milli/benches/normal_search.rs diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs index 3f0b6d6b7..bdfe3d478 100644 --- a/milli/benches/criterion.rs +++ b/milli/benches/criterion.rs @@ -3,6 +3,24 @@ mod utils; use criterion::{criterion_group, criterion_main}; fn bench_criterion(c: &mut criterion::Criterion) { + let songs_base_queries = &[ + "mingus ", + "thelonious monk ", + "Disneyland ", + "the white stripes ", + "indochine ", + "klub des loosers ", + "fear of the dark ", + "michel delpech ", + "stromae ", + "dire straits ", + "aretha franklin ", + ]; + let default_criterion: Vec = milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); + let default_criterion = default_criterion.iter().map(|s| s.as_str()); + let asc_default: Vec<&str> = std::iter::once("asc").chain(default_criterion.clone()).collect(); + let desc_default: Vec<&str> = std::iter::once("desc").chain(default_criterion.clone()).collect(); + let confs = &[ utils::Conf { group_name: "proximity", @@ -15,6 +33,7 @@ fn bench_criterion(c: &mut criterion::Criterion) { ], criterion: Some(&["proximity"]), optional_words: false, + ..utils::Conf::BASE }, utils::Conf { group_name: "typo", @@ -34,6 +53,7 @@ fn bench_criterion(c: &mut criterion::Criterion) { ], criterion: Some(&["typo"]), optional_words: false, + ..utils::Conf::BASE }, utils::Conf { group_name: "words", @@ -47,8 +67,32 @@ 
fn bench_criterion(c: &mut criterion::Criterion) { "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 ], criterion: Some(&["words"]), - optional_words: true, - } + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "asc", + queries: songs_base_queries, + criterion: Some(&["asc"]), + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "desc", + queries: songs_base_queries, + criterion: Some(&["desc"]), + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "asc + default", + queries: songs_base_queries, + criterion: Some(&asc_default[..]), + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "desc + default", + queries: songs_base_queries, + criterion: Some(&desc_default[..]), + ..utils::Conf::BASE + }, ]; utils::run_benches(c, confs); diff --git a/milli/benches/normal_search.rs b/milli/benches/normal_search.rs new file mode 100644 index 000000000..39a343cf0 --- /dev/null +++ b/milli/benches/normal_search.rs @@ -0,0 +1,51 @@ +mod utils; + +use criterion::{criterion_group, criterion_main}; + +fn bench_normal(c: &mut criterion::Criterion) { + let confs = &[ + utils::Conf { + group_name: "basic placeholder", + queries: &[ + "", + ], + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "basic without quote", + queries: &[ + "david bowie", // 1200 + "michael jackson", // 600 + "marcus miller", // 60 + "Notstandskomitee", // 4 + ], + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "basic with quote", + queries: &[ + "\"david\" \"bowie\"", // 1200 + "\"michael\" \"jackson\"", // 600 + "\"marcus\" \"miller\"", // 60 + "\"Notstandskomitee\"", // 4 + ], + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "prefix search", + queries: &[ + "s", // 500k+ results + "a", + "b", + "i", + "x", // only 7k results + ], + ..utils::Conf::BASE + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_normal); +criterion_main!(benches); diff --git a/milli/benches/utils.rs 
b/milli/benches/utils.rs index c608a3ef3..6c8360fe2 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -1,18 +1,40 @@ -use std::{fs::{File, create_dir_all}, time::Duration}; +use std::{fs::{File, create_dir_all, remove_dir_all}, time::Duration}; use heed::EnvOpenOptions; use criterion::BenchmarkId; -use milli::{Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; +use milli::{FacetCondition, Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; pub struct Conf<'a> { + /// where we are going to create our database.mmdb directory + /// each benchmark will first try to delete it and then recreate it + pub database_name: &'a str, + /// the dataset to be used, it must be an uncompressed csv + pub dataset: &'a str, pub group_name: &'a str, pub queries: &'a[&'a str], pub criterion: Option<&'a [&'a str]>, + pub facet_condition: Option, pub optional_words: bool, } -pub fn base_setup(criterion: Option>) -> Index { - let database = "songs.mmdb"; +impl Conf<'_> { + pub const BASE: Self = Conf { + database_name: "benches.mmdb", + dataset: "", + group_name: "", + queries: &[], + criterion: None, + facet_condition: None, + optional_words: true, + }; +} + +pub fn base_setup(database: &str, dataset: &str, criterion: Option>) -> Index { + match remove_dir_all(&database) { + Ok(_) => (), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), + Err(e) => panic!("{}", e), + } create_dir_all(&database).unwrap(); let mut options = EnvOpenOptions::new(); @@ -41,7 +63,7 @@ pub fn base_setup(criterion: Option>) -> Index { builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli - let reader = File::open("benches/smol_songs.csv").unwrap(); + let reader = File::open(dataset).unwrap(); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -51,16 +73,21 @@ pub fn base_setup(criterion: Option>) 
-> Index { pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { for conf in confs { let criterion = conf.criterion.map(|s| s.iter().map(|s| s.to_string()).collect()); - let index = base_setup(criterion); + let index = base_setup(conf.database_name, conf.dataset, criterion); - let mut group = c.benchmark_group(conf.group_name); + let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); group.measurement_time(Duration::from_secs(10)); for &query in conf.queries { group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { b.iter(|| { let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(query).optional_words(conf.optional_words).execute().unwrap(); + let mut search = index.search(&rtxn); + search.query(query).optional_words(conf.optional_words); + if let Some(facet_condition) = conf.facet_condition.clone() { + search.facet_condition(facet_condition); + } + let _ids = search.execute().unwrap(); }); }); } From 4b78ef31b649a32f9d5274f872413c26b7b40910 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 11:40:16 +0200 Subject: [PATCH 09/33] add the configuration of the searchable fields and displayed fields and a default configuration for the songs --- milli/benches/criterion.rs | 14 +++++----- milli/benches/normal_search.rs | 8 +++--- milli/benches/utils.rs | 51 ++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs index bdfe3d478..fb79a597d 100644 --- a/milli/benches/criterion.rs +++ b/milli/benches/criterion.rs @@ -33,7 +33,7 @@ fn bench_criterion(c: &mut criterion::Criterion) { ], criterion: Some(&["proximity"]), optional_words: false, - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "typo", @@ -53,7 +53,7 @@ fn bench_criterion(c: &mut criterion::Criterion) { ], criterion: Some(&["typo"]), optional_words: false, - ..utils::Conf::BASE + 
..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "words", @@ -67,31 +67,31 @@ fn bench_criterion(c: &mut criterion::Criterion) { "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 ], criterion: Some(&["words"]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "asc", queries: songs_base_queries, criterion: Some(&["asc"]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "desc", queries: songs_base_queries, criterion: Some(&["desc"]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "asc + default", queries: songs_base_queries, criterion: Some(&asc_default[..]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "desc + default", queries: songs_base_queries, criterion: Some(&desc_default[..]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, ]; diff --git a/milli/benches/normal_search.rs b/milli/benches/normal_search.rs index 39a343cf0..bd57a8c45 100644 --- a/milli/benches/normal_search.rs +++ b/milli/benches/normal_search.rs @@ -9,7 +9,7 @@ fn bench_normal(c: &mut criterion::Criterion) { queries: &[ "", ], - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "basic without quote", @@ -19,7 +19,7 @@ fn bench_normal(c: &mut criterion::Criterion) { "marcus miller", // 60 "Notstandskomitee", // 4 ], - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "basic with quote", @@ -29,7 +29,7 @@ fn bench_normal(c: &mut criterion::Criterion) { "\"marcus\" \"miller\"", // 60 "\"Notstandskomitee\"", // 4 ], - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "prefix search", @@ -40,7 +40,7 @@ fn bench_normal(c: &mut criterion::Criterion) { "i", "x", // only 7k results ], - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, ]; diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 6c8360fe2..2eb067a02 100644 --- 
a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -2,7 +2,7 @@ use std::{fs::{File, create_dir_all, remove_dir_all}, time::Duration}; use heed::EnvOpenOptions; use criterion::BenchmarkId; -use milli::{FacetCondition, Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; +use milli::{FacetCondition, Index, update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}}; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory @@ -12,48 +12,82 @@ pub struct Conf<'a> { pub dataset: &'a str, pub group_name: &'a str, pub queries: &'a[&'a str], + /// here you can change which criterion are used and in which order. + /// - if you specify something all the base configuration will be thrown out + /// - if you don't specify anything (None) the default configuration will be kept pub criterion: Option<&'a [&'a str]>, + /// the last chance to configure your database as you want + pub configure: fn(&mut Settings), pub facet_condition: Option, + /// enable or disable the optional words on the query pub optional_words: bool, } impl Conf<'_> { + fn nop(_builder: &mut Settings) {} + + fn songs_conf(builder: &mut Settings) { + let displayed_fields = [ + "id", "title", "album", "artist", "genre", "country", "released", "duration", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_searchable_fields(searchable_fields); + } + pub const BASE: Self = Conf { database_name: "benches.mmdb", dataset: "", group_name: "", queries: &[], criterion: None, + configure: Self::nop, facet_condition: None, optional_words: true, }; + + pub const BASE_SONGS: Self = Conf { + dataset: "smol-songs", + configure: Self::songs_conf, + ..Self::BASE + }; } -pub fn base_setup(database: &str, dataset: &str, criterion: Option>) -> Index { - match remove_dir_all(&database) { +pub fn 
base_setup(conf: &Conf) -> Index { + match remove_dir_all(&conf.database_name) { Ok(_) => (), Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), Err(e) => panic!("{}", e), } - create_dir_all(&database).unwrap(); + create_dir_all(&conf.database_name).unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(10); - let index = Index::new(options, database).unwrap(); + let index = Index::new(options, conf.database_name).unwrap(); let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); - if let Some(criterion) = criterion { + if let Some(criterion) = conf.criterion { builder.reset_faceted_fields(); builder.reset_criteria(); builder.reset_stop_words(); + let criterion = criterion.iter().map(|s| s.to_string()).collect(); builder.set_criteria(criterion); } + (conf.configure)(&mut builder); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -63,7 +97,7 @@ pub fn base_setup(database: &str, dataset: &str, criterion: Option>) builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli - let reader = File::open(dataset).unwrap(); + let reader = File::open(conf.dataset).unwrap(); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -72,8 +106,7 @@ pub fn base_setup(database: &str, dataset: &str, criterion: Option>) pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { for conf in confs { - let criterion = conf.criterion.map(|s| s.iter().map(|s| s.to_string()).collect()); - let index = base_setup(conf.database_name, conf.dataset, criterion); + let index = base_setup(conf); let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); group.measurement_time(Duration::from_secs(10)); From 
136efd6b534dc864e7b61efe478c37c7bc5ee7ee Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 14:26:08 +0200 Subject: [PATCH 10/33] fix the benches --- milli/benches/criterion.rs | 8 ++++---- milli/benches/utils.rs | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs index fb79a597d..3049557f0 100644 --- a/milli/benches/criterion.rs +++ b/milli/benches/criterion.rs @@ -18,8 +18,8 @@ fn bench_criterion(c: &mut criterion::Criterion) { ]; let default_criterion: Vec = milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); let default_criterion = default_criterion.iter().map(|s| s.as_str()); - let asc_default: Vec<&str> = std::iter::once("asc").chain(default_criterion.clone()).collect(); - let desc_default: Vec<&str> = std::iter::once("desc").chain(default_criterion.clone()).collect(); + let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); + let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); let confs = &[ utils::Conf { @@ -72,13 +72,13 @@ fn bench_criterion(c: &mut criterion::Criterion) { utils::Conf { group_name: "asc", queries: songs_base_queries, - criterion: Some(&["asc"]), + criterion: Some(&["asc(released-timestamp)"]), ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "desc", queries: songs_base_queries, - criterion: Some(&["desc"]), + criterion: Some(&["desc(released-timestamp)"]), ..utils::Conf::BASE_SONGS }, utils::Conf { diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 2eb067a02..9b58b54b8 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -40,6 +40,18 @@ impl Conf<'_> { .map(|s| s.to_string()) .collect(); builder.set_searchable_fields(searchable_fields); + + let faceted_fields = [ + ("released-timestamp", "integer"), + ("duration-float", "float"), + ("genre", "string"), + 
("country", "string"), + ("artist", "string"), + ] + .iter() + .map(|(a, b)| (a.to_string(), b.to_string())) + .collect(); + builder.set_faceted_fields(faceted_fields); } pub const BASE: Self = Conf { @@ -54,7 +66,7 @@ impl Conf<'_> { }; pub const BASE_SONGS: Self = Conf { - dataset: "smol-songs", + dataset: "smol-songs.csv", configure: Self::songs_conf, ..Self::BASE }; @@ -97,7 +109,8 @@ pub fn base_setup(conf: &Conf) -> Index { builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli - let reader = File::open(conf.dataset).unwrap(); + let dataset_path = format!("benches/{}", conf.dataset); + let reader = File::open(&dataset_path).expect(&format!("could not find the dataset in: {}", &dataset_path)); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); From 5132a106a160641513084f6a880bf7ba09a03d18 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 18:34:00 +0200 Subject: [PATCH 11/33] refactorize everything related to the songs dataset in a songs benchmark file --- milli/Cargo.toml | 2 +- milli/benches/criterion.rs | 102 ------------------ milli/benches/normal_search.rs | 51 --------- milli/benches/songs.rs | 185 +++++++++++++++++++++++++++++++++ milli/benches/utils.rs | 33 ------ 5 files changed, 186 insertions(+), 187 deletions(-) delete mode 100644 milli/benches/criterion.rs delete mode 100644 milli/benches/normal_search.rs create mode 100644 milli/benches/songs.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 399b04428..2bdb3f4dc 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -61,5 +61,5 @@ rand = "0.8.3" default = [] [[bench]] -name = "criterion" +name = "songs" harness = false diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs deleted file mode 100644 index 3049557f0..000000000 --- a/milli/benches/criterion.rs +++ /dev/null @@ -1,102 +0,0 @@ -mod utils; - -use 
criterion::{criterion_group, criterion_main}; - -fn bench_criterion(c: &mut criterion::Criterion) { - let songs_base_queries = &[ - "mingus ", - "thelonious monk ", - "Disneyland ", - "the white stripes ", - "indochine ", - "klub des loosers ", - "fear of the dark ", - "michel delpech ", - "stromae ", - "dire straits ", - "aretha franklin ", - ]; - let default_criterion: Vec = milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); - let default_criterion = default_criterion.iter().map(|s| s.as_str()); - let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); - let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); - - let confs = &[ - utils::Conf { - group_name: "proximity", - queries: &[ - "black saint sinner lady ", - "les dangeureuses 1960 ", - "The Disneyland Sing-Along Chorus ", - "Under Great Northern Lights ", - "7000 Danses Un Jour Dans Notre Vie", - ], - criterion: Some(&["proximity"]), - optional_words: false, - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "typo", - queries: &[ - "mongus ", - "thelonius monk ", - "Disnaylande ", - "the white striper ", - "indochie ", - "indochien ", - "klub des loopers ", - "fear of the duck ", - "michel depech ", - "stromal ", - "dire straights ", - "Arethla Franklin ", - ], - criterion: Some(&["typo"]), - optional_words: false, - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "words", - queries: &[ - "the black saint and the sinner lady and the good doggo ", // four words to pop - "les liaisons dangeureuses 1793 ", // one word to pop - "The Disneyland Children's Sing-Alone song ", // two words to pop - "seven nation mummy ", // one word to pop - "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop - "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop - "whathavenotnsuchforth and 
then a good amount of words tot pop in order to match the first one ", // 16 - ], - criterion: Some(&["words"]), - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "asc", - queries: songs_base_queries, - criterion: Some(&["asc(released-timestamp)"]), - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "desc", - queries: songs_base_queries, - criterion: Some(&["desc(released-timestamp)"]), - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "asc + default", - queries: songs_base_queries, - criterion: Some(&asc_default[..]), - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "desc + default", - queries: songs_base_queries, - criterion: Some(&desc_default[..]), - ..utils::Conf::BASE_SONGS - }, - ]; - - utils::run_benches(c, confs); -} - -criterion_group!(benches, bench_criterion); -criterion_main!(benches); diff --git a/milli/benches/normal_search.rs b/milli/benches/normal_search.rs deleted file mode 100644 index bd57a8c45..000000000 --- a/milli/benches/normal_search.rs +++ /dev/null @@ -1,51 +0,0 @@ -mod utils; - -use criterion::{criterion_group, criterion_main}; - -fn bench_normal(c: &mut criterion::Criterion) { - let confs = &[ - utils::Conf { - group_name: "basic placeholder", - queries: &[ - "", - ], - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "basic without quote", - queries: &[ - "david bowie", // 1200 - "michael jackson", // 600 - "marcus miller", // 60 - "Notstandskomitee", // 4 - ], - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "basic with quote", - queries: &[ - "\"david\" \"bowie\"", // 1200 - "\"michael\" \"jackson\"", // 600 - "\"marcus\" \"miller\"", // 60 - "\"Notstandskomitee\"", // 4 - ], - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "prefix search", - queries: &[ - "s", // 500k+ results - "a", - "b", - "i", - "x", // only 7k results - ], - ..utils::Conf::BASE_SONGS - }, - ]; - - utils::run_benches(c, confs); -} - -criterion_group!(benches, 
bench_normal); -criterion_main!(benches); diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs new file mode 100644 index 000000000..586b8d4ef --- /dev/null +++ b/milli/benches/songs.rs @@ -0,0 +1,185 @@ +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = [ + "id", "title", "album", "artist", "genre", "country", "released", "duration", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = [ + ("released-timestamp", "integer"), + ("duration-float", "float"), + ("genre", "string"), + ("country", "string"), + ("artist", "string"), + ] + .iter() + .map(|(a, b)| (a.to_string(), b.to_string())) + .collect(); + builder.set_faceted_fields(faceted_fields); +} + +const BASE_CONF: Conf = Conf { + dataset: "smol-songs.csv", + queries: &[ + "mingus ", + "thelonious monk ", + "Disneyland ", + "the white stripes ", + "indochine ", + "klub des loosers ", + "fear of the dark ", + "michel delpech ", + "stromae ", + "dire straits ", + "aretha franklin ", + ], + configure: base_conf, + ..Conf::BASE +}; + +fn bench_songs(c: &mut criterion::Criterion) { + let default_criterion: Vec = milli::default_criteria() + .iter() + .map(|criteria| criteria.to_string()) + .collect(); + let default_criterion = default_criterion.iter().map(|s| s.as_str()); + let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)") + .chain(default_criterion.clone()) + .collect(); + let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)") + .chain(default_criterion.clone()) + .collect(); + + let confs = &[ + /* first we bench each criterion alone */ + utils::Conf { + group_name: "proximity", + queries: &[ + "black saint 
sinner lady ", + "les dangeureuses 1960 ", + "The Disneyland Sing-Along Chorus ", + "Under Great Northern Lights ", + "7000 Danses Un Jour Dans Notre Vie", + ], + criterion: Some(&["proximity"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "typo", + queries: &[ + "mongus ", + "thelonius monk ", + "Disnaylande ", + "the white striper ", + "indochie ", + "indochien ", + "klub des loopers ", + "fear of the duck ", + "michel depech ", + "stromal ", + "dire straights ", + "Arethla Franklin ", + ], + criterion: Some(&["typo"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "words", + queries: &[ + "the black saint and the sinner lady and the good doggo ", // four words to pop + "les liaisons dangeureuses 1793 ", // one word to pop + "The Disneyland Children's Sing-Alone song ", // two words to pop + "seven nation mummy ", // one word to pop + "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop + "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop + "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 16 + ], + criterion: Some(&["words"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "asc", + criterion: Some(&["asc(released-timestamp)"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc", + criterion: Some(&["desc(released-timestamp)"]), + ..BASE_CONF + }, + + /* then we bench the asc and desc criterion on top of the default criterion */ + utils::Conf { + group_name: "asc + default", + criterion: Some(&asc_default[..]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc + default", + criterion: Some(&desc_default[..]), + ..BASE_CONF + }, + + /* the we bench some global / normal search with all the default criterion in the default + * order */ + utils::Conf { + group_name: "basic placeholder", + queries: &[ + "", + ], + ..BASE_CONF + }, + utils::Conf { + group_name: "basic without quote", + queries: &[ 
+ "david bowie", // 1200 + "michael jackson", // 600 + "marcus miller", // 60 + "Notstandskomitee", // 4 + ], + ..BASE_CONF + }, + utils::Conf { + group_name: "basic with quote", + queries: &[ + "\"david\" \"bowie\"", // 1200 + "\"michael\" \"jackson\"", // 600 + "\"marcus\" \"miller\"", // 60 + "\"Notstandskomitee\"", // 4 + ], + ..BASE_CONF + }, + utils::Conf { + group_name: "prefix search", + queries: &[ + "s", // 500k+ results + "a", + "b", + "i", + "x", // only 7k results + ], + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_songs); +criterion_main!(benches); diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 9b58b54b8..b101adb63 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -26,34 +26,6 @@ pub struct Conf<'a> { impl Conf<'_> { fn nop(_builder: &mut Settings) {} - fn songs_conf(builder: &mut Settings) { - let displayed_fields = [ - "id", "title", "album", "artist", "genre", "country", "released", "duration", - ] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "album", "artist"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = [ - ("released-timestamp", "integer"), - ("duration-float", "float"), - ("genre", "string"), - ("country", "string"), - ("artist", "string"), - ] - .iter() - .map(|(a, b)| (a.to_string(), b.to_string())) - .collect(); - builder.set_faceted_fields(faceted_fields); - } - pub const BASE: Self = Conf { database_name: "benches.mmdb", dataset: "", @@ -65,11 +37,6 @@ impl Conf<'_> { optional_words: true, }; - pub const BASE_SONGS: Self = Conf { - dataset: "smol-songs.csv", - configure: Self::songs_conf, - ..Self::BASE - }; } pub fn base_setup(conf: &Conf) -> Index { From beae84376658257f5a538aced80e3b5898f4f022 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 18:39:34 +0200 Subject: 
[PATCH 12/33] add a missing space --- milli/benches/songs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index 586b8d4ef..71bc164ab 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -73,7 +73,7 @@ fn bench_songs(c: &mut criterion::Criterion) { "les dangeureuses 1960 ", "The Disneyland Sing-Along Chorus ", "Under Great Northern Lights ", - "7000 Danses Un Jour Dans Notre Vie", + "7000 Danses Un Jour Dans Notre Vie ", ], criterion: Some(&["proximity"]), optional_words: false, From d0b44c380f6bb98aba0c57ee32910d4d6f71a948 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 12:09:51 +0200 Subject: [PATCH 13/33] add benchmarks on a wiki dataset --- milli/Cargo.toml | 4 ++ milli/benches/wiki.rs | 127 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 milli/benches/wiki.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2bdb3f4dc..1c0f74613 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -63,3 +63,7 @@ default = [] [[bench]] name = "songs" harness = false + +[[bench]] +name = "wiki" +harness = false diff --git a/milli/benches/wiki.rs b/milli/benches/wiki.rs new file mode 100644 index 000000000..fc8af02e5 --- /dev/null +++ b/milli/benches/wiki.rs @@ -0,0 +1,127 @@ +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = ["title", "body", "url"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); +} + +const BASE_CONF: Conf = Conf { + dataset: "smol-wiki-articles.csv", + queries: &[ + "mingus ", // 46 candidates + "miles davis ", // 159 + "rock and roll ", // 1007 + "machine ", // 3448 + "spain ", // 7002 + "japan ", 
// 10.593 + "france ", // 17.616 + "film ", // 24.959 + ], + configure: base_conf, + ..Conf::BASE +}; + +fn bench_songs(c: &mut criterion::Criterion) { + let basic_with_quote: Vec = BASE_CONF + .queries + .iter() + .map(|s| { + s.trim() + .split(' ') + .map(|s| format!(r#""{}""#, s)).collect::>().join(" ") + }) + .collect(); + let basic_with_quote: &[&str] = &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + + let confs = &[ + /* first we bench each criterion alone */ + utils::Conf { + group_name: "proximity", + queries: &[ + "herald sings ", + "april paris ", + "tea two ", + "diesel engine ", + ], + criterion: Some(&["proximity"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "typo", + queries: &[ + "migrosoft ", + "linax ", + "Disnaylande ", + "phytogropher ", + "nympalidea ", + "aritmetric ", + "the fronce ", + "sisan ", + ], + criterion: Some(&["typo"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "words", + queries: &[ + "the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results + "Kameya Tokujirō mingus monk ", // two words to pop, 55 + "Ulrich Hensel meilisearch milli ", // two words to pop, 306 + "Idaho Bellevue pizza ", // one word to pop, 800 + "Abraham machin ", // one word to pop, 1141 + ], + criterion: Some(&["words"]), + ..BASE_CONF + }, + /* the we bench some global / normal search with all the default criterion in the default + * order */ + utils::Conf { + group_name: "basic placeholder", + queries: &[""], + ..BASE_CONF + }, + utils::Conf { + group_name: "basic without quote", + queries: &BASE_CONF + .queries + .iter() + .map(|s| s.trim()) // we remove the space at the end of each request + .collect::>(), + ..BASE_CONF + }, + utils::Conf { + group_name: "basic with quote", + queries: basic_with_quote, + ..BASE_CONF + }, + utils::Conf { + group_name: "prefix search", + queries: &[ + "t", // 453k results + "c", // 405k + "g", // 318k + "j", // 227k + 
"q", // 71k + "x", // 17k + ], + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_songs); +criterion_main!(benches); From 7086009f9350b2ef72c126a2b0ffa342b405869b Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 12:36:12 +0200 Subject: [PATCH 14/33] improve the base search --- milli/benches/songs.rs | 52 +++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index 71bc164ab..f6e36262d 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -35,17 +35,16 @@ fn base_conf(builder: &mut Settings) { const BASE_CONF: Conf = Conf { dataset: "smol-songs.csv", queries: &[ - "mingus ", - "thelonious monk ", - "Disneyland ", - "the white stripes ", - "indochine ", - "klub des loosers ", - "fear of the dark ", - "michel delpech ", - "stromae ", - "dire straits ", - "aretha franklin ", + "john ", // 9097 + "david ", // 4794 + "charles ", // 1957 + "david bowie ", // 1200 + "michael jackson ", // 600 + "thelonious monk ", // 303 + "charles mingus ", // 142 + "marcus miller ", // 60 + "tamo ", // 13 + "Notstandskomitee ", // 4 ], configure: base_conf, ..Conf::BASE @@ -64,6 +63,17 @@ fn bench_songs(c: &mut criterion::Criterion) { .chain(default_criterion.clone()) .collect(); + let basic_with_quote: Vec = BASE_CONF + .queries + .iter() + .map(|s| { + s.trim() + .split(' ') + .map(|s| format!(r#""{}""#, s)).collect::>().join(" ") + }) + .collect(); + let basic_with_quote: &[&str] = &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + let confs = &[ /* first we bench each criterion alone */ utils::Conf { @@ -108,7 +118,7 @@ fn bench_songs(c: &mut criterion::Criterion) { "seven nation mummy ", // one word to pop "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop - "whathavenotnsuchforth and a 
good amount of words to pop to match the first one ", // 16 + "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13 ], criterion: Some(&["words"]), ..BASE_CONF @@ -147,22 +157,16 @@ fn bench_songs(c: &mut criterion::Criterion) { }, utils::Conf { group_name: "basic without quote", - queries: &[ - "david bowie", // 1200 - "michael jackson", // 600 - "marcus miller", // 60 - "Notstandskomitee", // 4 - ], + queries: &BASE_CONF + .queries + .iter() + .map(|s| s.trim()) // we remove the space at the end of each request + .collect::>(), ..BASE_CONF }, utils::Conf { group_name: "basic with quote", - queries: &[ - "\"david\" \"bowie\"", // 1200 - "\"michael\" \"jackson\"", // 600 - "\"marcus\" \"miller\"", // 60 - "\"Notstandskomitee\"", // 4 - ], + queries: basic_with_quote, ..BASE_CONF }, utils::Conf { From 5d5d11560890bf27e4254686d6ba44c3aab5afcc Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 13:13:33 +0200 Subject: [PATCH 15/33] reformat all the files --- milli/benches/songs.rs | 48 ++++++++++++++++++++---------------------- milli/benches/utils.rs | 18 ++++++++++------ milli/benches/wiki.rs | 9 ++++++-- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index f6e36262d..0c056d93f 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -35,15 +35,15 @@ fn base_conf(builder: &mut Settings) { const BASE_CONF: Conf = Conf { dataset: "smol-songs.csv", queries: &[ - "john ", // 9097 - "david ", // 4794 - "charles ", // 1957 - "david bowie ", // 1200 - "michael jackson ", // 600 - "thelonious monk ", // 303 - "charles mingus ", // 142 - "marcus miller ", // 60 - "tamo ", // 13 + "john ", // 9097 + "david ", // 4794 + "charles ", // 1957 + "david bowie ", // 1200 + "michael jackson ", // 600 + "thelonious monk ", // 303 + "charles mingus ", // 142 + "marcus miller ", // 60 + "tamo ", // 13 "Notstandskomitee ", // 4 ], configure: base_conf, @@ -69,10 +69,15 @@ 
fn bench_songs(c: &mut criterion::Criterion) { .map(|s| { s.trim() .split(' ') - .map(|s| format!(r#""{}""#, s)).collect::>().join(" ") + .map(|s| format!(r#""{}""#, s)) + .collect::>() + .join(" ") }) .collect(); - let basic_with_quote: &[&str] = &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + let basic_with_quote: &[&str] = &basic_with_quote + .iter() + .map(|s| s.as_str()) + .collect::>(); let confs = &[ /* first we bench each criterion alone */ @@ -113,10 +118,10 @@ fn bench_songs(c: &mut criterion::Criterion) { group_name: "words", queries: &[ "the black saint and the sinner lady and the good doggo ", // four words to pop - "les liaisons dangeureuses 1793 ", // one word to pop - "The Disneyland Children's Sing-Alone song ", // two words to pop - "seven nation mummy ", // one word to pop - "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop + "les liaisons dangeureuses 1793 ", // one word to pop + "The Disneyland Children's Sing-Alone song ", // two words to pop + "seven nation mummy ", // one word to pop + "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13 ], @@ -133,7 +138,6 @@ fn bench_songs(c: &mut criterion::Criterion) { criterion: Some(&["desc(released-timestamp)"]), ..BASE_CONF }, - /* then we bench the asc and desc criterion on top of the default criterion */ utils::Conf { group_name: "asc + default", @@ -145,14 +149,11 @@ fn bench_songs(c: &mut criterion::Criterion) { criterion: Some(&desc_default[..]), ..BASE_CONF }, - /* the we bench some global / normal search with all the default criterion in the default * order */ utils::Conf { group_name: "basic placeholder", - queries: &[ - "", - ], + queries: &[""], ..BASE_CONF }, utils::Conf { @@ -173,14 +174,11 @@ fn bench_songs(c: &mut criterion::Criterion) { 
group_name: "prefix search", queries: &[ "s", // 500k+ results - "a", - "b", - "i", - "x", // only 7k results + "a", "b", "i", "x", // only 7k results ], ..BASE_CONF }, - ]; + ]; utils::run_benches(c, confs); } diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index b101adb63..4c8fb347d 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -1,8 +1,14 @@ -use std::{fs::{File, create_dir_all, remove_dir_all}, time::Duration}; +use std::{ + fs::{create_dir_all, remove_dir_all, File}, + time::Duration, +}; -use heed::EnvOpenOptions; use criterion::BenchmarkId; -use milli::{FacetCondition, Index, update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}}; +use heed::EnvOpenOptions; +use milli::{ + update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, + FacetCondition, Index, +}; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory @@ -11,7 +17,7 @@ pub struct Conf<'a> { /// the dataset to be used, it must be an uncompressed csv pub dataset: &'a str, pub group_name: &'a str, - pub queries: &'a[&'a str], + pub queries: &'a [&'a str], /// here you can change which criterion are used and in which order. 
/// - if you specify something all the base configuration will be thrown out /// - if you don't specify anything (None) the default configuration will be kept @@ -36,7 +42,6 @@ impl Conf<'_> { facet_condition: None, optional_words: true, }; - } pub fn base_setup(conf: &Conf) -> Index { @@ -77,7 +82,8 @@ pub fn base_setup(conf: &Conf) -> Index { builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli let dataset_path = format!("benches/{}", conf.dataset); - let reader = File::open(&dataset_path).expect(&format!("could not find the dataset in: {}", &dataset_path)); + let reader = File::open(&dataset_path) + .expect(&format!("could not find the dataset in: {}", &dataset_path)); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/benches/wiki.rs b/milli/benches/wiki.rs index fc8af02e5..d876814a9 100644 --- a/milli/benches/wiki.rs +++ b/milli/benches/wiki.rs @@ -38,10 +38,15 @@ fn bench_songs(c: &mut criterion::Criterion) { .map(|s| { s.trim() .split(' ') - .map(|s| format!(r#""{}""#, s)).collect::>().join(" ") + .map(|s| format!(r#""{}""#, s)) + .collect::>() + .join(" ") }) .collect(); - let basic_with_quote: &[&str] = &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + let basic_with_quote: &[&str] = &basic_with_quote + .iter() + .map(|s| s.as_str()) + .collect::>(); let confs = &[ /* first we bench each criterion alone */ From 7c7fba4e577edacf668f8cec6a24e0f48cea854d Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 14:39:36 +0200 Subject: [PATCH 16/33] remove the time limitation to let criterion do what it wants --- milli/benches/utils.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 4c8fb347d..3d91f726a 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -95,7 +95,6 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let index = 
base_setup(conf); let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); - group.measurement_time(Duration::from_secs(10)); for &query in conf.queries { group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { From e5dfde88fd042708d3765dd703e542c7a3a0a512 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 16:26:21 +0200 Subject: [PATCH 17/33] fix the facets conditions --- milli/benches/songs.rs | 24 +++++++++++++++++++++++- milli/benches/utils.rs | 10 ++++------ milli/benches/wiki.rs | 2 +- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index 0c056d93f..8ef6df3c8 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -138,6 +138,7 @@ fn bench_songs(c: &mut criterion::Criterion) { criterion: Some(&["desc(released-timestamp)"]), ..BASE_CONF }, + /* then we bench the asc and desc criterion on top of the default criterion */ utils::Conf { group_name: "asc + default", @@ -149,6 +150,24 @@ fn bench_songs(c: &mut criterion::Criterion) { criterion: Some(&desc_default[..]), ..BASE_CONF }, + + /* we bench the filters with the default request */ + utils::Conf { + group_name: "basic filter: <=", + facet_condition: Some("released-timestamp <= 946728000"), // year 2000 + ..BASE_CONF + }, + utils::Conf { + group_name: "basic filter: TO", + facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010 + ..BASE_CONF + }, + utils::Conf { + group_name: "big filter", + facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"), + ..BASE_CONF + }, + /* the we bench some global / normal search with all the default criterion in the default * order */ utils::Conf { @@ -174,7 +193,10 @@ fn bench_songs(c: &mut criterion::Criterion) { group_name: "prefix search", queries: &[ "s", // 500k+ results 
- "a", "b", "i", "x", // only 7k results + "a", // + "b", // + "i", // + "x", // only 7k results ], ..BASE_CONF }, diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 3d91f726a..460623ab5 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -1,7 +1,4 @@ -use std::{ - fs::{create_dir_all, remove_dir_all, File}, - time::Duration, -}; +use std::fs::{create_dir_all, remove_dir_all, File}; use criterion::BenchmarkId; use heed::EnvOpenOptions; @@ -24,7 +21,7 @@ pub struct Conf<'a> { pub criterion: Option<&'a [&'a str]>, /// the last chance to configure your database as you want pub configure: fn(&mut Settings), - pub facet_condition: Option, + pub facet_condition: Option<&'a str>, /// enable or disable the optional words on the query pub optional_words: bool, } @@ -102,7 +99,8 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let rtxn = index.read_txn().unwrap(); let mut search = index.search(&rtxn); search.query(query).optional_words(conf.optional_words); - if let Some(facet_condition) = conf.facet_condition.clone() { + if let Some(facet_condition) = conf.facet_condition { + let facet_condition = FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); search.facet_condition(facet_condition); } let _ids = search.execute().unwrap(); diff --git a/milli/benches/wiki.rs b/milli/benches/wiki.rs index d876814a9..8c15f11ca 100644 --- a/milli/benches/wiki.rs +++ b/milli/benches/wiki.rs @@ -25,7 +25,7 @@ const BASE_CONF: Conf = Conf { "spain ", // 7002 "japan ", // 10.593 "france ", // 17.616 - "film ", // 24.959 + "film ", // 24.959 ], configure: base_conf, ..Conf::BASE From 4969abeaab9711c953aea88efa65316c23756bd0 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 27 Apr 2021 15:02:14 +0200 Subject: [PATCH 18/33] update the facets for the benchmarks --- milli/benches/songs.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index 8ef6df3c8..430b73a40 
100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -20,8 +20,8 @@ fn base_conf(builder: &mut Settings) { builder.set_searchable_fields(searchable_fields); let faceted_fields = [ - ("released-timestamp", "integer"), - ("duration-float", "float"), + ("released-timestamp", "number"), + ("duration-float", "number"), ("genre", "string"), ("country", "string"), ("artist", "string"), From 3c84075d2d38b707c4df77deb960341f8c4bbedc Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 27 Apr 2021 15:41:16 +0200 Subject: [PATCH 19/33] uses an env variable to find the datasets --- milli/benches/README.md | 12 ++++++++++++ milli/benches/utils.rs | 17 +++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/milli/benches/README.md b/milli/benches/README.md index 9b53fc0d1..b2c1aec15 100644 --- a/milli/benches/README.md +++ b/milli/benches/README.md @@ -13,3 +13,15 @@ You can run the following command from the root of this git repository ``` wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz ``` + +- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h +- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h +- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h + +By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that: +``` +MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs +``` + +Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html) + diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 460623ab5..f3f5e9bf6 100644 --- 
a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -7,6 +7,15 @@ use milli::{ FacetCondition, Index, }; +/// The name of the environment variable used to select the path +/// of the directory containing the datasets +const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; + +/// The default path for the dataset if nothing is specified +/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be +/// executed with a pwd of `milli/milli` +const DEFAULT_DATASETS_PATH: &str = "milli/benches"; + pub struct Conf<'a> { /// where we are going to create our database.mmdb directory /// each benchmark will first try to delete it and then recreate it @@ -78,7 +87,10 @@ pub fn base_setup(conf: &Conf) -> Index { builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli - let dataset_path = format!("benches/{}", conf.dataset); + let base_dataset_path = std::env::vars() + .find(|var| var.0 == BASE_DATASETS_PATH_KEY) + .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value); + let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset); let reader = File::open(&dataset_path) .expect(&format!("could not find the dataset in: {}", &dataset_path)); builder.execute(reader, |_, _| ()).unwrap(); @@ -100,7 +112,8 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let mut search = index.search(&rtxn); search.query(query).optional_words(conf.optional_words); if let Some(facet_condition) = conf.facet_condition { - let facet_condition = FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); + let facet_condition = + FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); search.facet_condition(facet_condition); } let _ids = search.execute().unwrap(); From 06c414a75388bd34f7dc3e768f433e0b5bafac23 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 25 May 
2021 17:09:14 +0200 Subject: [PATCH 20/33] move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli --- Cargo.toml | 2 +- benchmarks/Cargo.toml | 29 ++++++++++ benchmarks/README.md | 30 ++++++++++ {milli => benchmarks}/benches/songs.rs | 3 +- {milli => benchmarks}/benches/utils.rs | 21 ++----- {milli => benchmarks}/benches/wiki.rs | 3 +- benchmarks/build.rs | 80 ++++++++++++++++++++++++++ benchmarks/src/lib.rs | 5 ++ milli/Cargo.toml | 9 --- milli/benches/README.md | 27 --------- 10 files changed, 154 insertions(+), 55 deletions(-) create mode 100644 benchmarks/Cargo.toml create mode 100644 benchmarks/README.md rename {milli => benchmarks}/benches/songs.rs (99%) rename {milli => benchmarks}/benches/utils.rs (81%) rename {milli => benchmarks}/benches/wiki.rs (98%) create mode 100644 benchmarks/build.rs create mode 100644 benchmarks/src/lib.rs delete mode 100644 milli/benches/README.md diff --git a/Cargo.toml b/Cargo.toml index a60c293e3..ff0b2582a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["milli", "http-ui", "infos", "helpers", "search"] +members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"] default-members = ["milli"] [profile.release] diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml new file mode 100644 index 000000000..f7b66fe3a --- /dev/null +++ b/benchmarks/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "benchmarks" +version = "0.1.0" +edition = "2018" +publish = false + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +milli = { path = "../milli" } + +[dev-dependencies] +heed = "*" # we want to use the version milli uses +criterion = "0.3.4" + +[build-dependencies] +anyhow = "1.0" +bytes = "1.0" +flate2 = "1.0.20" +convert_case = "0.4" +reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false } + +[[bench]] +name = 
"songs" +harness = false + +[[bench]] +name = "wiki" +harness = false diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..8c91700e9 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,30 @@ +Benchmarks +========== + +For our benchmark we are using a small subset of the dataset `songs.csv`. It was generated with this command: +``` +xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv +``` +You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz) +And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). + +We also use a subset of `wikipedia-articles.csv` that was generated with the following command: +``` +xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv +``` +You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz). + +----- + +- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h +- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h +- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h + +By default the benchmarks will be downloaded and uncompressed automatically in the target directory. 
+If you don't want to download the datasets everytime you updates something on the code you can specify a custom directory with the env variable `MILLI_BENCH_DATASETS_PATH`: +``` +mkdir ~/datasets +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded +touch build.rs +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded +``` diff --git a/milli/benches/songs.rs b/benchmarks/benches/songs.rs similarity index 99% rename from milli/benches/songs.rs rename to benchmarks/benches/songs.rs index 430b73a40..dd52a0afc 100644 --- a/milli/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -1,3 +1,4 @@ +mod datasets_paths; mod utils; use criterion::{criterion_group, criterion_main}; @@ -33,7 +34,7 @@ fn base_conf(builder: &mut Settings) { } const BASE_CONF: Conf = Conf { - dataset: "smol-songs.csv", + dataset: datasets_paths::SMOL_SONGS, queries: &[ "john ", // 9097 "david ", // 4794 diff --git a/milli/benches/utils.rs b/benchmarks/benches/utils.rs similarity index 81% rename from milli/benches/utils.rs rename to benchmarks/benches/utils.rs index f3f5e9bf6..e0feb9b0e 100644 --- a/milli/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -7,15 +7,6 @@ use milli::{ FacetCondition, Index, }; -/// The name of the environment variable used to select the path -/// of the directory containing the datasets -const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; - -/// The default path for the dataset if nothing is specified -/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be -/// executed with a pwd of `milli/milli` -const DEFAULT_DATASETS_PATH: &str = "milli/benches"; - pub struct Conf<'a> { /// where we are going to create our database.mmdb directory /// each benchmark will first try to delete it and then recreate it @@ -33,6 +24,8 @@ pub struct Conf<'a> { pub facet_condition: Option<&'a str>, /// enable 
or disable the optional words on the query pub optional_words: bool, + /// primary key, if there is None we'll auto-generate docids for every documents + pub primary_key: Option<&'a str>, } impl Conf<'_> { @@ -47,6 +40,7 @@ impl Conf<'_> { configure: Self::nop, facet_condition: None, optional_words: true, + primary_key: None, }; } @@ -86,13 +80,8 @@ pub fn base_setup(conf: &Conf) -> Index { let mut builder = update_builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - // we called from cargo the current directory is supposed to be milli/milli - let base_dataset_path = std::env::vars() - .find(|var| var.0 == BASE_DATASETS_PATH_KEY) - .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value); - let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset); - let reader = File::open(&dataset_path) - .expect(&format!("could not find the dataset in: {}", &dataset_path)); + let reader = File::open(conf.dataset) + .expect(&format!("could not find the dataset in: {}", conf.dataset)); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/benches/wiki.rs b/benchmarks/benches/wiki.rs similarity index 98% rename from milli/benches/wiki.rs rename to benchmarks/benches/wiki.rs index 8c15f11ca..99ecff2ce 100644 --- a/milli/benches/wiki.rs +++ b/benchmarks/benches/wiki.rs @@ -1,3 +1,4 @@ +mod datasets_paths; mod utils; use criterion::{criterion_group, criterion_main}; @@ -16,7 +17,7 @@ fn base_conf(builder: &mut Settings) { } const BASE_CONF: Conf = Conf { - dataset: "smol-wiki-articles.csv", + dataset: datasets_paths::SMOL_WIKI_ARTICLES, queries: &[ "mingus ", // 46 candidates "miles davis ", // 159 diff --git a/benchmarks/build.rs b/benchmarks/build.rs new file mode 100644 index 000000000..dc92a1a4c --- /dev/null +++ b/benchmarks/build.rs @@ -0,0 +1,80 @@ +use std::path::{Path, PathBuf}; +use std::{env, fs}; +use std::{ + 
fs::File, + io::{Cursor, Read, Seek, Write}, +}; + +use bytes::Bytes; +use convert_case::{Case, Casing}; +use flate2::read::GzDecoder; +use reqwest::IntoUrl; + +const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks"; + +const DATASET_SONGS: &str = "smol-songs"; +const DATASET_WIKI: &str = "smol-wiki-articles"; + +/// The name of the environment variable used to select the path +/// of the directory containing the datasets +const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; + +fn main() -> anyhow::Result<()> { + let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?)); + + let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches"); + let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?; + writeln!( + manifest_paths_file, + r#"//! This file is generated by the build script. +//! Do not modify by hand, use the build.rs file. +#![allow(dead_code)] +"# + )?; + writeln!(manifest_paths_file)?; + + for dataset in &[DATASET_SONGS, DATASET_WIKI] { + let out_path = out_dir.join(dataset); + let out_file = out_path.with_extension("csv"); + + writeln!( + &mut manifest_paths_file, + r#"pub const {}: &str = {:?};"#, + dataset.to_case(Case::ScreamingSnake), + out_file.display(), + )?; + + if out_file.exists() { + eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset); + continue; + } + let url = format!("{}/{}.csv.gz", BASE_URL, dataset); + eprintln!("downloading: {}", url); + let bytes = download_dataset(url.clone())?; + eprintln!("{} downloaded successfully", url); + eprintln!("uncompressing in {}", out_path.display()); + uncompress_in_file(bytes, &out_file)?; + } + + Ok(()) +} + +fn download_dataset(url: U) -> anyhow::Result> { + let bytes = reqwest::blocking::Client::builder() + .timeout(None) + .build()? + .get(url) + .send()? 
+ .bytes()?; + Ok(Cursor::new(bytes)) +} + +fn uncompress_in_file>(bytes: R, path: P) -> anyhow::Result<()> { + let path = path.as_ref(); + let mut gz = GzDecoder::new(bytes); + let mut dataset = Vec::new(); + gz.read_to_end(&mut dataset)?; + + fs::write(path, dataset)?; + Ok(()) +} diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs new file mode 100644 index 000000000..4281ec115 --- /dev/null +++ b/benchmarks/src/lib.rs @@ -0,0 +1,5 @@ +//! This library is only used to isolate the benchmarks +//! from the original milli library. +//! +//! It does not include interesting functions for milli library +//! users only for milli contributors. diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1c0f74613..2af6a9042 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -53,17 +53,8 @@ tinytemplate = "=1.1.0" [dev-dependencies] big_s = "1.0.2" -criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" [features] default = [] - -[[bench]] -name = "songs" -harness = false - -[[bench]] -name = "wiki" -harness = false diff --git a/milli/benches/README.md b/milli/benches/README.md deleted file mode 100644 index b2c1aec15..000000000 --- a/milli/benches/README.md +++ /dev/null @@ -1,27 +0,0 @@ -Benchmarks -========== - -For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command: -``` -xsv sample --seed 42 song.csv -o smol-songs.csv -``` -You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz) -And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). 
- -You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz` -You can run the following command from the root of this git repository -``` -wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz -``` - -- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h -- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h -- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h - -By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that: -``` -MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs -``` - -Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html) - From 4536dfccd05c522b3fc720e3d3dc51a2c68a6d65 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 25 May 2021 17:55:45 +0200 Subject: [PATCH 21/33] add a way to provide primary_key or autogenerate documents ids --- Cargo.lock | 346 ++++++++++++++++++++++++++++++++++-- benchmarks/benches/songs.rs | 1 + benchmarks/benches/utils.rs | 7 + 3 files changed, 338 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0b1da2b3f..04fd284c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,20 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "benchmarks" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes 1.0.1", + "convert_case", + "criterion", + "flate2", + "heed", + "milli", + "reqwest", +] + [[package]] name = "big_s" version = "1.0.2" @@ -327,6 +341,12 
@@ dependencies = [ "unicode-width", ] +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "cow-utils" version = "0.1.2" @@ -506,6 +526,15 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encoding_rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "fake-simd" version = "0.1.2" @@ -750,12 +779,31 @@ dependencies = [ "http", "indexmap", "slab", - "tokio", - "tokio-util", + "tokio 0.2.25", + "tokio-util 0.3.1", "tracing", "tracing-futures", ] +[[package]] +name = "h2" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" +dependencies = [ + "bytes 1.0.1", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio 1.6.0", + "tokio-util 0.6.7", + "tracing", +] + [[package]] name = "half" version = "1.7.1" @@ -893,6 +941,17 @@ dependencies = [ "http", ] +[[package]] +name = "http-body" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" +dependencies = [ + "bytes 1.0.1", + "http", + "pin-project-lite 0.2.6", +] + [[package]] name = "http-ui" version = "0.2.1" @@ -922,7 +981,7 @@ dependencies = [ "stderrlog", "structopt", "tempfile", - "tokio", + "tokio 0.2.25", "warp", ] @@ -960,20 +1019,59 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.2.7", "http", - "http-body", + "http-body 0.3.1", 
"httparse", "httpdate", "itoa", "pin-project 1.0.5", - "socket2", - "tokio", + "socket2 0.3.19", + "tokio 0.2.25", "tower-service", "tracing", "want", ] +[[package]] +name = "hyper" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1" +dependencies = [ + "bytes 1.0.1", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.3", + "http", + "http-body 0.4.2", + "httparse", + "httpdate", + "itoa", + "pin-project 1.0.5", + "socket2 0.4.0", + "tokio 1.6.0", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64" +dependencies = [ + "futures-util", + "hyper 0.14.5", + "log", + "rustls", + "tokio 1.6.0", + "tokio-rustls", + "webpki", +] + [[package]] name = "idna" version = "0.2.2" @@ -1029,6 +1127,12 @@ dependencies = [ "libc", ] +[[package]] +name = "ipnet" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" + [[package]] name = "itertools" version = "0.9.0" @@ -1261,7 +1365,6 @@ dependencies = [ "bstr", "byteorder", "chrono", - "criterion", "crossbeam-channel", "csv", "either", @@ -1343,6 +1446,19 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "mio" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956" +dependencies = [ + "libc", + "log", + "miow 0.3.7", + "ntapi", + "winapi 0.3.9", +] + [[package]] name = "mio-named-pipes" version = "0.1.7" @@ -1350,7 +1466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" dependencies = [ "log", - 
"mio", + "mio 0.6.23", "miow 0.3.7", "winapi 0.3.9", ] @@ -1363,7 +1479,7 @@ checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" dependencies = [ "iovec", "libc", - "mio", + "mio 0.6.23", ] [[package]] @@ -1441,6 +1557,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi 0.3.9", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -1956,12 +2081,62 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "reqwest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124" +dependencies = [ + "base64 0.13.0", + "bytes 1.0.1", + "encoding_rs", + "futures-core", + "futures-util", + "http", + "http-body 0.4.2", + "hyper 0.14.5", + "hyper-rustls", + "ipnet", + "js-sys", + "lazy_static", + "log", + "mime", + "percent-encoding", + "pin-project-lite 0.2.6", + "rustls", + "serde", + "serde_urlencoded 0.7.0", + "tokio 1.6.0", + "tokio-rustls", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", + "winreg", +] + [[package]] name = "retain_mut" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi 0.3.9", +] + [[package]] name = "roaring" version = "0.6.6" @@ -1982,6 +2157,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustls" +version = "0.19.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" +dependencies = [ + "base64 0.13.0", + "log", + "ring", + "sct", + "webpki", +] + [[package]] name = "ryu" version = "1.0.5" @@ -2015,6 +2203,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "sct" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "search" version = "0.2.1" @@ -2108,6 +2306,18 @@ dependencies = [ "url", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "sha-1" version = "0.8.2" @@ -2193,6 +2403,22 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "socket2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e3dfc207c526015c632472a77be09cf1b6e46866581aecae5cc38fb4235dea2" +dependencies = [ + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + [[package]] name = "static_assertions" version = "1.1.0" @@ -2386,7 +2612,7 @@ dependencies = [ "lazy_static", "libc", "memchr", - "mio", + "mio 0.6.23", "mio-named-pipes", "mio-uds", "num_cpus", @@ -2397,6 +2623,21 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "tokio" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" +dependencies = [ + "autocfg", + "bytes 1.0.1", + "libc", + "memchr", + "mio 0.7.11", + "num_cpus", + "pin-project-lite 0.2.6", +] + [[package]] name = "tokio-macros" version = "0.2.6" @@ -2408,6 +2649,17 @@ dependencies = [ "syn 1.0.64", ] +[[package]] +name = "tokio-rustls" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" +dependencies = [ + "rustls", + "tokio 1.6.0", + "webpki", +] + [[package]] name = "tokio-tungstenite" version = "0.11.0" @@ -2417,7 +2669,7 @@ dependencies = [ "futures-util", "log", "pin-project 0.4.27", - "tokio", + "tokio 0.2.25", "tungstenite", ] @@ -2432,7 +2684,21 @@ dependencies = [ "futures-sink", "log", "pin-project-lite 0.1.12", - "tokio", + "tokio 0.2.25", +] + +[[package]] +name = "tokio-util" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592" +dependencies = [ + "bytes 1.0.1", + "futures-core", + "futures-sink", + "log", + "pin-project-lite 0.2.6", + "tokio 1.6.0", ] [[package]] @@ -2578,6 +2844,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + [[package]] name = "url" version = "2.2.1" @@ -2654,7 +2926,7 @@ dependencies = [ "futures", "headers", "http", - "hyper", + "hyper 0.13.10", "log", "mime", "mime_guess", @@ -2663,8 +2935,8 @@ dependencies = [ "scoped-tls", "serde", "serde_json", - "serde_urlencoded", - "tokio", + "serde_urlencoded 0.6.1", + "tokio 0.2.25", "tokio-tungstenite", "tower-service", "tracing", @@ -2691,6 +2963,8 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe" dependencies = [ "cfg-if 1.0.0", + "serde", + "serde_json", "wasm-bindgen-macro", ] @@ -2709,6 +2983,18 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.72" @@ -2748,6 +3034,25 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "webpki-roots" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940" +dependencies = [ + "webpki", +] + [[package]] name = "whatlang" version = "0.9.0" @@ -2800,6 +3105,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "winreg" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +dependencies = [ + "winapi 0.3.9", +] + [[package]] name = "ws2_32-sys" version = "0.2.1" diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs index dd52a0afc..dea8cd605 100644 --- a/benchmarks/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -48,6 +48,7 @@ const BASE_CONF: Conf = Conf { "Notstandskomitee ", // 4 ], configure: base_conf, + primary_key: 
Some("id"), ..Conf::BASE }; diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index e0feb9b0e..6fa5f2d19 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -56,6 +56,10 @@ pub fn base_setup(conf: &Conf) -> Index { options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(10); let index = Index::new(options, conf.database_name).unwrap(); + if let Some(primary_key) = conf.primary_key { + let mut wtxn = index.write_txn().unwrap(); + index.put_primary_key(&mut wtxn, primary_key).unwrap(); + } let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); @@ -78,6 +82,9 @@ pub fn base_setup(conf: &Conf) -> Index { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.index_documents(&mut wtxn, &index); + if let None = conf.primary_key { + builder.enable_autogenerate_docids(); + } builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); let reader = File::open(conf.dataset) From 0d0e900158cba3c450c99e241db18c224d1a4a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 26 May 2021 15:57:22 +0200 Subject: [PATCH 22/33] Add CI for benchmarks --- .github/workflows/benchmarks.yml | 63 +++++++++++++++++++ benchmarks/README.md | 101 +++++++++++++++++++++++++++---- benchmarks/scripts/compare.sh | 58 ++++++++++++++++++ 3 files changed, 209 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/benchmarks.yml create mode 100644 benchmarks/scripts/compare.sh diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 000000000..867e13132 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,63 @@ +name: Benchmarks + +on: + workflow_dispatch: + inputs: + dataset_name: + description: 'The name of the dataset used to benchmark (songs or wiki)' + required: false + default: 
'songs' + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "##[set-output name=basename;]$(echo ${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmpf files + - name: Install critcmp + run: cargo install critcmp + - name: Export cripcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: 
critcmp_results diff --git a/benchmarks/README.md b/benchmarks/README.md index 8c91700e9..cde4062e5 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,30 +1,105 @@ Benchmarks ========== -For our benchmark we are using a small subset of the dataset `songs.csv`. It was generated with this command: -``` +## TOC + +- [Datasets](#datasets) +- [Run the benchmarks](#run-the-benchmarks) +- [Comparison between benchmarks](#comparison-between-benchmarks) + +## Datasets + +The benchmarks are available for the following datasets: +- `songs` +- `wiki` + +### Songs + +`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). + +It was generated with this command: + +```bash xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv ``` -You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz) -And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). -We also use a subset of `wikipedia-articles.csv` that was generated with the following command: -``` +_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._ + +### Wiki + +`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz). + +It was generated with the following command: + +```bash xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv ``` -You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz). 
------ +_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz)._ -- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h -- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h -- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h +## Run the benchmarks -By default the benchmarks will be downloaded and uncompressed automatically in the target directory. -If you don't want to download the datasets everytime you updates something on the code you can specify a custom directory with the env variable `MILLI_BENCH_DATASETS_PATH`: +### On our private server + +The Meili team has self-hosted its own GitHub runner to run benchmarks on our dedicated bare metal server. + +To trigger the benchmark workflow: +- Go to the `Actions` tab of this repository. +- Select the `Benchmarks` workflow on the left. +- Click on `Run workflow` in the blue banner. +- Select the branch on which you want to run the benchmarks and select the dataset you want (default: `songs`). +- Finally, click on `Run workflow`. + +This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3). + +_[More about critcmp](https://github.com/BurntSushi/critcmp)._ + +### On your machine + +To run all the benchmarks (~4h): + +```bash +cargo bench ``` + +To run only the `songs` (~1h) or `wiki` (~3h) benchmark: + +```bash +cargo bench --bench <dataset name> +``` + +By default, the benchmarks will be downloaded and uncompressed automatically in the target directory.
+If you don't want to download the datasets every time you update something on the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`: + +```bash mkdir ~/datasets MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded touch build.rs MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded ``` + +## Comparison between benchmarks + +The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to generate comparison results between 2 benchmarks. + +We provide a script to download and display the comparison report. + +Requirements: +- [`s3cmd`](https://github.com/s3tools/s3cmd) and being logged to the DigitalOcean Space "milli-benchmarks". See the [DigitalOcean guide](https://docs.digitalocean.com/products/spaces/resources/s3cmd/) +- [`critcmp`](https://github.com/BurntSushi/critcmp) + +List the available file in the DO Space: + +```bash +s3cmd ls s3://milli-benchmarks/critcmp_results/ +``` +```bash +2021-05-31 14:40 279890 s3://milli-benchmarks/critcmp_results/songs_main_09a4321.json +2021-05-31 13:49 279576 s3://milli-benchmarks/critcmp_results/songs_geosearch_24ec456.json +``` + +Run the comparison script: + +```bash +bash benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json +``` diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh new file mode 100644 index 000000000..868baeacf --- /dev/null +++ b/benchmarks/scripts/compare.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Requirements: +# - s3cmd and being logged to the DO Space "milli-benchmarks". See: https://docs.digitalocean.com/products/spaces/resources/s3cmd/ +# - critcmp. 
See: https://github.com/BurntSushi/critcmp + +# Usage +# $ bash compare.sh json_file1 json_file1 +# ex: bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json + +# Checking that critcmp is installed +command -v critcmp > /dev/null 2>&1 +if [[ "$?" -ne 0 ]]; then + echo 'You must install critcmp to make this script working.' + echo '$ cargo install critcmp' + echo 'See: https://github.com/BurntSushi/critcmp' + exit 1 +fi + +# Checking that s3cmd is installed +command -v s3cmd > /dev/null 2>&1 +if [[ "$?" -ne 0 ]]; then + echo 'You must install s3cmd to make this script working.' + echo 'See: https://github.com/s3tools/s3cmd' + exit 1 +fi + +if [[ $# -ne 2 ]] + then + echo 'Need 2 arguments.' + echo 'Usage: ' + echo ' $ bash compare.sh file_to_download1 file_to_download2' + echo 'Ex:' + echo ' $ bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' + exit 1 +fi + +file1="$1" +file2="$2" +s3_path='s3://milli-benchmarks/critcmp_results' +file1_s3_path="$s3_path/$file1" +file2_s3_path="$s3_path/$file2" +file1_local_path="/tmp/$file1" +file2_local_path="/tmp/$file2" + +if [[ ! -f "$file1_local_path" ]]; then + s3cmd get "$file1_s3_path" "$file1_local_path" +else + echo "$file1 already present in /tmp, no need to download." +fi + +if [[ ! -f "$file2_local_path" ]]; then + s3cmd get "$file2_s3_path" "$file2_local_path" +else + echo "$file2 already present in /tmp, no need to download." 
+fi + +critcmp --color always "$file1_local_path" "$file2_local_path" From b3c0d438902ab069b4e4e9492c8c2c7e5ff87ac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 13:56:11 +0200 Subject: [PATCH 23/33] Update benchmarks/scripts/compare.sh Co-authored-by: Irevoire --- benchmarks/scripts/compare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 868baeacf..6bd260122 100644 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Requirements: # - s3cmd and being logged to the DO Space "milli-benchmarks". See: https://docs.digitalocean.com/products/spaces/resources/s3cmd/ From 57ed96622b92e066f0b1220e32c09161bf84bf7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 13:56:26 +0200 Subject: [PATCH 24/33] Update benchmarks/scripts/compare.sh Co-authored-by: Irevoire --- benchmarks/scripts/compare.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 6bd260122..4d3205c96 100644 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -45,6 +45,10 @@ file2_local_path="/tmp/$file2" if [[ ! -f "$file1_local_path" ]]; then s3cmd get "$file1_s3_path" "$file1_local_path" + if [[ "$?" -ne 0 ]]; then + echo 's3cmd command failed. Check your configuration' + exit 1 + fi else echo "$file1 already present in /tmp, no need to download." 
fi From 61fe422a884fe299a36ee3a0c455beef6648825b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 13:56:42 +0200 Subject: [PATCH 25/33] Update benchmarks/scripts/compare.sh Co-authored-by: Irevoire --- benchmarks/scripts/compare.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 4d3205c96..02f903bee 100644 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -55,6 +55,10 @@ fi if [[ ! -f "$file2_local_path" ]]; then s3cmd get "$file2_s3_path" "$file2_local_path" + if [[ "$?" -ne 0 ]]; then + echo 's3cmd command failed. Check your configuration' + exit 1 + fi else echo "$file2 already present in /tmp, no need to download." fi From bc4f4ee829fba22ba766e9b9f5a1a8f1a3d8bc79 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 1 Jun 2021 14:43:47 +0200 Subject: [PATCH 26/33] remove s3cmd as a dependency and provide a script to list all the available benchmarks --- benchmarks/scripts/compare.sh | 23 +++++++++++------------ benchmarks/scripts/list.sh | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) mode change 100644 => 100755 benchmarks/scripts/compare.sh create mode 100755 benchmarks/scripts/list.sh diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh old mode 100644 new mode 100755 index 02f903bee..e4231131d --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash # Requirements: -# - s3cmd and being logged to the DO Space "milli-benchmarks". See: https://docs.digitalocean.com/products/spaces/resources/s3cmd/ # - critcmp. See: https://github.com/BurntSushi/critcmp +# - wget # Usage # $ bash compare.sh json_file1 json_file1 @@ -17,11 +17,10 @@ if [[ "$?" -ne 0 ]]; then exit 1 fi -# Checking that s3cmd is installed -command -v s3cmd > /dev/null 2>&1 +# Checking that wget is installed +command -v wget > /dev/null 2>&1 if [[ "$?" 
-ne 0 ]]; then - echo 'You must install s3cmd to make this script working.' - echo 'See: https://github.com/s3tools/s3cmd' + echo 'You must install wget to make this script working.' exit 1 fi @@ -37,16 +36,16 @@ fi file1="$1" file2="$2" -s3_path='s3://milli-benchmarks/critcmp_results' -file1_s3_path="$s3_path/$file1" -file2_s3_path="$s3_path/$file2" +s3_url='https://milli-benchmarks.fra1.digitaloceanspaces.com/critcmp_results' +file1_s3_url="$s3_url/$file1" +file2_s3_url="$s3_url/$file2" file1_local_path="/tmp/$file1" file2_local_path="/tmp/$file2" if [[ ! -f "$file1_local_path" ]]; then - s3cmd get "$file1_s3_path" "$file1_local_path" + wget "$file1_s3_url" -O "$file1_local_path" if [[ "$?" -ne 0 ]]; then - echo 's3cmd command failed. Check your configuration' + echo 'wget command failed. Check your configuration' exit 1 fi else @@ -54,9 +53,9 @@ else fi if [[ ! -f "$file2_local_path" ]]; then - s3cmd get "$file2_s3_path" "$file2_local_path" + wget "$file2_s3_url" -O "$file2_local_path" if [[ "$?" -ne 0 ]]; then - echo 's3cmd command failed. Check your configuration' + echo 'wget command failed. 
Check your configuration' exit 1 fi else diff --git a/benchmarks/scripts/list.sh b/benchmarks/scripts/list.sh new file mode 100755 index 000000000..b368028da --- /dev/null +++ b/benchmarks/scripts/list.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# Requirements: +# - curl +# - grep + +res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -oP "(?<=)[^<]+" | grep -oP --color=never "(?<=^critcmp_results/).+") + +for pattern in "$@" +do + res=$(echo "$res" | grep $pattern) +done + +echo "$res" From 3c91a9a551e19bc15449837b093de55fe594e743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 16:37:57 +0200 Subject: [PATCH 27/33] Update following reviews --- .github/workflows/benchmarks.yml | 14 +++++++++++--- benchmarks/README.md | 11 ++++++++--- benchmarks/scripts/compare.sh | 17 +++++------------ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 867e13132..a2da8e6d5 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -39,12 +39,12 @@ jobs: id: file # Run benchmarks - - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | cd benchmarks cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }} - # Generate critcmpf files + # Generate critcmp files - name: Install critcmp run: cargo install critcmp - name: Export cripcmp file @@ -52,7 +52,7 @@ jobs: critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json # Upload benchmarks - - name: Upload to DO Spaces # DigitalOcean Spaces = S3 + - name: Upload ${{ 
steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 uses: BetaHuhn/do-spaces-action@v2 with: access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} @@ -61,3 +61,11 @@ jobs: space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} source: ${{ steps.file.outputs.basename }}.json out_dir: critcmp_results + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' + echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json " diff --git a/benchmarks/README.md b/benchmarks/README.md index cde4062e5..caa4e163f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -52,8 +52,12 @@ To trigger the benchmark workflow: This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3). +The name of the uploaded file is displayed in the workflow. + _[More about critcmp](https://github.com/BurntSushi/critcmp)._ +💡 To compare the just-uploaded benchmark with another one, check out the [next section](#comparison-between-benchmarks). + ### On your machine To run all the benchmarks (~4h): @@ -85,13 +89,14 @@ The benchmark reports we push are generated with `critcmp`. Thus, we use `critcm We provide a script to download and display the comparison report. Requirements: -- [`s3cmd`](https://github.com/s3tools/s3cmd) and being logged to the DigitalOcean Space "milli-benchmarks". 
See the [DigitalOcean guide](https://docs.digitalocean.com/products/spaces/resources/s3cmd/) +- `grep` +- `curl` - [`critcmp`](https://github.com/BurntSushi/critcmp) List the available file in the DO Space: ```bash -s3cmd ls s3://milli-benchmarks/critcmp_results/ +./benchmarks/script/list.sh ``` ```bash 2021-05-31 14:40 279890 s3://milli-benchmarks/critcmp_results/songs_main_09a4321.json @@ -101,5 +106,5 @@ s3cmd ls s3://milli-benchmarks/critcmp_results/ Run the comparison script: ```bash -bash benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json +./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json ``` diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index e4231131d..6f8d0c5af 100755 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -2,7 +2,7 @@ # Requirements: # - critcmp. See: https://github.com/BurntSushi/critcmp -# - wget +# - curl # Usage # $ bash compare.sh json_file1 json_file1 @@ -17,13 +17,6 @@ if [[ "$?" -ne 0 ]]; then exit 1 fi -# Checking that wget is installed -command -v wget > /dev/null 2>&1 -if [[ "$?" -ne 0 ]]; then - echo 'You must install wget to make this script working.' - exit 1 -fi - if [[ $# -ne 2 ]] then echo 'Need 2 arguments.' @@ -43,9 +36,9 @@ file1_local_path="/tmp/$file1" file2_local_path="/tmp/$file2" if [[ ! -f "$file1_local_path" ]]; then - wget "$file1_s3_url" -O "$file1_local_path" + curl "$file1_s3_url" -O "$file1_local_path" if [[ "$?" -ne 0 ]]; then - echo 'wget command failed. Check your configuration' + echo 'curl command failed. Check your configuration' exit 1 fi else @@ -53,9 +46,9 @@ else fi if [[ ! -f "$file2_local_path" ]]; then - wget "$file2_s3_url" -O "$file2_local_path" + curl "$file2_s3_url" -O "$file2_local_path" if [[ "$?" -ne 0 ]]; then - echo 'wget command failed. Check your configuration' + echo 'curl command failed. 
Check your configuration' exit 1 fi else From edfcdb171c8a85f56d0d26dd5e6eb5a841c13690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 18:54:39 +0200 Subject: [PATCH 28/33] Update benchmarks/scripts/list.sh Co-authored-by: Irevoire --- benchmarks/scripts/list.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/scripts/list.sh b/benchmarks/scripts/list.sh index b368028da..764193329 100755 --- a/benchmarks/scripts/list.sh +++ b/benchmarks/scripts/list.sh @@ -4,7 +4,7 @@ # - curl # - grep -res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -oP "(?<=)[^<]+" | grep -oP --color=never "(?<=^critcmp_results/).+") +res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -o '[^<]\+' | cut -c 5- | grep critcmp_results/ | cut -c 18-) for pattern in "$@" do From ef1ac8a0cb9c0ef39f220bd553dbc02d0450f167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 18:57:35 +0200 Subject: [PATCH 29/33] Update README --- benchmarks/README.md | 4 ++-- benchmarks/scripts/compare.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index caa4e163f..ebe8eecdf 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -99,8 +99,8 @@ List the available file in the DO Space: ./benchmarks/script/list.sh ``` ```bash -2021-05-31 14:40 279890 s3://milli-benchmarks/critcmp_results/songs_main_09a4321.json -2021-05-31 13:49 279576 s3://milli-benchmarks/critcmp_results/songs_geosearch_24ec456.json +songs_main_09a4321.json +songs_geosearch_24ec456.json ``` Run the comparison script: diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 6f8d0c5af..067772bec 100755 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -21,9 +21,9 @@ if [[ $# -ne 2 ]] then echo 'Need 2 arguments.' 
echo 'Usage: ' - echo ' $ bash compare.sh file_to_download1 file_to_download2' + echo ' $ ./compare.sh file_to_download1 file_to_download2' echo 'Ex:' - echo ' $ bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' + echo ' $ ./compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' exit 1 fi @@ -38,7 +38,7 @@ file2_local_path="/tmp/$file2" if [[ ! -f "$file1_local_path" ]]; then curl "$file1_s3_url" -O "$file1_local_path" if [[ "$?" -ne 0 ]]; then - echo 'curl command failed. Check your configuration' + echo 'curl command failed.' exit 1 fi else @@ -48,7 +48,7 @@ fi if [[ ! -f "$file2_local_path" ]]; then curl "$file2_s3_url" -O "$file2_local_path" if [[ "$?" -ne 0 ]]; then - echo 'curl command failed. Check your configuration' + echo 'curl command failed.' exit 1 fi else From f346805c0c84e7c1b9dbb4f72ef2d51964a60166 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 2 Jun 2021 15:47:03 +0200 Subject: [PATCH 30/33] Update benchmarks/Cargo.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- benchmarks/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index f7b66fe3a..6be9c79d1 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2018" publish = false -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] milli = { path = "../milli" } From 3db25153e5e3641c2fbd5e793dce359b6de326e7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 2 Jun 2021 17:00:58 +0200 Subject: [PATCH 31/33] fix the faceted_fields one last time --- benchmarks/benches/songs.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs index dea8cd605..3f2822ca3 100644 --- a/benchmarks/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -21,14 +21,14 @@ 
fn base_conf(builder: &mut Settings) { builder.set_searchable_fields(searchable_fields); let faceted_fields = [ - ("released-timestamp", "number"), - ("duration-float", "number"), - ("genre", "string"), - ("country", "string"), - ("artist", "string"), + "released-timestamp", + "duration-float", + "genre", + "country", + "artist", ] .iter() - .map(|(a, b)| (a.to_string(), b.to_string())) + .map(|s| s.to_string()) .collect(); builder.set_faceted_fields(faceted_fields); } From 087ae648997e51bad96dff8ffa83991e864f2415 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 2 Jun 2021 17:03:30 +0200 Subject: [PATCH 32/33] add a gitignore to avoid pushing the autogenerated file --- benchmarks/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 benchmarks/.gitignore diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 000000000..1f259516b --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1 @@ +benches/datasets_paths.rs From 6dc08bf45e768bdf9ef4a87b7e8dc7b0346a99e1 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 2 Jun 2021 17:09:21 +0200 Subject: [PATCH 33/33] remove the nop function --- benchmarks/benches/utils.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 6fa5f2d19..83367a7ca 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -29,15 +29,13 @@ pub struct Conf<'a> { } impl Conf<'_> { - fn nop(_builder: &mut Settings) {} - pub const BASE: Self = Conf { database_name: "benches.mmdb", dataset: "", group_name: "", queries: &[], criterion: None, - configure: Self::nop, + configure: |_| (), facet_condition: None, optional_words: true, primary_key: None,