2021-04-14 16:26:21 +02:00
|
|
|
use std::fs::{create_dir_all, remove_dir_all, File};
|
2021-06-03 15:59:43 +02:00
|
|
|
use std::path::Path;
|
2021-04-01 18:54:14 +02:00
|
|
|
|
2021-04-07 11:50:38 +02:00
|
|
|
use criterion::BenchmarkId;
|
2021-04-14 13:13:33 +02:00
|
|
|
use heed::EnvOpenOptions;
|
|
|
|
use milli::{
|
|
|
|
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
|
2021-06-03 10:33:42 +02:00
|
|
|
FilterCondition, Index,
|
2021-04-14 13:13:33 +02:00
|
|
|
};
|
2021-04-01 18:54:14 +02:00
|
|
|
|
2021-04-07 11:50:38 +02:00
|
|
|
pub struct Conf<'a> {
|
2021-04-13 10:44:27 +02:00
|
|
|
/// where we are going to create our database.mmdb directory
|
|
|
|
/// each benchmark will first try to delete it and then recreate it
|
|
|
|
pub database_name: &'a str,
|
|
|
|
/// the dataset to be used, it must be an uncompressed csv
|
|
|
|
pub dataset: &'a str,
|
2021-04-07 11:50:38 +02:00
|
|
|
pub group_name: &'a str,
|
2021-04-14 13:13:33 +02:00
|
|
|
pub queries: &'a [&'a str],
|
2021-04-13 11:40:16 +02:00
|
|
|
/// here you can change which criterion are used and in which order.
|
|
|
|
/// - if you specify something all the base configuration will be thrown out
|
|
|
|
/// - if you don't specify anything (None) the default configuration will be kept
|
2021-04-07 11:50:38 +02:00
|
|
|
pub criterion: Option<&'a [&'a str]>,
|
2021-04-13 11:40:16 +02:00
|
|
|
/// the last chance to configure your database as you want
|
|
|
|
pub configure: fn(&mut Settings),
|
2021-06-03 10:33:42 +02:00
|
|
|
pub filter: Option<&'a str>,
|
2021-04-13 11:40:16 +02:00
|
|
|
/// enable or disable the optional words on the query
|
2021-04-07 11:50:38 +02:00
|
|
|
pub optional_words: bool,
|
2021-05-25 17:09:14 +02:00
|
|
|
/// primary key, if there is None we'll auto-generate docids for every documents
|
|
|
|
pub primary_key: Option<&'a str>,
|
2021-04-07 11:50:38 +02:00
|
|
|
}
|
|
|
|
|
2021-04-13 10:44:27 +02:00
|
|
|
impl Conf<'_> {
|
|
|
|
pub const BASE: Self = Conf {
|
|
|
|
database_name: "benches.mmdb",
|
|
|
|
dataset: "",
|
|
|
|
group_name: "",
|
|
|
|
queries: &[],
|
|
|
|
criterion: None,
|
2021-06-02 17:09:21 +02:00
|
|
|
configure: |_| (),
|
2021-06-03 10:33:42 +02:00
|
|
|
filter: None,
|
2021-04-13 10:44:27 +02:00
|
|
|
optional_words: true,
|
2021-05-25 17:09:14 +02:00
|
|
|
primary_key: None,
|
2021-04-13 10:44:27 +02:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2021-04-13 11:40:16 +02:00
|
|
|
pub fn base_setup(conf: &Conf) -> Index {
|
|
|
|
match remove_dir_all(&conf.database_name) {
|
2021-04-13 10:44:27 +02:00
|
|
|
Ok(_) => (),
|
|
|
|
Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
|
|
|
|
Err(e) => panic!("{}", e),
|
|
|
|
}
|
2021-04-13 11:40:16 +02:00
|
|
|
create_dir_all(&conf.database_name).unwrap();
|
2021-04-01 18:54:14 +02:00
|
|
|
|
|
|
|
let mut options = EnvOpenOptions::new();
|
|
|
|
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
|
|
|
options.max_readers(10);
|
2021-04-13 11:40:16 +02:00
|
|
|
let index = Index::new(options, conf.database_name).unwrap();
|
2021-05-25 17:55:45 +02:00
|
|
|
if let Some(primary_key) = conf.primary_key {
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
|
|
|
index.put_primary_key(&mut wtxn, primary_key).unwrap();
|
|
|
|
}
|
2021-04-01 18:54:14 +02:00
|
|
|
|
|
|
|
let update_builder = UpdateBuilder::new(0);
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
|
|
|
let mut builder = update_builder.settings(&mut wtxn, &index);
|
|
|
|
|
2021-04-13 11:40:16 +02:00
|
|
|
if let Some(criterion) = conf.criterion {
|
2021-06-03 10:33:42 +02:00
|
|
|
builder.reset_filterable_fields();
|
2021-04-01 18:54:14 +02:00
|
|
|
builder.reset_criteria();
|
|
|
|
builder.reset_stop_words();
|
|
|
|
|
2021-04-13 11:40:16 +02:00
|
|
|
let criterion = criterion.iter().map(|s| s.to_string()).collect();
|
2021-04-07 11:50:38 +02:00
|
|
|
builder.set_criteria(criterion);
|
2021-04-01 18:54:14 +02:00
|
|
|
}
|
|
|
|
|
2021-04-13 11:40:16 +02:00
|
|
|
(conf.configure)(&mut builder);
|
|
|
|
|
2021-04-01 18:54:14 +02:00
|
|
|
builder.execute(|_, _| ()).unwrap();
|
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
let update_builder = UpdateBuilder::new(0);
|
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
|
|
|
let mut builder = update_builder.index_documents(&mut wtxn, &index);
|
2021-05-25 17:55:45 +02:00
|
|
|
if let None = conf.primary_key {
|
|
|
|
builder.enable_autogenerate_docids();
|
|
|
|
}
|
2021-04-01 18:54:14 +02:00
|
|
|
builder.update_format(UpdateFormat::Csv);
|
|
|
|
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
|
2021-05-25 17:09:14 +02:00
|
|
|
let reader = File::open(conf.dataset)
|
|
|
|
.expect(&format!("could not find the dataset in: {}", conf.dataset));
|
2021-04-01 18:54:14 +02:00
|
|
|
builder.execute(reader, |_, _| ()).unwrap();
|
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
index
|
|
|
|
}
|
2021-04-07 11:50:38 +02:00
|
|
|
|
|
|
|
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
|
|
|
|
for conf in confs {
|
2021-04-13 11:40:16 +02:00
|
|
|
let index = base_setup(conf);
|
2021-04-07 11:50:38 +02:00
|
|
|
|
2021-06-03 15:59:43 +02:00
|
|
|
let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap();
|
|
|
|
let name = format!("{}: {}", file_name, conf.group_name);
|
|
|
|
let mut group = c.benchmark_group(&name);
|
2021-04-07 11:50:38 +02:00
|
|
|
|
|
|
|
for &query in conf.queries {
|
|
|
|
group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
|
|
|
|
b.iter(|| {
|
|
|
|
let rtxn = index.read_txn().unwrap();
|
2021-04-13 10:44:27 +02:00
|
|
|
let mut search = index.search(&rtxn);
|
|
|
|
search.query(query).optional_words(conf.optional_words);
|
2021-06-03 10:33:42 +02:00
|
|
|
if let Some(filter) = conf.filter {
|
2021-06-03 15:59:43 +02:00
|
|
|
let filter = FilterCondition::from_str(&rtxn, &index, filter).unwrap();
|
2021-06-03 10:33:42 +02:00
|
|
|
search.filter(filter);
|
2021-04-13 10:44:27 +02:00
|
|
|
}
|
|
|
|
let _ids = search.execute().unwrap();
|
2021-04-07 11:50:38 +02:00
|
|
|
});
|
|
|
|
});
|
|
|
|
}
|
|
|
|
group.finish();
|
|
|
|
}
|
|
|
|
}
|