#![allow(dead_code)]

use std::fs::{create_dir_all, remove_dir_all, File};
use std::io::{self, BufReader, BufWriter, Read};
use std::path::Path;
use std::str::FromStr as _;

use anyhow::Context;
use bumpalo::Bump;
use criterion::BenchmarkId;
use memmap2::Mmap;
use milli::heed::EnvOpenOptions;
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy};
use serde_json::Value;

pub struct Conf<'a> {
    /// Where we are going to create our `database.mmdb` directory;
    /// each benchmark will first try to delete it and then recreate it.
    pub database_name: &'a str,
    /// The dataset to be used; it must be an uncompressed file in the format
    /// given by `dataset_format`.
    pub dataset: &'a str,
    /// The format of the dataset.
    pub dataset_format: &'a str,
    pub group_name: &'a str,
    pub queries: &'a [&'a str],
    /// Here you can change which criteria are used and in which order:
    /// - if you specify something, the whole base configuration will be thrown out;
    /// - if you don't specify anything (`None`), the default configuration will be kept.
    pub criterion: Option<&'a [&'a str]>,
    /// The last chance to configure your database as you want.
    pub configure: fn(&mut Settings),
    pub filter: Option<&'a str>,
    pub sort: Option<Vec<&'a str>>,
    /// Enable or disable the optional words on the query.
    pub optional_words: bool,
    /// The primary key; if it is `None`, we'll auto-generate docids for every document.
    pub primary_key: Option<&'a str>,
}

impl Conf<'_> {
    pub const BASE: Self = Conf {
        database_name: "benches.mmdb",
        dataset_format: "csv",
        dataset: "",
        group_name: "",
        queries: &[],
        criterion: None,
        configure: |_| (),
        filter: None,
        sort: None,
        optional_words: true,
        primary_key: None,
    };
}
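
// An illustrative configuration (the group name, dataset path, and query are
// hypothetical): start from `Conf::BASE` and override only the fields the
// benchmark needs, using struct update syntax.
pub const EXAMPLE_CONF: Conf = Conf {
    group_name: "example",
    dataset: "datasets/smol-songs.csv",
    queries: &["sunflower"],
    primary_key: Some("id"),
    ..Conf::BASE
};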
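
/// Wipes and recreates the database directory, applies the `Conf` settings in
/// a first write transaction, then indexes the dataset in a second one.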
pub fn base_setup(conf: &Conf) -> Index {
    match remove_dir_all(conf.database_name) {
        Ok(_) => (),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
        Err(e) => panic!("{}", e),
    }
    create_dir_all(conf.database_name).unwrap();

    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    options.max_readers(10);
    let index = Index::new(options, conf.database_name).unwrap();

    let config = IndexerConfig::default();
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = Settings::new(&mut wtxn, &index, &config);

    if let Some(primary_key) = conf.primary_key {
        builder.set_primary_key(primary_key.to_string());
    }

    if let Some(criterion) = conf.criterion {
        builder.reset_filterable_fields();
        builder.reset_criteria();
        builder.reset_stop_words();

        let criterion = criterion.iter().map(|s| Criterion::from_str(s).unwrap()).collect();
        builder.set_criteria(criterion);
    }

    (conf.configure)(&mut builder);

    builder.execute(|_| (), || false).unwrap();
    wtxn.commit().unwrap();
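
    // Index the documents: gather the document operations, resolve them into
    // document changes, then run the indexing pipeline in a fresh write
    // transaction.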
    let config = IndexerConfig::default();
    let mut wtxn = index.write_txn().unwrap();
    let rtxn = index.read_txn().unwrap();
    let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    let documents = documents_from(conf.dataset, conf.dataset_format);
    let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
    indexer.add_documents(&documents).unwrap();

    let indexer_alloc = Bump::new();
    let (document_changes, _operation_stats, primary_key) =
        indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();

    indexer::index(
        &mut wtxn,
        &index,
        config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        EmbeddingConfigs::default(),
        &|| false,
        &|_| (),
    )
    .unwrap();

    wtxn.commit().unwrap();
    drop(rtxn);

    index
}
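
/// Runs each configuration: sets up the index with `base_setup`, then creates
/// one Criterion benchmark group per `Conf` and one benchmark per query.
///
/// A sketch of typical wiring from a benchmark target (names hypothetical):
///
/// ```ignore
/// fn bench_songs(c: &mut criterion::Criterion) {
///     run_benches(c, &[EXAMPLE_CONF]);
/// }
/// criterion::criterion_group!(benches, bench_songs);
/// criterion::criterion_main!(benches);
/// ```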
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
    for conf in confs {
        let index = base_setup(conf);

        let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap();
        let name = format!("{}: {}", file_name, conf.group_name);
        let mut group = c.benchmark_group(&name);

        for &query in conf.queries {
            group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
                b.iter(|| {
                    let rtxn = index.read_txn().unwrap();
                    let mut search = index.search(&rtxn);
                    search.query(query).terms_matching_strategy(TermsMatchingStrategy::default());
                    if let Some(filter) = conf.filter {
                        let filter = Filter::from_str(filter).unwrap().unwrap();
                        search.filter(filter);
                    }
                    if let Some(sort) = &conf.sort {
                        let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect();
                        search.sort_criteria(sort);
                    }
                    let _ids = search.execute().unwrap();
                });
            });
        }
        group.finish();

        index.prepare_for_closing().wait();
    }
}
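
// Whatever the input format, the indexer is handed a memory map of the
// documents as a stream of JSON objects: JSONL files are mapped as-is, while
// JSON and CSV files are first rewritten into that shape in a temporary file.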
pub fn documents_from(filename: &str, filetype: &str) -> Mmap {
    let file = File::open(filename)
        .unwrap_or_else(|_| panic!("could not find the dataset in: {filename}"));
    match filetype {
        "csv" => documents_from_csv(file).unwrap(),
        "json" => documents_from_json(file).unwrap(),
        "jsonl" => documents_from_jsonl(file).unwrap(),
        otherwise => panic!("invalid update format {otherwise:?}"),
    }
}

fn documents_from_jsonl(file: File) -> anyhow::Result<Mmap> {
    unsafe { Mmap::map(&file).map_err(Into::into) }
}

fn documents_from_json(file: File) -> anyhow::Result<Mmap> {
    let reader = BufReader::new(file);
    let documents: Vec<milli::Object> = serde_json::from_reader(reader)?;
    let mut output = tempfile::tempfile().map(BufWriter::new)?;

    for document in documents {
        serde_json::to_writer(&mut output, &document)?;
    }

    let file = output.into_inner()?;
    unsafe { Mmap::map(&file).map_err(Into::into) }
}
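
// CSV headers may carry a type annotation (`name:string`, `name:number`,
// `name:boolean`); see `parse_csv_header`. Each record is converted to a JSON
// object, parsing values according to their declared type and mapping empty
// cells to `null`.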
fn documents_from_csv(file: File) -> anyhow::Result<Mmap> {
    let output = tempfile::tempfile()?;
    let mut output = BufWriter::new(output);
    let mut reader = csv::ReaderBuilder::new().from_reader(file);

    let headers = reader.headers().context("while retrieving headers")?.clone();
    let typed_fields: Vec<_> = headers.iter().map(parse_csv_header).collect();
    let mut object: serde_json::Map<_, _> =
        typed_fields.iter().map(|(k, _)| (k.to_string(), Value::Null)).collect();

    let mut line = 0;
    let mut record = csv::StringRecord::new();
    while reader.read_record(&mut record).context("while reading a record")? {
        // We increment here and not at the end of the loop
        // to take the header offset into account.
        line += 1;

        // Reset the document values
        object.iter_mut().for_each(|(_, v)| *v = Value::Null);

        for (i, (name, atype)) in typed_fields.iter().enumerate() {
            let value = &record[i];
            let trimmed_value = value.trim();
            let value = match atype {
                AllowedType::Number if trimmed_value.is_empty() => Value::Null,
                AllowedType::Number => match trimmed_value.parse::<i64>() {
                    Ok(integer) => Value::from(integer),
                    Err(_) => match trimmed_value.parse::<f64>() {
                        Ok(float) => Value::from(float),
                        Err(error) => {
                            anyhow::bail!(
                                "document format error on line {line}: {error}. For value: {value}"
                            )
                        }
                    },
                },
                AllowedType::Boolean if trimmed_value.is_empty() => Value::Null,
                AllowedType::Boolean => match trimmed_value.parse::<bool>() {
                    Ok(bool) => Value::from(bool),
                    Err(error) => {
                        anyhow::bail!(
                            "document format error on line {line}: {error}. For value: {value}"
                        )
                    }
                },
                AllowedType::String if value.is_empty() => Value::Null,
                AllowedType::String => Value::from(value),
            };

            *object.get_mut(name).expect("encountered an unknown field") = value;
        }

        serde_json::to_writer(&mut output, &object).context("while writing to disk")?;
    }

    let output = output.into_inner()?;
    unsafe { Mmap::map(&output).map_err(Into::into) }
}

enum AllowedType {
    String,
    Boolean,
    Number,
}
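
/// Splits a CSV header of the form `name:type` into the field name and its
/// `AllowedType`; for example, `"price:number"` yields `("price", Number)`.
/// Headers without a recognized `:type` suffix are kept whole as strings.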
fn parse_csv_header(header: &str) -> (String, AllowedType) {
    // If there are several separators, we only split on the last one.
    match header.rsplit_once(':') {
        Some((field_name, field_type)) => match field_type {
            "string" => (field_name.to_string(), AllowedType::String),
            "boolean" => (field_name.to_string(), AllowedType::Boolean),
            "number" => (field_name.to_string(), AllowedType::Number),
            // If the pattern isn't recognized, we keep the whole field.
            _otherwise => (header.to_string(), AllowedType::String),
        },
        None => (header.to_string(), AllowedType::String),
    }
}

struct CSVDocumentDeserializer<R>
where
    R: Read,
{
    documents: csv::StringRecordsIntoIter<R>,
    headers: Vec<(String, AllowedType)>,
}

impl<R: Read> CSVDocumentDeserializer<R> {
    fn from_reader(reader: R) -> io::Result<Self> {
        let mut records = csv::Reader::from_reader(reader);

        let headers = records.headers()?.into_iter().map(parse_csv_header).collect();

        Ok(Self { documents: records.into_records(), headers })
    }
}

impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
    type Item = anyhow::Result<Object>;

    fn next(&mut self) -> Option<Self::Item> {
        let csv_document = self.documents.next()?;

        match csv_document {
            Ok(csv_document) => {
                let mut document = Object::new();

                for ((field_name, field_type), value) in
                    self.headers.iter().zip(csv_document.into_iter())
                {
                    let parsed_value: anyhow::Result<Value> = match field_type {
                        AllowedType::Number => {
                            value.parse::<f64>().map(Value::from).map_err(Into::into)
                        }
                        AllowedType::Boolean => {
                            value.parse::<bool>().map(Value::from).map_err(Into::into)
                        }
                        AllowedType::String => Ok(Value::String(value.to_string())),
                    };

                    match parsed_value {
                        Ok(value) => drop(document.insert(field_name.to_string(), value)),
                        Err(e) => {
                            return Some(Err(anyhow::anyhow!(
                                "Value '{}' is invalid for its declared type: {}",
                                value,
                                e
                            )))
                        }
                    }
                }

                Some(Ok(document))
            }
            Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))),
        }
    }
}
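
// A couple of sanity checks, added for illustration, of the `name:type`
// header convention implemented by `parse_csv_header`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn typed_csv_headers() {
        assert!(matches!(parse_csv_header("price:number"), (name, AllowedType::Number) if name == "price"));
        assert!(matches!(parse_csv_header("sold:boolean"), (name, AllowedType::Boolean) if name == "sold"));
        // An unrecognized suffix keeps the whole header as a string field.
        assert!(matches!(parse_csv_header("title:misc"), (name, AllowedType::String) if name == "title:misc"));
    }
}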