mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Fix the benchmark tests
This commit is contained in:
parent
aba8a0e9e0
commit
3cf1352ae1
28
Cargo.lock
generated
28
Cargo.lock
generated
@ -494,11 +494,13 @@ name = "benchmarks"
|
|||||||
version = "1.11.0"
|
version = "1.11.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"bumpalo",
|
||||||
"bytes",
|
"bytes",
|
||||||
"convert_case 0.6.0",
|
"convert_case 0.6.0",
|
||||||
"criterion",
|
"criterion",
|
||||||
"csv",
|
"csv",
|
||||||
"flate2",
|
"flate2",
|
||||||
|
"memmap2",
|
||||||
"milli",
|
"milli",
|
||||||
"mimalloc",
|
"mimalloc",
|
||||||
"rand",
|
"rand",
|
||||||
@ -506,6 +508,7 @@ dependencies = [
|
|||||||
"reqwest",
|
"reqwest",
|
||||||
"roaring",
|
"roaring",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"tempfile",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1860,9 +1863,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fastrand"
|
name = "fastrand"
|
||||||
version = "2.1.0"
|
version = "2.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
|
checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "file-store"
|
name = "file-store"
|
||||||
@ -2869,9 +2872,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.155"
|
version = "0.2.164"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
|
checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libgit2-sys"
|
name = "libgit2-sys"
|
||||||
@ -3255,9 +3258,9 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "linux-raw-sys"
|
name = "linux-raw-sys"
|
||||||
version = "0.4.12"
|
version = "0.4.14"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456"
|
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "liquid"
|
name = "liquid"
|
||||||
@ -3591,9 +3594,9 @@ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memmap2"
|
name = "memmap2"
|
||||||
version = "0.9.4"
|
version = "0.9.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322"
|
checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"stable_deref_trait",
|
"stable_deref_trait",
|
||||||
@ -4801,9 +4804,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustix"
|
name = "rustix"
|
||||||
version = "0.38.31"
|
version = "0.38.41"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949"
|
checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.6.0",
|
"bitflags 2.6.0",
|
||||||
"errno",
|
"errno",
|
||||||
@ -5372,12 +5375,13 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tempfile"
|
name = "tempfile"
|
||||||
version = "3.10.1"
|
version = "3.14.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1"
|
checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"fastrand",
|
"fastrand",
|
||||||
|
"once_cell",
|
||||||
"rustix",
|
"rustix",
|
||||||
"windows-sys 0.52.0",
|
"windows-sys 0.52.0",
|
||||||
]
|
]
|
||||||
|
@ -12,10 +12,13 @@ license.workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = "1.0.86"
|
anyhow = "1.0.86"
|
||||||
|
bumpalo = "3.16.0"
|
||||||
csv = "1.3.0"
|
csv = "1.3.0"
|
||||||
|
memmap2 = "0.9.5"
|
||||||
milli = { path = "../milli" }
|
milli = { path = "../milli" }
|
||||||
mimalloc = { version = "0.1.43", default-features = false }
|
mimalloc = { version = "0.1.43", default-features = false }
|
||||||
serde_json = { version = "1.0.120", features = ["preserve_order"] }
|
serde_json = { version = "1.0.120", features = ["preserve_order"] }
|
||||||
|
tempfile = "3.14.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
criterion = { version = "0.5.1", features = ["html_reports"] }
|
criterion = { version = "0.5.1", features = ["html_reports"] }
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,17 +1,19 @@
|
|||||||
#![allow(dead_code)]
|
#![allow(dead_code)]
|
||||||
|
|
||||||
use std::fs::{create_dir_all, remove_dir_all, File};
|
use std::fs::{create_dir_all, remove_dir_all, File};
|
||||||
use std::io::{self, BufRead, BufReader, Cursor, Read, Seek};
|
use std::io::{self, BufReader, BufWriter, Read};
|
||||||
use std::num::ParseFloatError;
|
use std::num::ParseFloatError;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use bumpalo::Bump;
|
||||||
use criterion::BenchmarkId;
|
use criterion::BenchmarkId;
|
||||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use memmap2::Mmap;
|
||||||
use milli::heed::EnvOpenOptions;
|
use milli::heed::EnvOpenOptions;
|
||||||
use milli::update::{
|
use milli::update::new::indexer;
|
||||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
||||||
};
|
use milli::vector::EmbeddingConfigs;
|
||||||
use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy};
|
use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
@ -92,18 +94,34 @@ pub fn base_setup(conf: &Conf) -> Index {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let indexing_config = IndexDocumentsConfig {
|
let rtxn = index.read_txn().unwrap();
|
||||||
autogenerate_docids: conf.primary_key.is_none(),
|
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
let builder =
|
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
|
||||||
let documents = documents_from(conf.dataset, conf.dataset_format);
|
let documents = documents_from(conf.dataset, conf.dataset_format);
|
||||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
user_error.unwrap();
|
indexer.add_documents(&documents).unwrap();
|
||||||
builder.execute().unwrap();
|
|
||||||
|
let indexer_alloc = Bump::new();
|
||||||
|
let (document_changes, _operation_stats, primary_key) =
|
||||||
|
indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
|
||||||
|
|
||||||
|
indexer::index(
|
||||||
|
&mut wtxn,
|
||||||
|
&index,
|
||||||
|
config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
EmbeddingConfigs::default(),
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
drop(rtxn);
|
||||||
|
|
||||||
index
|
index
|
||||||
}
|
}
|
||||||
@ -141,48 +159,95 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn documents_from(filename: &str, filetype: &str) -> Mmap {
|
pub fn documents_from(filename: &str, filetype: &str) -> Mmap {
|
||||||
let reader = File::open(filename)
|
let file = File::open(filename)
|
||||||
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
|
.unwrap_or_else(|_| panic!("could not find the dataset in: {filename}"));
|
||||||
let reader = BufReader::new(reader);
|
match filetype {
|
||||||
let documents = match filetype {
|
"csv" => documents_from_csv(file).unwrap(),
|
||||||
"csv" => documents_from_csv(reader).unwrap(),
|
"json" => documents_from_json(file).unwrap(),
|
||||||
"json" => documents_from_json(reader).unwrap(),
|
"jsonl" => documents_from_jsonl(file).unwrap(),
|
||||||
"jsonl" => documents_from_jsonl(reader).unwrap(),
|
otherwise => panic!("invalid update format {otherwise:?}"),
|
||||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn documents_from_jsonl(file: File) -> anyhow::Result<Mmap> {
|
||||||
|
unsafe { Mmap::map(&file).map_err(Into::into) }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn documents_from_json(file: File) -> anyhow::Result<Mmap> {
|
||||||
|
let reader = BufReader::new(file);
|
||||||
|
let documents: Vec<milli::Object> = serde_json::from_reader(reader)?;
|
||||||
|
let mut output = tempfile::tempfile().map(BufWriter::new)?;
|
||||||
|
|
||||||
|
for document in documents {
|
||||||
|
serde_json::to_writer(&mut output, &document)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let file = output.into_inner()?;
|
||||||
|
unsafe { Mmap::map(&file).map_err(Into::into) }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn documents_from_csv(file: File) -> anyhow::Result<Mmap> {
|
||||||
|
let output = tempfile::tempfile()?;
|
||||||
|
let mut output = BufWriter::new(output);
|
||||||
|
let mut reader = csv::ReaderBuilder::new().from_reader(file);
|
||||||
|
|
||||||
|
let headers = reader.headers().context("while retrieving headers")?.clone();
|
||||||
|
let typed_fields: Vec<_> = headers.iter().map(parse_csv_header).collect();
|
||||||
|
let mut object: serde_json::Map<_, _> =
|
||||||
|
typed_fields.iter().map(|(k, _)| (k.to_string(), Value::Null)).collect();
|
||||||
|
|
||||||
|
let mut line = 0;
|
||||||
|
let mut record = csv::StringRecord::new();
|
||||||
|
while reader.read_record(&mut record).context("while reading a record")? {
|
||||||
|
// We increment here and not at the end of the loop
|
||||||
|
// to take the header offset into account.
|
||||||
|
line += 1;
|
||||||
|
|
||||||
|
// Reset the document values
|
||||||
|
object.iter_mut().for_each(|(_, v)| *v = Value::Null);
|
||||||
|
|
||||||
|
for (i, (name, atype)) in typed_fields.iter().enumerate() {
|
||||||
|
let value = &record[i];
|
||||||
|
let trimmed_value = value.trim();
|
||||||
|
let value = match atype {
|
||||||
|
AllowedType::Number if trimmed_value.is_empty() => Value::Null,
|
||||||
|
AllowedType::Number => {
|
||||||
|
match trimmed_value.parse::<i64>() {
|
||||||
|
Ok(integer) => Value::from(integer),
|
||||||
|
Err(_) => match trimmed_value.parse::<f64>() {
|
||||||
|
Ok(float) => Value::from(float),
|
||||||
|
Err(error) => {
|
||||||
|
anyhow::bail!("document format error on line {line}: {error}. For value: {value}")
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
AllowedType::Boolean if trimmed_value.is_empty() => Value::Null,
|
||||||
|
AllowedType::Boolean => match trimmed_value.parse::<bool>() {
|
||||||
|
Ok(bool) => Value::from(bool),
|
||||||
|
Err(error) => {
|
||||||
|
anyhow::bail!(
|
||||||
|
"document format error on line {line}: {error}. For value: {value}"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
AllowedType::String if value.is_empty() => Value::Null,
|
||||||
|
AllowedType::String => Value::from(value),
|
||||||
};
|
};
|
||||||
DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
|
||||||
|
*object.get_mut(name).expect("encountered an unknown field") = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
serde_json::to_writer(&mut output, &object).context("while writing to disk")?;
|
||||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
|
|
||||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
|
||||||
let object = result?;
|
|
||||||
documents.append_json_object(&object)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
documents.into_inner().map_err(Into::into)
|
let output = output.into_inner()?;
|
||||||
}
|
unsafe { Mmap::map(&output).map_err(Into::into) }
|
||||||
|
|
||||||
fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
|
||||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
|
|
||||||
documents.append_json_array(reader)?;
|
|
||||||
|
|
||||||
documents.into_inner().map_err(Into::into)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
|
||||||
let csv = csv::Reader::from_reader(reader);
|
|
||||||
|
|
||||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
documents.append_csv(csv)?;
|
|
||||||
|
|
||||||
documents.into_inner().map_err(Into::into)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
enum AllowedType {
|
enum AllowedType {
|
||||||
String,
|
String,
|
||||||
|
Boolean,
|
||||||
Number,
|
Number,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -191,8 +256,9 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) {
|
|||||||
match header.rsplit_once(':') {
|
match header.rsplit_once(':') {
|
||||||
Some((field_name, field_type)) => match field_type {
|
Some((field_name, field_type)) => match field_type {
|
||||||
"string" => (field_name.to_string(), AllowedType::String),
|
"string" => (field_name.to_string(), AllowedType::String),
|
||||||
|
"boolean" => (field_name.to_string(), AllowedType::Boolean),
|
||||||
"number" => (field_name.to_string(), AllowedType::Number),
|
"number" => (field_name.to_string(), AllowedType::Number),
|
||||||
// we may return an error in this case.
|
// if the pattern isn't recognized, we keep the whole field.
|
||||||
_otherwise => (header.to_string(), AllowedType::String),
|
_otherwise => (header.to_string(), AllowedType::String),
|
||||||
},
|
},
|
||||||
None => (header.to_string(), AllowedType::String),
|
None => (header.to_string(), AllowedType::String),
|
||||||
@ -230,10 +296,13 @@ impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
|
|||||||
for ((field_name, field_type), value) in
|
for ((field_name, field_type), value) in
|
||||||
self.headers.iter().zip(csv_document.into_iter())
|
self.headers.iter().zip(csv_document.into_iter())
|
||||||
{
|
{
|
||||||
let parsed_value: Result<Value, ParseFloatError> = match field_type {
|
let parsed_value: anyhow::Result<Value> = match field_type {
|
||||||
AllowedType::Number => {
|
AllowedType::Number => {
|
||||||
value.parse::<f64>().map(Value::from).map_err(Into::into)
|
value.parse::<f64>().map(Value::from).map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
AllowedType::Boolean => {
|
||||||
|
value.parse::<bool>().map(Value::from).map_err(Into::into)
|
||||||
|
}
|
||||||
AllowedType::String => Ok(Value::String(value.to_string())),
|
AllowedType::String => Ok(Value::String(value.to_string())),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ use crate::{DocumentId, Result};
|
|||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct DocumentDeletion {
|
pub struct DocumentDeletion {
|
||||||
pub to_delete: RoaringBitmap,
|
to_delete: RoaringBitmap,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DocumentDeletion {
|
impl DocumentDeletion {
|
||||||
@ -26,11 +26,11 @@ impl DocumentDeletion {
|
|||||||
|
|
||||||
pub fn into_changes<'indexer>(
|
pub fn into_changes<'indexer>(
|
||||||
self,
|
self,
|
||||||
indexer: &'indexer Bump,
|
indexer_alloc: &'indexer Bump,
|
||||||
primary_key: PrimaryKey<'indexer>,
|
primary_key: PrimaryKey<'indexer>,
|
||||||
) -> DocumentDeletionChanges<'indexer> {
|
) -> DocumentDeletionChanges<'indexer> {
|
||||||
let to_delete: bumpalo::collections::Vec<_> =
|
let to_delete: bumpalo::collections::Vec<_> =
|
||||||
self.to_delete.into_iter().collect_in(indexer);
|
self.to_delete.into_iter().collect_in(indexer_alloc);
|
||||||
|
|
||||||
let to_delete = to_delete.into_bump_slice();
|
let to_delete = to_delete.into_bump_slice();
|
||||||
|
|
||||||
|
@ -107,6 +107,12 @@ impl<'pl> DocumentOperation<'pl> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Default for DocumentOperation<'_> {
|
||||||
|
fn default() -> Self {
|
||||||
|
DocumentOperation::new(IndexDocumentsMethod::default())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
fn extract_addition_payload_changes<'r, 'pl: 'r>(
|
fn extract_addition_payload_changes<'r, 'pl: 'r>(
|
||||||
indexer: &'pl Bump,
|
indexer: &'pl Bump,
|
||||||
|
Loading…
Reference in New Issue
Block a user