diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b5335d799..7338d134b 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -33,7 +33,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: check - args: --all + args: --workspace --all-targets - name: Run cargo test uses: actions-rs/cargo@v1 with: diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b598f2f6f..99a36b740 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -6,6 +6,9 @@ publish = false [dependencies] milli = { path = "../milli" } +anyhow = "1.0" +serde_json = { version = "1.0.62", features = ["preserve_order"] } +csv = "1.1.6" [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 30532aef8..66ecc7154 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -1,11 +1,12 @@ mod datasets_paths; +mod utils; -use std::fs::{create_dir_all, remove_dir_all, File}; +use std::fs::{create_dir_all, remove_dir_all}; use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; use heed::EnvOpenOptions; -use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; +use milli::update::UpdateBuilder; use milli::Index; #[cfg(target_os = "linux")] @@ -67,15 +68,10 @@ fn indexing_songs_default(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_SONGS - )); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -118,15 +114,10 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_SONGS - )); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -165,15 +156,10 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_SONGS - )); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -211,15 +197,10 @@ fn indexing_wiki(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_SONGS - )); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -262,13 +243,10 @@ fn indexing_movies_default(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Json); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::MOVIES) - .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES)); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -316,15 +294,11 @@ fn indexing_geo(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); + + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); + builder.execute(documents, |_, _| ()).unwrap(); - builder.update_format(UpdateFormat::JsonStream); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_ALL_COUNTRIES - )); - builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); diff --git a/benchmarks/benches/search_geo.rs b/benchmarks/benches/search_geo.rs index 1432f691b..84448c32d 100644 --- a/benchmarks/benches/search_geo.rs +++ b/benchmarks/benches/search_geo.rs @@ -2,7 +2,7 @@ mod datasets_paths; mod utils; use criterion::{criterion_group, criterion_main}; -use milli::update::{Settings, UpdateFormat}; +use milli::update::Settings; use utils::Conf; #[cfg(target_os = "linux")] @@ -33,7 +33,7 @@ fn base_conf(builder: &mut Settings) { #[rustfmt::skip] const BASE_CONF: Conf = Conf { dataset: datasets_paths::SMOL_ALL_COUNTRIES, - dataset_format: UpdateFormat::JsonStream, + dataset_format: "jsonl", queries: &[ "", ], diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 72eac59d9..e5bdbdfaa 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,10 +1,15 @@ +#![allow(dead_code)] + use std::fs::{create_dir_all, remove_dir_all, File}; +use std::io::{self, Cursor, Read, Seek}; use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; -use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}; +use milli::documents::DocumentBatchReader; +use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder}; use milli::{FilterCondition, Index}; +use serde_json::{Map, Value}; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory @@ -13,7 +18,7 @@ pub struct Conf<'a> { /// the dataset to be used, it must be an uncompressed csv pub dataset: &'a str, /// The format of the dataset - pub dataset_format: UpdateFormat, + pub dataset_format: &'a str, pub group_name: &'a str, pub queries: &'a [&'a str], /// here you can change which criterion are used and in which order. @@ -33,7 +38,7 @@ pub struct Conf<'a> { impl Conf<'_> { pub const BASE: Self = Conf { database_name: "benches.mmdb", - dataset_format: UpdateFormat::Csv, + dataset_format: "csv", dataset: "", group_name: "", queries: &[], @@ -87,11 +92,10 @@ pub fn base_setup(conf: &Conf) -> Index { if let None = conf.primary_key { builder.enable_autogenerate_docids(); } - builder.update_format(conf.dataset_format); + let documents = documents_from(conf.dataset, conf.dataset_format); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(conf.dataset) - .expect(&format!("could not find the dataset in: {}", conf.dataset)); - builder.execute(reader, |_, _| ()).unwrap(); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index @@ -128,3 +132,58 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { index.prepare_for_closing().wait(); } } + +pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader { + let reader = + File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); + let documents = match filetype { + "csv" => documents_from_csv(reader).unwrap(), + "json" => documents_from_json(reader).unwrap(), + "jsonl" => documents_from_jsonl(reader).unwrap(), + otherwise => panic!("invalid update format {:?}", otherwise), + }; + DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap() +} + +fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { + let mut writer = Cursor::new(Vec::new()); + let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + + let values = serde_json::Deserializer::from_reader(reader) + .into_iter::>(); + for document in values { + let document = document?; + documents.add_documents(document)?; + } + documents.finish()?; + + Ok(writer.into_inner()) +} + +fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { + let mut writer = Cursor::new(Vec::new()); + let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + + let json: serde_json::Value = serde_json::from_reader(reader)?; + documents.add_documents(json)?; + documents.finish()?; + + Ok(writer.into_inner()) +} + +fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { + let mut writer = Cursor::new(Vec::new()); + let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + + let mut records = csv::Reader::from_reader(reader); + let iter = records.deserialize::>(); + + for doc in iter { + let doc = doc?; + documents.add_documents(doc)?; + } + + documents.finish()?; + + Ok(writer.into_inner()) +}