561: Enriched documents batch reader r=curquiza a=Kerollmops

~This PR is based on #555 and must be rebased on main after it has been merged to ease the review.~
This PR contains the work in #555 and can be merged on main as soon as reviewed and approved.

- [x] Create an `EnrichedDocumentsBatchReader` that contains the external documents id.
- [x] Extract the primary key name and make it accessible in the `EnrichedDocumentsBatchReader`.
- [x] Use the external id from the `EnrichedDocumentsBatchReader` in the `Transform::read_documents`.
- [x] Remove the `update_primary_key` from the _transform.rs_ file.
- [x] Really generate the auto-generated documents ids.
- [x] Insert the (auto-generated) document ids in the document while processing it in `Transform::read_documents`.

Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
bors[bot] 2022-07-21 07:08:50 +00:00 committed by GitHub
commit 941af58239
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
29 changed files with 1668 additions and 1361 deletions

View file

@ -132,12 +132,13 @@ fn indexing_songs_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -169,12 +170,13 @@ fn reindexing_songs_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -184,12 +186,13 @@ fn reindexing_songs_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -223,11 +226,12 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -279,11 +283,12 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -294,19 +299,21 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -339,13 +346,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -377,12 +385,13 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -415,12 +424,13 @@ fn indexing_wiki(c: &mut Criterion) {
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -452,12 +462,13 @@ fn reindexing_wiki(c: &mut Criterion) {
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -468,12 +479,13 @@ fn reindexing_wiki(c: &mut Criterion) {
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -507,11 +519,12 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
let mut wtxn = index.write_txn().unwrap();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -564,12 +577,13 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents =
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -581,24 +595,26 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents =
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents =
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -631,12 +647,13 @@ fn indexing_movies_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -667,12 +684,13 @@ fn reindexing_movies_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -682,12 +700,13 @@ fn reindexing_movies_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -720,11 +739,12 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -775,12 +795,13 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
// as we don't care about the time it takes.
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -791,21 +812,23 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -861,12 +884,13 @@ fn indexing_nested_movies_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -922,11 +946,12 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -984,12 +1009,13 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -1021,12 +1047,13 @@ fn indexing_geo(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -1058,12 +1085,13 @@ fn reindexing_geo(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -1074,12 +1102,13 @@ fn reindexing_geo(c: &mut Criterion) {
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut wtxn = index.write_txn().unwrap();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
builder.add_documents(documents).unwrap();
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@ -1113,11 +1142,12 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
let config = IndexerConfig::default();
let mut wtxn = index.write_txn().unwrap();
let indexing_config = IndexDocumentsConfig::default();
let mut builder =
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())
.unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json");
builder.add_documents(documents).unwrap();
let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
let (builder, user_error) = builder.add_documents(documents).unwrap();
user_error.unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();