From 4391cba6ca10a60cce5be068f61634ee02e7a3aa Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 17 May 2023 18:19:43 +0200 Subject: [PATCH] fix the addition + deletion bug --- Cargo.lock | 22 +++ .../after_processing_the_batch.snap | 43 +++++ .../documents.snap | 9 + .../registered_the_first_task.snap | 37 ++++ .../registered_the_second_task.snap | 40 ++++ .../after_failing_the_deletion.snap | 43 +++++ .../after_last_successful_addition.snap | 46 +++++ .../documents.snap | 17 ++ .../registered_the_first_task.snap | 36 ++++ .../registered_the_second_task.snap | 40 ++++ milli/Cargo.toml | 4 +- milli/src/documents/mod.rs | 2 - milli/src/update/index_documents/mod.rs | 176 +++++++++++++++++- milli/src/update/index_documents/transform.rs | 21 +-- 14 files changed, 518 insertions(+), 18 deletions(-) create mode 100644 index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap diff --git a/Cargo.lock b/Cargo.lock index 5f192b6d1..a432908a2 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -359,6 +359,15 @@ dependencies = [ "backtrace", ] +[[package]] +name = "arbitrary" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d098ff73c1ca148721f37baad5ea6a465a13f9573aba8641fbbbae8164a54e" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "assert-json-diff" version = "2.0.2" @@ -1096,6 +1105,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_arbitrary" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cdeb9ec472d588e539a818b2dee436825730da08ad0017c4b1a17676bdc8b7" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "derive_builder" version = "0.12.0" @@ -2711,6 +2731,7 @@ dependencies = [ name = "milli" version = "1.2.0" dependencies = [ + "arbitrary", "big_s", "bimap", "bincode", @@ -2722,6 +2743,7 @@ dependencies = [ "csv", "deserr", "either", + "fastrand", "filter-parser", "flatten-serde-json", "fst", diff --git a/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap new file mode 100644 index 000000000..b27288a0f --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap @@ -0,0 +1,43 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} +1 {uid: 1, status: succeeded, details: { 
received_document_ids: 2, deleted_documents: Some(2) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,1,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [0,] +"documentDeletion" [1,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 1, field_distribution: {"doggo": 1, "id": 1} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,1,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,1,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap new file mode 100644 index 000000000..2b56b71d1 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap @@ -0,0 +1,9 @@ +--- +source: index-scheduler/src/lib.rs +--- +[ + { + "id": 3, + "doggo": "bork" + } +] diff --git a/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap new file mode 100644 index 
000000000..d26e62bff --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap @@ -0,0 +1,37 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [0,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### File Store: +00000000-0000-0000-0000-000000000000 + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap new file mode 100644 index 000000000..e0f371120 --- /dev/null +++ 
b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap @@ -0,0 +1,40 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} +1 {uid: 1, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }} +---------------------------------------------------------------------- +### Status: +enqueued [0,1,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [0,] +"documentDeletion" [1,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### File Store: +00000000-0000-0000-0000-000000000000 + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap 
b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap new file mode 100644 index 000000000..1d4aa24e2 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap @@ -0,0 +1,43 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }} +1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [1,] +failed [0,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,] +"documentDeletion" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] 
+---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: +00000000-0000-0000-0000-000000000000 + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap new file mode 100644 index 000000000..0f9dfd3e6 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap @@ -0,0 +1,46 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }} +1 {uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [1,] +failed [0,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,] +"documentDeletion" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] 
+---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 3, field_distribution: {"catto": 1, "doggo": 2, "id": 3} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap new file mode 100644 index 000000000..8204d059b --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap @@ -0,0 +1,17 @@ +--- +source: index-scheduler/src/lib.rs +--- +[ + { + "id": 1, + "doggo": "jean bob" + }, + { + "id": 2, + "catto": "jorts" + }, + { + "id": 3, + "doggo": "bork" + } +] diff --git a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap new file mode 100644 index 000000000..5753db7e6 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap @@ -0,0 +1,36 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, 
status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }} +---------------------------------------------------------------------- +### Status: +enqueued [0,] +---------------------------------------------------------------------- +### Kind: +"documentDeletion" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- + diff --git a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap new file mode 100644 index 000000000..0b6191f9e --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap @@ -0,0 +1,40 @@ +--- +source: index-scheduler/src/lib.rs +--- +### Autobatching Enabled = true +### Processing Tasks: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }} +1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: 
DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [0,1,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [1,] +"documentDeletion" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### File Store: +00000000-0000-0000-0000-000000000000 + +---------------------------------------------------------------------- + diff --git a/milli/Cargo.toml b/milli/Cargo.toml index de0f4e31d..ea48e008c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -56,6 +56,7 @@ itertools = "0.10.5" log = "0.4.17" logging_timer = "1.1.0" csv = "1.2.1" +fastrand = "1.9.0" [dev-dependencies] mimalloc = { version = "0.1.29", default-features = false } @@ -64,12 +65,13 @@ insta = "1.29.0" maplit = "1.0.2" md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } +arbitrary = { version = "1.3.0", features = ["derive"] } [target.'cfg(fuzzing)'.dev-dependencies] fuzzcheck = "0.12.1" [features] -all-tokenizations = [ "charabia/default" ] +all-tokenizations = ["charabia/default"] # Use POSIX semaphores instead of SysV semaphores in LMDB # For more information on this feature, 
see heed's Cargo.toml diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 43b31187d..7c037b3bf 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -111,7 +111,6 @@ pub enum Error { Io(#[from] io::Error), } -#[cfg(test)] pub fn objects_from_json_value(json: serde_json::Value) -> Vec { let documents = match json { object @ serde_json::Value::Object(_) => vec![object], @@ -141,7 +140,6 @@ macro_rules! documents { }}; } -#[cfg(test)] pub fn documents_batch_reader_from_objects( objects: impl IntoIterator, ) -> DocumentsBatchReader>> { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index bbfa1d00c..406bfb0c9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -198,6 +198,7 @@ where let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); } + let output = self .transform .take() @@ -220,6 +221,7 @@ where } let indexed_documents = output.documents_count as u64; + let number_of_documents = self.execute_raw(output)?; Ok(DocumentAdditionResult { indexed_documents, number_of_documents }) @@ -236,7 +238,7 @@ where primary_key, fields_ids_map, field_distribution, - mut external_documents_ids, + new_external_documents_ids, new_documents_ids, replaced_documents_ids, documents_count, @@ -363,9 +365,6 @@ where deletion_builder.delete_documents(&replaced_documents_ids); let deleted_documents_result = deletion_builder.execute_inner()?; debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); - if !deleted_documents_result.soft_deletion_used { - external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - } } let index_documents_ids = self.index.documents_ids(self.wtxn)?; @@ -445,6 +444,9 @@ where self.index.put_primary_key(self.wtxn, &primary_key)?; // We write the external documents ids into the main database. 
+ let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?; + external_documents_ids.insert_ids(&new_external_documents_ids)?; + let external_documents_ids = external_documents_ids.into_static(); self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; let all_documents_ids = index_documents_ids | new_documents_ids; @@ -2515,4 +2517,170 @@ mod tests { db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1"); } + + #[test] + fn reproduce_the_bug() { + /* + [milli/examples/fuzz.rs:69] &batches = [ + Batch( + [ + AddDoc( + { "id": 1, "doggo": "bernese" }, => internal 0 + ), + ], + ), + Batch( + [ + DeleteDoc( + 1, => delete internal 0 + ), + AddDoc( + { "id": 0, "catto": "jorts" }, => internal 1 + ), + ], + ), + Batch( + [ + AddDoc( + { "id": 1, "catto": "jorts" }, => internal 2 + ), + ], + ), + ] + */ + let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; + + // START OF BATCH + + println!("--- ENTERING BATCH 1"); + + let mut wtxn = index.write_txn().unwrap(); + + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &index.indexer_config, + index.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + + // OP + + let documents = documents!([ + { "id": 1, "doggo": "bernese" }, + ]); + let (builder, added) = builder.add_documents(documents).unwrap(); + insta::assert_display_snapshot!(added.unwrap(), @"1"); + + // FINISHING + let addition = builder.execute().unwrap(); + insta::assert_debug_snapshot!(addition, @r###" + DocumentAdditionResult { + indexed_documents: 1, + number_of_documents: 1, + } + "###); + wtxn.commit().unwrap(); + + db_snap!(index, documents, @r###" + {"id":1,"doggo":"bernese"} + "###); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 0 + "###); + + // A first batch of documents has been inserted + + // BATCH 
2 + + println!("--- ENTERING BATCH 2"); + + let mut wtxn = index.write_txn().unwrap(); + + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &index.indexer_config, + index.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + + let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap(); + insta::assert_display_snapshot!(removed.unwrap(), @"1"); + + let documents = documents!([ + { "id": 0, "catto": "jorts" }, + ]); + let (builder, added) = builder.add_documents(documents).unwrap(); + insta::assert_display_snapshot!(added.unwrap(), @"1"); + + let addition = builder.execute().unwrap(); + insta::assert_debug_snapshot!(addition, @r###" + DocumentAdditionResult { + indexed_documents: 1, + number_of_documents: 1, + } + "###); + wtxn.commit().unwrap(); + + db_snap!(index, documents, @r###" + {"id":0,"catto":"jorts"} + "###); + + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 0 1 + "###); + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + + // BATCH 3 + + println!("--- ENTERING BATCH 3"); + + let mut wtxn = index.write_txn().unwrap(); + + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &index.indexer_config, + index.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + + let documents = documents!([ + { "id": 1, "catto": "jorts" }, + ]); + let (builder, added) = builder.add_documents(documents).unwrap(); + insta::assert_display_snapshot!(added.unwrap(), @"1"); + + let addition = builder.execute().unwrap(); + insta::assert_debug_snapshot!(addition, @r###" + DocumentAdditionResult { + indexed_documents: 1, + number_of_documents: 2, + } + "###); + wtxn.commit().unwrap(); + + db_snap!(index, documents, @r###" + {"id":1,"catto":"jorts"} + {"id":0,"catto":"jorts"} + "###); + + // Ensuring all the returned IDs actually exist + let rtxn = index.read_txn().unwrap(); + let res = index.search(&rtxn).execute().unwrap(); + index.documents(&rtxn, res.documents_ids).unwrap(); + } } diff
--git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 6097278a7..e2a260391 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -21,15 +21,14 @@ use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ - ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, - Result, BEU32, + FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, }; pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, pub field_distribution: FieldDistribution, - pub external_documents_ids: ExternalDocumentsIds<'static>, + pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, pub documents_count: usize, @@ -58,8 +57,8 @@ pub struct Transform<'a, 'i> { original_sorter: grenad::Sorter, flattened_sorter: grenad::Sorter, - replaced_documents_ids: RoaringBitmap, - new_documents_ids: RoaringBitmap, + pub replaced_documents_ids: RoaringBitmap, + pub new_documents_ids: RoaringBitmap, // To increase the cache locality and decrease the heap usage we use compact smartstring. new_external_documents_ids_builder: FxHashMap, u64>, documents_count: usize, @@ -568,8 +567,6 @@ impl<'a, 'i> Transform<'a, 'i> { }))? .to_string(); - let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; - // We create a final writer to write the new documents in order from the sorter.
let mut writer = create_writer( self.indexer_settings.chunk_compression_type, @@ -651,13 +648,14 @@ impl<'a, 'i> Transform<'a, 'i> { fst_new_external_documents_ids_builder.insert(key, value) })?; let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); - external_documents_ids.insert_ids(&new_external_documents_ids)?; Ok(TransformOutput { primary_key, fields_ids_map: self.fields_ids_map, field_distribution, - external_documents_ids: external_documents_ids.into_static(), + new_external_documents_ids: new_external_documents_ids + .map_data(|c| Cow::Owned(c)) + .unwrap(), new_documents_ids: self.new_documents_ids, replaced_documents_ids: self.replaced_documents_ids, documents_count: self.documents_count, @@ -691,7 +689,8 @@ impl<'a, 'i> Transform<'a, 'i> { let new_external_documents_ids = { let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - external_documents_ids + // it is safe to get the hard document IDs + external_documents_ids.into_static().hard }; let documents_ids = self.index.documents_ids(wtxn)?; @@ -776,7 +775,7 @@ impl<'a, 'i> Transform<'a, 'i> { primary_key, fields_ids_map: new_fields_ids_map, field_distribution, - external_documents_ids: new_external_documents_ids.into_static(), + new_external_documents_ids, new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), documents_count,