fix the addition + deletion bug

This commit is contained in:
Tamo 2023-05-17 18:19:43 +02:00
parent d7ddf4925e
commit 4391cba6ca
No known key found for this signature in database
GPG Key ID: 20CD8020AFA88D69
14 changed files with 518 additions and 18 deletions

22
Cargo.lock generated
View File

@ -359,6 +359,15 @@ dependencies = [
"backtrace",
]
[[package]]
name = "arbitrary"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2d098ff73c1ca148721f37baad5ea6a465a13f9573aba8641fbbbae8164a54e"
dependencies = [
"derive_arbitrary",
]
[[package]]
name = "assert-json-diff"
version = "2.0.2"
@ -1096,6 +1105,17 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "derive_arbitrary"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3cdeb9ec472d588e539a818b2dee436825730da08ad0017c4b1a17676bdc8b7"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "derive_builder"
version = "0.12.0"
@ -2711,6 +2731,7 @@ dependencies = [
name = "milli"
version = "1.2.0"
dependencies = [
"arbitrary",
"big_s",
"bimap",
"bincode",
@ -2722,6 +2743,7 @@ dependencies = [
"csv",
"deserr",
"either",
"fastrand",
"filter-parser",
"flatten-serde-json",
"fst",

View File

@ -0,0 +1,43 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
1 {uid: 1, status: succeeded, details: { received_document_ids: 2, deleted_documents: Some(2) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued []
succeeded [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
"documentDeletion" [1,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
doggos: { number_of_documents: 1, field_distribution: {"doggo": 1, "id": 1} }
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,1,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,1,]
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -0,0 +1,9 @@
---
source: index-scheduler/src/lib.rs
---
[
{
"id": 3,
"doggo": "bork"
}
]

View File

@ -0,0 +1,37 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -0,0 +1,40 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
1 {uid: 1, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
"documentDeletion" [1,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -0,0 +1,43 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [1,]
failed [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,]
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -0,0 +1,46 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued []
succeeded [1,]
failed [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
doggos: { number_of_documents: 3, field_distribution: {"catto": 1, "doggo": 2, "id": 3} }
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -0,0 +1,17 @@
---
source: index-scheduler/src/lib.rs
---
[
{
"id": 1,
"doggo": "jean bob"
},
{
"id": 2,
"catto": "jorts"
},
{
"id": 3,
"doggo": "bork"
}
]

View File

@ -0,0 +1,36 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued [0,]
----------------------------------------------------------------------
### Kind:
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -0,0 +1,40 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -56,6 +56,7 @@ itertools = "0.10.5"
log = "0.4.17"
logging_timer = "1.1.0"
csv = "1.2.1"
fastrand = "1.9.0"
[dev-dependencies]
mimalloc = { version = "0.1.29", default-features = false }
@ -64,12 +65,13 @@ insta = "1.29.0"
maplit = "1.0.2"
md5 = "0.7.0"
rand = {version = "0.8.5", features = ["small_rng"] }
arbitrary = { version = "1.3.0", features = ["derive"] }
[target.'cfg(fuzzing)'.dev-dependencies]
fuzzcheck = "0.12.1"
[features]
all-tokenizations = [ "charabia/default" ]
all-tokenizations = ["charabia/default"]
# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml

View File

@ -111,7 +111,6 @@ pub enum Error {
Io(#[from] io::Error),
}
#[cfg(test)]
pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
let documents = match json {
object @ serde_json::Value::Object(_) => vec![object],
@ -141,7 +140,6 @@ macro_rules! documents {
}};
}
#[cfg(test)]
pub fn documents_batch_reader_from_objects(
objects: impl IntoIterator<Item = Object>,
) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {

View File

@ -198,6 +198,7 @@ where
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
}
let output = self
.transform
.take()
@ -220,6 +221,7 @@ where
}
let indexed_documents = output.documents_count as u64;
let number_of_documents = self.execute_raw(output)?;
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
@ -236,7 +238,7 @@ where
primary_key,
fields_ids_map,
field_distribution,
mut external_documents_ids,
new_external_documents_ids,
new_documents_ids,
replaced_documents_ids,
documents_count,
@ -363,9 +365,6 @@ where
deletion_builder.delete_documents(&replaced_documents_ids);
let deleted_documents_result = deletion_builder.execute_inner()?;
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
if !deleted_documents_result.soft_deletion_used {
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
}
}
let index_documents_ids = self.index.documents_ids(self.wtxn)?;
@ -445,6 +444,9 @@ where
self.index.put_primary_key(self.wtxn, &primary_key)?;
// We write the external documents ids into the main database.
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
external_documents_ids.insert_ids(&new_external_documents_ids)?;
let external_documents_ids = external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
let all_documents_ids = index_documents_ids | new_documents_ids;
@ -2515,4 +2517,170 @@ mod tests {
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
}
#[test]
fn reproduce_the_bug() {
/*
[milli/examples/fuzz.rs:69] &batches = [
Batch(
[
AddDoc(
{ "id": 1, "doggo": "bernese" }, => internal 0
),
],
),
Batch(
[
DeleteDoc(
1, => delete internal 0
),
AddDoc(
{ "id": 0, "catto": "jorts" }, => internal 1
),
],
),
Batch(
[
AddDoc(
{ "id": 1, "catto": "jorts" }, => internal 2
),
],
),
]
*/
let mut index = TempIndex::new();
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
// START OF BATCH
println!("--- ENTERING BATCH 1");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
// OP
let documents = documents!([
{ "id": 1, "doggo": "bernese" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
// FINISHING
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"doggo":"bernese"}
"###);
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
1 0
"###);
// A first batch of documents has been inserted
// BATCH 2
println!("--- ENTERING BATCH 2");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"1");
let documents = documents!([
{ "id": 0, "catto": "jorts" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":0,"catto":"jorts"}
"###);
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
0 1
"###);
db_snap!(index, soft_deleted_documents_ids, @"[]");
// BATCH 3
println!("--- ENTERING BATCH 3");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let documents = documents!([
{ "id": 1, "catto": "jorts" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 2,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"catto":"jorts"}
{"id":0,"catto":"jorts"}
"###);
// Ensuring all the returned IDs actually exists
let rtxn = index.read_txn().unwrap();
let res = index.search(&rtxn).execute().unwrap();
index.documents(&rtxn, res.documents_ids).unwrap();
}
}

View File

@ -21,15 +21,14 @@ use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key};
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
Result, BEU32,
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
};
pub struct TransformOutput {
pub primary_key: String,
pub fields_ids_map: FieldsIdsMap,
pub field_distribution: FieldDistribution,
pub external_documents_ids: ExternalDocumentsIds<'static>,
pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>,
pub new_documents_ids: RoaringBitmap,
pub replaced_documents_ids: RoaringBitmap,
pub documents_count: usize,
@ -58,8 +57,8 @@ pub struct Transform<'a, 'i> {
original_sorter: grenad::Sorter<MergeFn>,
flattened_sorter: grenad::Sorter<MergeFn>,
replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
pub replaced_documents_ids: RoaringBitmap,
pub new_documents_ids: RoaringBitmap,
// To increase the cache locality and decrease the heap usage we use compact smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize,
@ -568,8 +567,6 @@ impl<'a, 'i> Transform<'a, 'i> {
}))?
.to_string();
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
// We create a final writer to write the new documents in order from the sorter.
let mut writer = create_writer(
self.indexer_settings.chunk_compression_type,
@ -651,13 +648,14 @@ impl<'a, 'i> Transform<'a, 'i> {
fst_new_external_documents_ids_builder.insert(key, value)
})?;
let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
external_documents_ids.insert_ids(&new_external_documents_ids)?;
Ok(TransformOutput {
primary_key,
fields_ids_map: self.fields_ids_map,
field_distribution,
external_documents_ids: external_documents_ids.into_static(),
new_external_documents_ids: new_external_documents_ids
.map_data(|c| Cow::Owned(c))
.unwrap(),
new_documents_ids: self.new_documents_ids,
replaced_documents_ids: self.replaced_documents_ids,
documents_count: self.documents_count,
@ -691,7 +689,8 @@ impl<'a, 'i> Transform<'a, 'i> {
let new_external_documents_ids = {
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
external_documents_ids
// it is safe to get the hard document IDs
external_documents_ids.into_static().hard
};
let documents_ids = self.index.documents_ids(wtxn)?;
@ -776,7 +775,7 @@ impl<'a, 'i> Transform<'a, 'i> {
primary_key,
fields_ids_map: new_fields_ids_map,
field_distribution,
external_documents_ids: new_external_documents_ids.into_static(),
new_external_documents_ids,
new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(),
documents_count,