fix the addition + deletion bug

This commit is contained in:
Tamo 2023-05-17 18:19:43 +02:00
parent d7ddf4925e
commit 4391cba6ca
No known key found for this signature in database
GPG key ID: 20CD8020AFA88D69
14 changed files with 518 additions and 18 deletions

View file

@ -111,7 +111,6 @@ pub enum Error {
Io(#[from] io::Error),
}
#[cfg(test)]
pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
let documents = match json {
object @ serde_json::Value::Object(_) => vec![object],
@ -141,7 +140,6 @@ macro_rules! documents {
}};
}
#[cfg(test)]
pub fn documents_batch_reader_from_objects(
objects: impl IntoIterator<Item = Object>,
) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {

View file

@ -198,6 +198,7 @@ where
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
}
let output = self
.transform
.take()
@ -220,6 +221,7 @@ where
}
let indexed_documents = output.documents_count as u64;
let number_of_documents = self.execute_raw(output)?;
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
@ -236,7 +238,7 @@ where
primary_key,
fields_ids_map,
field_distribution,
mut external_documents_ids,
new_external_documents_ids,
new_documents_ids,
replaced_documents_ids,
documents_count,
@ -363,9 +365,6 @@ where
deletion_builder.delete_documents(&replaced_documents_ids);
let deleted_documents_result = deletion_builder.execute_inner()?;
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
if !deleted_documents_result.soft_deletion_used {
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
}
}
let index_documents_ids = self.index.documents_ids(self.wtxn)?;
@ -445,6 +444,9 @@ where
self.index.put_primary_key(self.wtxn, &primary_key)?;
// We write the external documents ids into the main database.
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
external_documents_ids.insert_ids(&new_external_documents_ids)?;
let external_documents_ids = external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
let all_documents_ids = index_documents_ids | new_documents_ids;
@ -2515,4 +2517,170 @@ mod tests {
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
}
#[test]
fn reproduce_the_bug() {
/*
[milli/examples/fuzz.rs:69] &batches = [
Batch(
[
AddDoc(
{ "id": 1, "doggo": "bernese" }, => internal 0
),
],
),
Batch(
[
DeleteDoc(
1, => delete internal 0
),
AddDoc(
{ "id": 0, "catto": "jorts" }, => internal 1
),
],
),
Batch(
[
AddDoc(
{ "id": 1, "catto": "jorts" }, => internal 2
),
],
),
]
*/
let mut index = TempIndex::new();
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
// START OF BATCH
println!("--- ENTERING BATCH 1");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
// OP
let documents = documents!([
{ "id": 1, "doggo": "bernese" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
// FINISHING
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"doggo":"bernese"}
"###);
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
1 0
"###);
// A first batch of documents has been inserted
// BATCH 2
println!("--- ENTERING BATCH 2");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"1");
let documents = documents!([
{ "id": 0, "catto": "jorts" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":0,"catto":"jorts"}
"###);
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
0 1
"###);
db_snap!(index, soft_deleted_documents_ids, @"[]");
// BATCH 3
println!("--- ENTERING BATCH 3");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let documents = documents!([
{ "id": 1, "catto": "jorts" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 2,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"catto":"jorts"}
{"id":0,"catto":"jorts"}
"###);
// Ensuring all the returned IDs actually exists
let rtxn = index.read_txn().unwrap();
let res = index.search(&rtxn).execute().unwrap();
index.documents(&rtxn, res.documents_ids).unwrap();
}
}

View file

@ -21,15 +21,14 @@ use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key};
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
Result, BEU32,
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
};
pub struct TransformOutput {
pub primary_key: String,
pub fields_ids_map: FieldsIdsMap,
pub field_distribution: FieldDistribution,
pub external_documents_ids: ExternalDocumentsIds<'static>,
pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>,
pub new_documents_ids: RoaringBitmap,
pub replaced_documents_ids: RoaringBitmap,
pub documents_count: usize,
@ -58,8 +57,8 @@ pub struct Transform<'a, 'i> {
original_sorter: grenad::Sorter<MergeFn>,
flattened_sorter: grenad::Sorter<MergeFn>,
replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
pub replaced_documents_ids: RoaringBitmap,
pub new_documents_ids: RoaringBitmap,
// To increase the cache locality and decrease the heap usage we use compact smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize,
@ -568,8 +567,6 @@ impl<'a, 'i> Transform<'a, 'i> {
}))?
.to_string();
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
// We create a final writer to write the new documents in order from the sorter.
let mut writer = create_writer(
self.indexer_settings.chunk_compression_type,
@ -651,13 +648,14 @@ impl<'a, 'i> Transform<'a, 'i> {
fst_new_external_documents_ids_builder.insert(key, value)
})?;
let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
external_documents_ids.insert_ids(&new_external_documents_ids)?;
Ok(TransformOutput {
primary_key,
fields_ids_map: self.fields_ids_map,
field_distribution,
external_documents_ids: external_documents_ids.into_static(),
new_external_documents_ids: new_external_documents_ids
.map_data(|c| Cow::Owned(c))
.unwrap(),
new_documents_ids: self.new_documents_ids,
replaced_documents_ids: self.replaced_documents_ids,
documents_count: self.documents_count,
@ -691,7 +689,8 @@ impl<'a, 'i> Transform<'a, 'i> {
let new_external_documents_ids = {
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
external_documents_ids
// it is safe to get the hard document IDs
external_documents_ids.into_static().hard
};
let documents_ids = self.index.documents_ids(wtxn)?;
@ -776,7 +775,7 @@ impl<'a, 'i> Transform<'a, 'i> {
primary_key,
fields_ids_map: new_fields_ids_map,
field_distribution,
external_documents_ids: new_external_documents_ids.into_static(),
new_external_documents_ids,
new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(),
documents_count,