mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 20:07:09 +02:00
fix the addition + deletion bug
This commit is contained in:
parent
d7ddf4925e
commit
4391cba6ca
14 changed files with 518 additions and 18 deletions
|
@ -111,7 +111,6 @@ pub enum Error {
|
|||
Io(#[from] io::Error),
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
|
||||
let documents = match json {
|
||||
object @ serde_json::Value::Object(_) => vec![object],
|
||||
|
@ -141,7 +140,6 @@ macro_rules! documents {
|
|||
}};
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn documents_batch_reader_from_objects(
|
||||
objects: impl IntoIterator<Item = Object>,
|
||||
) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {
|
||||
|
|
|
@ -198,6 +198,7 @@ where
|
|||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
||||
}
|
||||
|
||||
let output = self
|
||||
.transform
|
||||
.take()
|
||||
|
@ -220,6 +221,7 @@ where
|
|||
}
|
||||
|
||||
let indexed_documents = output.documents_count as u64;
|
||||
|
||||
let number_of_documents = self.execute_raw(output)?;
|
||||
|
||||
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
|
||||
|
@ -236,7 +238,7 @@ where
|
|||
primary_key,
|
||||
fields_ids_map,
|
||||
field_distribution,
|
||||
mut external_documents_ids,
|
||||
new_external_documents_ids,
|
||||
new_documents_ids,
|
||||
replaced_documents_ids,
|
||||
documents_count,
|
||||
|
@ -363,9 +365,6 @@ where
|
|||
deletion_builder.delete_documents(&replaced_documents_ids);
|
||||
let deleted_documents_result = deletion_builder.execute_inner()?;
|
||||
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
|
||||
if !deleted_documents_result.soft_deletion_used {
|
||||
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
||||
}
|
||||
}
|
||||
|
||||
let index_documents_ids = self.index.documents_ids(self.wtxn)?;
|
||||
|
@ -445,6 +444,9 @@ where
|
|||
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
||||
|
||||
// We write the external documents ids into the main database.
|
||||
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
|
||||
external_documents_ids.insert_ids(&new_external_documents_ids)?;
|
||||
let external_documents_ids = external_documents_ids.into_static();
|
||||
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
|
||||
|
||||
let all_documents_ids = index_documents_ids | new_documents_ids;
|
||||
|
@ -2515,4 +2517,170 @@ mod tests {
|
|||
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
||||
db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reproduce_the_bug() {
|
||||
/*
|
||||
[milli/examples/fuzz.rs:69] &batches = [
|
||||
Batch(
|
||||
[
|
||||
AddDoc(
|
||||
{ "id": 1, "doggo": "bernese" }, => internal 0
|
||||
),
|
||||
],
|
||||
),
|
||||
Batch(
|
||||
[
|
||||
DeleteDoc(
|
||||
1, => delete internal 0
|
||||
),
|
||||
AddDoc(
|
||||
{ "id": 0, "catto": "jorts" }, => internal 1
|
||||
),
|
||||
],
|
||||
),
|
||||
Batch(
|
||||
[
|
||||
AddDoc(
|
||||
{ "id": 1, "catto": "jorts" }, => internal 2
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
*/
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
||||
|
||||
// START OF BATCH
|
||||
|
||||
println!("--- ENTERING BATCH 1");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let builder = IndexDocuments::new(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&index.indexer_config,
|
||||
index.index_documents_config.clone(),
|
||||
|_| (),
|
||||
|| false,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// OP
|
||||
|
||||
let documents = documents!([
|
||||
{ "id": 1, "doggo": "bernese" },
|
||||
]);
|
||||
let (builder, added) = builder.add_documents(documents).unwrap();
|
||||
insta::assert_display_snapshot!(added.unwrap(), @"1");
|
||||
|
||||
// FINISHING
|
||||
let addition = builder.execute().unwrap();
|
||||
insta::assert_debug_snapshot!(addition, @r###"
|
||||
DocumentAdditionResult {
|
||||
indexed_documents: 1,
|
||||
number_of_documents: 1,
|
||||
}
|
||||
"###);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents, @r###"
|
||||
{"id":1,"doggo":"bernese"}
|
||||
"###);
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
1 0
|
||||
"###);
|
||||
|
||||
// A first batch of documents has been inserted
|
||||
|
||||
// BATCH 2
|
||||
|
||||
println!("--- ENTERING BATCH 2");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let builder = IndexDocuments::new(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&index.indexer_config,
|
||||
index.index_documents_config.clone(),
|
||||
|_| (),
|
||||
|| false,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
|
||||
insta::assert_display_snapshot!(removed.unwrap(), @"1");
|
||||
|
||||
let documents = documents!([
|
||||
{ "id": 0, "catto": "jorts" },
|
||||
]);
|
||||
let (builder, added) = builder.add_documents(documents).unwrap();
|
||||
insta::assert_display_snapshot!(added.unwrap(), @"1");
|
||||
|
||||
let addition = builder.execute().unwrap();
|
||||
insta::assert_debug_snapshot!(addition, @r###"
|
||||
DocumentAdditionResult {
|
||||
indexed_documents: 1,
|
||||
number_of_documents: 1,
|
||||
}
|
||||
"###);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents, @r###"
|
||||
{"id":0,"catto":"jorts"}
|
||||
"###);
|
||||
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
0 1
|
||||
"###);
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
|
||||
// BATCH 3
|
||||
|
||||
println!("--- ENTERING BATCH 3");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let builder = IndexDocuments::new(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&index.indexer_config,
|
||||
index.index_documents_config.clone(),
|
||||
|_| (),
|
||||
|| false,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let documents = documents!([
|
||||
{ "id": 1, "catto": "jorts" },
|
||||
]);
|
||||
let (builder, added) = builder.add_documents(documents).unwrap();
|
||||
insta::assert_display_snapshot!(added.unwrap(), @"1");
|
||||
|
||||
let addition = builder.execute().unwrap();
|
||||
insta::assert_debug_snapshot!(addition, @r###"
|
||||
DocumentAdditionResult {
|
||||
indexed_documents: 1,
|
||||
number_of_documents: 2,
|
||||
}
|
||||
"###);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents, @r###"
|
||||
{"id":1,"catto":"jorts"}
|
||||
{"id":0,"catto":"jorts"}
|
||||
"###);
|
||||
|
||||
// Ensuring all the returned IDs actually exists
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let res = index.search(&rtxn).execute().unwrap();
|
||||
index.documents(&rtxn, res.documents_ids).unwrap();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,15 +21,14 @@ use crate::error::{Error, InternalError, UserError};
|
|||
use crate::index::{db_name, main_key};
|
||||
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
||||
use crate::{
|
||||
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
|
||||
Result, BEU32,
|
||||
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
|
||||
};
|
||||
|
||||
pub struct TransformOutput {
|
||||
pub primary_key: String,
|
||||
pub fields_ids_map: FieldsIdsMap,
|
||||
pub field_distribution: FieldDistribution,
|
||||
pub external_documents_ids: ExternalDocumentsIds<'static>,
|
||||
pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>,
|
||||
pub new_documents_ids: RoaringBitmap,
|
||||
pub replaced_documents_ids: RoaringBitmap,
|
||||
pub documents_count: usize,
|
||||
|
@ -58,8 +57,8 @@ pub struct Transform<'a, 'i> {
|
|||
original_sorter: grenad::Sorter<MergeFn>,
|
||||
flattened_sorter: grenad::Sorter<MergeFn>,
|
||||
|
||||
replaced_documents_ids: RoaringBitmap,
|
||||
new_documents_ids: RoaringBitmap,
|
||||
pub replaced_documents_ids: RoaringBitmap,
|
||||
pub new_documents_ids: RoaringBitmap,
|
||||
// To increase the cache locality and decrease the heap usage we use compact smartstring.
|
||||
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
|
||||
documents_count: usize,
|
||||
|
@ -568,8 +567,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
}))?
|
||||
.to_string();
|
||||
|
||||
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||
|
||||
// We create a final writer to write the new documents in order from the sorter.
|
||||
let mut writer = create_writer(
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
|
@ -651,13 +648,14 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
fst_new_external_documents_ids_builder.insert(key, value)
|
||||
})?;
|
||||
let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
|
||||
external_documents_ids.insert_ids(&new_external_documents_ids)?;
|
||||
|
||||
Ok(TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map: self.fields_ids_map,
|
||||
field_distribution,
|
||||
external_documents_ids: external_documents_ids.into_static(),
|
||||
new_external_documents_ids: new_external_documents_ids
|
||||
.map_data(|c| Cow::Owned(c))
|
||||
.unwrap(),
|
||||
new_documents_ids: self.new_documents_ids,
|
||||
replaced_documents_ids: self.replaced_documents_ids,
|
||||
documents_count: self.documents_count,
|
||||
|
@ -691,7 +689,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
let new_external_documents_ids = {
|
||||
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
||||
external_documents_ids
|
||||
// it is safe to get the hard document IDs
|
||||
external_documents_ids.into_static().hard
|
||||
};
|
||||
|
||||
let documents_ids = self.index.documents_ids(wtxn)?;
|
||||
|
@ -776,7 +775,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||
primary_key,
|
||||
fields_ids_map: new_fields_ids_map,
|
||||
field_distribution,
|
||||
external_documents_ids: new_external_documents_ids.into_static(),
|
||||
new_external_documents_ids,
|
||||
new_documents_ids: documents_ids,
|
||||
replaced_documents_ids: RoaringBitmap::default(),
|
||||
documents_count,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue