mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-27 13:47:29 +01:00
Merge #750
750: Fix hard-deletion of an external id that was soft-deleted and then reimported - main r=irevoire a=loiclec # Pull Request ## Related issue Fixes (when merged into meilisearch) https://github.com/meilisearch/meilisearch/issues/3021 ## What does this PR do? There was a bug happening when: 1. Documents were added 2. Some of these documents were replaced using soft-deletion 3. A deletion of another non-replaced document takes place and triggers a hard-deletion 4. Documents with the same identifiers as the replaced documents are added again Then, search results would return duplicate documents. No crash would happen at any time (this is the reason it wasn't caught by the previous fuzz test. I have updated the new one such that it also checks the result of a placeholder search request, which then finds the bug immediately). The cause of the bug is: 1. When a hard-deletion is triggered, we try to retrieve the external document id associated with each soft-deleted document id. 2. Then, we take this list of external document ids and remove each of them from the `ExternalDocumentsIds` structure. 3. However, this is not correct in case an existing (non-deleted) document shares the external id of a soft-deleted document. ## Implementation of the fix 1. Before we process a permanent deletion, we update the list of soft-deleted document ids. 2. Then, the permanent deletion's job is to remove the soft-deleted documents from all data structures. Therefore, to update `ExternalDocumentsIds`, we can simply call the `delete_soft_deleted_documents_ids_from_fsts` method, which is faster and simpler. ## Correctness A unit test was added to reproduce the bug. The new fuzz test, when adjusted to check the correctness of a placeholder search, could also instantly reproduce the bug, but now does not find any other problem. Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
commit
249e051cd4
@ -47,30 +47,6 @@ impl<'a> ExternalDocumentsIds<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete_ids<A: AsRef<[u8]>>(&mut self, other: fst::Set<A>) -> fst::Result<()> {
|
||||
let other = fst::Map::from(other.into_fst());
|
||||
let union_op = self.soft.op().add(&other).r#union();
|
||||
|
||||
let mut iter = union_op.into_stream();
|
||||
let mut new_soft_builder = fst::MapBuilder::memory();
|
||||
while let Some((external_id, docids)) = iter.next() {
|
||||
if docids.iter().any(|v| v.index == 1) {
|
||||
// If the `other` set returns a value here it means
|
||||
// that it must be marked as deleted.
|
||||
new_soft_builder.insert(external_id, DELETED_ID)?;
|
||||
} else {
|
||||
let value = docids.iter().find(|v| v.index == 0).unwrap().value;
|
||||
new_soft_builder.insert(external_id, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
// We save this new map as the new soft map.
|
||||
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
|
||||
self.merge_soft_into_hard()
|
||||
}
|
||||
|
||||
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
|
||||
/// don't contain any soft deleted document id.
|
||||
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
|
||||
@ -173,76 +149,3 @@ impl Default for ExternalDocumentsIds<'static> {
|
||||
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
|
||||
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple_insert_delete_ids() {
|
||||
let mut external_documents_ids = ExternalDocumentsIds::default();
|
||||
|
||||
let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap();
|
||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
||||
|
||||
assert_eq!(external_documents_ids.get("a"), Some(1));
|
||||
assert_eq!(external_documents_ids.get("b"), Some(2));
|
||||
assert_eq!(external_documents_ids.get("c"), Some(3));
|
||||
assert_eq!(external_documents_ids.get("d"), Some(4));
|
||||
|
||||
let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap();
|
||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
||||
|
||||
assert_eq!(external_documents_ids.get("a"), Some(1));
|
||||
assert_eq!(external_documents_ids.get("b"), Some(2));
|
||||
assert_eq!(external_documents_ids.get("c"), Some(3));
|
||||
assert_eq!(external_documents_ids.get("d"), Some(4));
|
||||
assert_eq!(external_documents_ids.get("e"), Some(5));
|
||||
assert_eq!(external_documents_ids.get("f"), Some(6));
|
||||
assert_eq!(external_documents_ids.get("g"), Some(7));
|
||||
|
||||
let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap();
|
||||
external_documents_ids.delete_ids(del_ids).unwrap();
|
||||
|
||||
assert_eq!(external_documents_ids.get("a"), None);
|
||||
assert_eq!(external_documents_ids.get("b"), Some(2));
|
||||
assert_eq!(external_documents_ids.get("c"), None);
|
||||
assert_eq!(external_documents_ids.get("d"), Some(4));
|
||||
assert_eq!(external_documents_ids.get("e"), Some(5));
|
||||
assert_eq!(external_documents_ids.get("f"), None);
|
||||
assert_eq!(external_documents_ids.get("g"), Some(7));
|
||||
|
||||
let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap();
|
||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
||||
|
||||
assert_eq!(external_documents_ids.get("a"), Some(5));
|
||||
assert_eq!(external_documents_ids.get("b"), Some(6));
|
||||
assert_eq!(external_documents_ids.get("c"), None);
|
||||
assert_eq!(external_documents_ids.get("d"), Some(4));
|
||||
assert_eq!(external_documents_ids.get("e"), Some(5));
|
||||
assert_eq!(external_documents_ids.get("f"), None);
|
||||
assert_eq!(external_documents_ids.get("g"), Some(7));
|
||||
assert_eq!(external_documents_ids.get("h"), Some(8));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn strange_delete_insert_ids() {
|
||||
let mut external_documents_ids = ExternalDocumentsIds::default();
|
||||
|
||||
let new_ids =
|
||||
fst::Map::from_iter(vec![("1", 0), ("123", 1), ("30", 2), ("456", 3)]).unwrap();
|
||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
||||
assert_eq!(external_documents_ids.get("1"), Some(0));
|
||||
assert_eq!(external_documents_ids.get("123"), Some(1));
|
||||
assert_eq!(external_documents_ids.get("30"), Some(2));
|
||||
assert_eq!(external_documents_ids.get("456"), Some(3));
|
||||
|
||||
let deleted_ids = fst::Set::from_iter(vec!["30"]).unwrap();
|
||||
external_documents_ids.delete_ids(deleted_ids).unwrap();
|
||||
assert_eq!(external_documents_ids.get("30"), None);
|
||||
|
||||
let new_ids = fst::Map::from_iter(vec![("30", 2)]).unwrap();
|
||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
||||
assert_eq!(external_documents_ids.get("30"), Some(2));
|
||||
}
|
||||
}
|
||||
|
@ -1181,6 +1181,7 @@ impl Index {
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Deref;
|
||||
|
||||
use big_s::S;
|
||||
@ -1195,7 +1196,7 @@ pub(crate) mod tests {
|
||||
self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
|
||||
IndexDocumentsMethod, IndexerConfig, Settings,
|
||||
};
|
||||
use crate::{db_snap, obkv_to_json, Index};
|
||||
use crate::{db_snap, obkv_to_json, Index, Search, SearchResult};
|
||||
|
||||
pub(crate) struct TempIndex {
|
||||
pub inner: Index,
|
||||
@ -2188,4 +2189,97 @@ pub(crate) mod tests {
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bug_3021_fourth() {
|
||||
// https://github.com/meilisearch/meilisearch/issues/3021
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("primary_key".to_owned());
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{ "primary_key": 11 },
|
||||
{ "primary_key": 4 },
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 1, ]");
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
11 0
|
||||
4 1
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{ "primary_key": 4, "a": 0 },
|
||||
{ "primary_key": 1 },
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
1 3
|
||||
11 0
|
||||
4 2
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[1, ]");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
delete.strategy(DeletionStrategy::AlwaysHard);
|
||||
delete.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
1 3
|
||||
11 0
|
||||
4 2
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{ "primary_key": 4, "a": 1 },
|
||||
{ "primary_key": 1, "a": 0 },
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 1, 4, ]");
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
1 4
|
||||
11 0
|
||||
4 1
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]");
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let search = Search::new(&rtxn, &index);
|
||||
let SearchResult { matching_words: _, candidates: _, mut documents_ids } =
|
||||
search.execute().unwrap();
|
||||
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
|
||||
documents_ids.sort_unstable();
|
||||
let docs = index.documents(&rtxn, documents_ids).unwrap();
|
||||
let mut all_ids = HashSet::new();
|
||||
for (_docid, obkv) in docs {
|
||||
let id = obkv.get(primary_key_id).unwrap();
|
||||
assert!(all_ids.insert(id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -6,16 +6,14 @@ use heed::types::{ByteSlice, DecodeIgnore, Str};
|
||||
use heed::Database;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use super::facet::delete::FacetsDelete;
|
||||
use super::ClearDocuments;
|
||||
use crate::error::{InternalError, UserError};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::index::{db_name, main_key};
|
||||
use crate::{
|
||||
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
|
||||
SmallString32, BEU32,
|
||||
@ -186,6 +184,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
|
||||
soft_deleted_docids |= &self.to_delete_docids;
|
||||
|
||||
// We always soft-delete the documents, even if they will be permanently
|
||||
// deleted immediately after.
|
||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
|
||||
|
||||
// decide for a hard or soft deletion depending on the strategy
|
||||
let soft_deletion = match self.strategy {
|
||||
DeletionStrategy::Dynamic => {
|
||||
@ -214,7 +216,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
|
||||
if soft_deletion {
|
||||
// Keep the soft-deleted in the DB
|
||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
|
||||
return Ok(DetailedDocumentDeletionResult {
|
||||
deleted_documents: self.to_delete_docids.len(),
|
||||
remaining_documents: documents_ids.len(),
|
||||
@ -222,23 +223,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
});
|
||||
}
|
||||
|
||||
// Erase soft-deleted from DB
|
||||
self.to_delete_docids = soft_deleted_docids;
|
||||
// and we can reset the soft deleted bitmap
|
||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
|
||||
|
||||
let primary_key =
|
||||
self.index.primary_key(self.wtxn)?.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::MAIN,
|
||||
key: Some(main_key::PRIMARY_KEY_KEY),
|
||||
})?;
|
||||
|
||||
// Since we already checked if the DB was empty, if we can't find the primary key, then
|
||||
// something is wrong, and we must return an error.
|
||||
let id_field = match fields_ids_map.id(primary_key) {
|
||||
Some(field) => field,
|
||||
None => return Err(UserError::MissingPrimaryKey.into()),
|
||||
};
|
||||
|
||||
let Index {
|
||||
env: _env,
|
||||
@ -262,33 +247,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
documents,
|
||||
} = self.index;
|
||||
|
||||
// Retrieve the words and the external documents ids contained in the documents.
|
||||
// Retrieve the words contained in the documents.
|
||||
let mut words = Vec::new();
|
||||
let mut external_ids = Vec::new();
|
||||
for docid in &self.to_delete_docids {
|
||||
// We create an iterator to be able to get the content and delete the document
|
||||
// content itself. It's faster to acquire a cursor to get and delete,
|
||||
// as we avoid traversing the LMDB B-Tree two times but only once.
|
||||
let key = BEU32::new(docid);
|
||||
let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
|
||||
if let Some((_key, obkv)) = iter.next().transpose()? {
|
||||
if let Some(content) = obkv.get(id_field) {
|
||||
let external_id = match serde_json::from_slice(content).unwrap() {
|
||||
Value::String(string) => SmallString32::from(string.as_str()),
|
||||
Value::Number(number) => SmallString32::from(number.to_string()),
|
||||
document_id => {
|
||||
return Err(UserError::InvalidDocumentId { document_id }.into())
|
||||
}
|
||||
};
|
||||
external_ids.push(external_id);
|
||||
}
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
drop(iter);
|
||||
documents.delete(self.wtxn, &BEU32::new(docid))?;
|
||||
|
||||
// We iterate through the words positions of the document id,
|
||||
// retrieve the word and delete the positions.
|
||||
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
|
||||
// We create an iterator to be able to get the content and delete the key-value itself.
|
||||
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
|
||||
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
|
||||
while let Some(result) = iter.next() {
|
||||
let ((_docid, word), _positions) = result?;
|
||||
@ -298,17 +264,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
|
||||
// We create the FST map of the external ids that we must delete.
|
||||
external_ids.sort_unstable();
|
||||
let external_ids_to_delete = fst::Set::from_iter(external_ids)?;
|
||||
|
||||
// We acquire the current external documents ids map...
|
||||
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
|
||||
let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
|
||||
// ...and remove the to-delete external ids.
|
||||
new_external_documents_ids.delete_ids(external_ids_to_delete)?;
|
||||
|
||||
// We write the new external ids into the main database.
|
||||
// We then remove the soft-deleted docids from it
|
||||
new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
||||
// and write it back to the main database.
|
||||
let new_external_documents_ids = new_external_documents_ids.into_static();
|
||||
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
|
||||
|
||||
@ -545,6 +506,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
&self.to_delete_docids,
|
||||
)?;
|
||||
|
||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
|
||||
|
||||
Ok(DetailedDocumentDeletionResult {
|
||||
deleted_documents: self.to_delete_docids.len(),
|
||||
remaining_documents: documents_ids.len(),
|
||||
@ -1125,14 +1088,16 @@ mod tests {
|
||||
id
|
||||
);
|
||||
}
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
// get internal docids from deleted external document ids
|
||||
let results = index.external_documents_ids(&wtxn).unwrap();
|
||||
let results = index.external_documents_ids(&rtxn).unwrap();
|
||||
for id in deleted_external_ids {
|
||||
assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
drop(rtxn);
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user