mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-12 06:24:29 +01:00
Fix hard-deletion of an external id that was soft-deleted
This commit is contained in:
parent
97fb64e40e
commit
fc0e7382fe
@ -47,30 +47,6 @@ impl<'a> ExternalDocumentsIds<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn delete_ids<A: AsRef<[u8]>>(&mut self, other: fst::Set<A>) -> fst::Result<()> {
|
|
||||||
let other = fst::Map::from(other.into_fst());
|
|
||||||
let union_op = self.soft.op().add(&other).r#union();
|
|
||||||
|
|
||||||
let mut iter = union_op.into_stream();
|
|
||||||
let mut new_soft_builder = fst::MapBuilder::memory();
|
|
||||||
while let Some((external_id, docids)) = iter.next() {
|
|
||||||
if docids.iter().any(|v| v.index == 1) {
|
|
||||||
// If the `other` set returns a value here it means
|
|
||||||
// that it must be marked as deleted.
|
|
||||||
new_soft_builder.insert(external_id, DELETED_ID)?;
|
|
||||||
} else {
|
|
||||||
let value = docids.iter().find(|v| v.index == 0).unwrap().value;
|
|
||||||
new_soft_builder.insert(external_id, value)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
// We save this new map as the new soft map.
|
|
||||||
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
|
|
||||||
self.merge_soft_into_hard()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
|
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
|
||||||
/// don't contain any soft deleted document id.
|
/// don't contain any soft deleted document id.
|
||||||
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
|
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
|
||||||
@ -173,76 +149,3 @@ impl Default for ExternalDocumentsIds<'static> {
|
|||||||
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
|
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
|
||||||
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
|
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn simple_insert_delete_ids() {
|
|
||||||
let mut external_documents_ids = ExternalDocumentsIds::default();
|
|
||||||
|
|
||||||
let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap();
|
|
||||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(external_documents_ids.get("a"), Some(1));
|
|
||||||
assert_eq!(external_documents_ids.get("b"), Some(2));
|
|
||||||
assert_eq!(external_documents_ids.get("c"), Some(3));
|
|
||||||
assert_eq!(external_documents_ids.get("d"), Some(4));
|
|
||||||
|
|
||||||
let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap();
|
|
||||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(external_documents_ids.get("a"), Some(1));
|
|
||||||
assert_eq!(external_documents_ids.get("b"), Some(2));
|
|
||||||
assert_eq!(external_documents_ids.get("c"), Some(3));
|
|
||||||
assert_eq!(external_documents_ids.get("d"), Some(4));
|
|
||||||
assert_eq!(external_documents_ids.get("e"), Some(5));
|
|
||||||
assert_eq!(external_documents_ids.get("f"), Some(6));
|
|
||||||
assert_eq!(external_documents_ids.get("g"), Some(7));
|
|
||||||
|
|
||||||
let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap();
|
|
||||||
external_documents_ids.delete_ids(del_ids).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(external_documents_ids.get("a"), None);
|
|
||||||
assert_eq!(external_documents_ids.get("b"), Some(2));
|
|
||||||
assert_eq!(external_documents_ids.get("c"), None);
|
|
||||||
assert_eq!(external_documents_ids.get("d"), Some(4));
|
|
||||||
assert_eq!(external_documents_ids.get("e"), Some(5));
|
|
||||||
assert_eq!(external_documents_ids.get("f"), None);
|
|
||||||
assert_eq!(external_documents_ids.get("g"), Some(7));
|
|
||||||
|
|
||||||
let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap();
|
|
||||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(external_documents_ids.get("a"), Some(5));
|
|
||||||
assert_eq!(external_documents_ids.get("b"), Some(6));
|
|
||||||
assert_eq!(external_documents_ids.get("c"), None);
|
|
||||||
assert_eq!(external_documents_ids.get("d"), Some(4));
|
|
||||||
assert_eq!(external_documents_ids.get("e"), Some(5));
|
|
||||||
assert_eq!(external_documents_ids.get("f"), None);
|
|
||||||
assert_eq!(external_documents_ids.get("g"), Some(7));
|
|
||||||
assert_eq!(external_documents_ids.get("h"), Some(8));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn strange_delete_insert_ids() {
|
|
||||||
let mut external_documents_ids = ExternalDocumentsIds::default();
|
|
||||||
|
|
||||||
let new_ids =
|
|
||||||
fst::Map::from_iter(vec![("1", 0), ("123", 1), ("30", 2), ("456", 3)]).unwrap();
|
|
||||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
|
||||||
assert_eq!(external_documents_ids.get("1"), Some(0));
|
|
||||||
assert_eq!(external_documents_ids.get("123"), Some(1));
|
|
||||||
assert_eq!(external_documents_ids.get("30"), Some(2));
|
|
||||||
assert_eq!(external_documents_ids.get("456"), Some(3));
|
|
||||||
|
|
||||||
let deleted_ids = fst::Set::from_iter(vec!["30"]).unwrap();
|
|
||||||
external_documents_ids.delete_ids(deleted_ids).unwrap();
|
|
||||||
assert_eq!(external_documents_ids.get("30"), None);
|
|
||||||
|
|
||||||
let new_ids = fst::Map::from_iter(vec![("30", 2)]).unwrap();
|
|
||||||
external_documents_ids.insert_ids(&new_ids).unwrap();
|
|
||||||
assert_eq!(external_documents_ids.get("30"), Some(2));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -1181,6 +1181,7 @@ impl Index {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub(crate) mod tests {
|
pub(crate) mod tests {
|
||||||
|
use std::collections::HashSet;
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
@ -1195,7 +1196,7 @@ pub(crate) mod tests {
|
|||||||
self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
|
self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
|
||||||
IndexDocumentsMethod, IndexerConfig, Settings,
|
IndexDocumentsMethod, IndexerConfig, Settings,
|
||||||
};
|
};
|
||||||
use crate::{db_snap, obkv_to_json, Index};
|
use crate::{db_snap, obkv_to_json, Index, Search, SearchResult};
|
||||||
|
|
||||||
pub(crate) struct TempIndex {
|
pub(crate) struct TempIndex {
|
||||||
pub inner: Index,
|
pub inner: Index,
|
||||||
@ -2188,4 +2189,97 @@ pub(crate) mod tests {
|
|||||||
"###);
|
"###);
|
||||||
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
|
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bug_3021_fourth() {
|
||||||
|
// https://github.com/meilisearch/meilisearch/issues/3021
|
||||||
|
let mut index = TempIndex::new();
|
||||||
|
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
|
||||||
|
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|settings| {
|
||||||
|
settings.set_primary_key("primary_key".to_owned());
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{ "primary_key": 11 },
|
||||||
|
{ "primary_key": 4 },
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, documents_ids, @"[0, 1, ]");
|
||||||
|
db_snap!(index, external_documents_ids, @r###"
|
||||||
|
soft:
|
||||||
|
hard:
|
||||||
|
11 0
|
||||||
|
4 1
|
||||||
|
"###);
|
||||||
|
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{ "primary_key": 4, "a": 0 },
|
||||||
|
{ "primary_key": 1 },
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
|
||||||
|
db_snap!(index, external_documents_ids, @r###"
|
||||||
|
soft:
|
||||||
|
hard:
|
||||||
|
1 3
|
||||||
|
11 0
|
||||||
|
4 2
|
||||||
|
"###);
|
||||||
|
db_snap!(index, soft_deleted_documents_ids, @"[1, ]");
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||||
|
delete.strategy(DeletionStrategy::AlwaysHard);
|
||||||
|
delete.execute().unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
|
||||||
|
db_snap!(index, external_documents_ids, @r###"
|
||||||
|
soft:
|
||||||
|
hard:
|
||||||
|
1 3
|
||||||
|
11 0
|
||||||
|
4 2
|
||||||
|
"###);
|
||||||
|
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{ "primary_key": 4, "a": 1 },
|
||||||
|
{ "primary_key": 1, "a": 0 },
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, documents_ids, @"[0, 1, 4, ]");
|
||||||
|
db_snap!(index, external_documents_ids, @r###"
|
||||||
|
soft:
|
||||||
|
hard:
|
||||||
|
1 4
|
||||||
|
11 0
|
||||||
|
4 1
|
||||||
|
"###);
|
||||||
|
db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]");
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let search = Search::new(&rtxn, &index);
|
||||||
|
let SearchResult { matching_words: _, candidates: _, mut documents_ids } =
|
||||||
|
search.execute().unwrap();
|
||||||
|
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
|
||||||
|
documents_ids.sort_unstable();
|
||||||
|
let docs = index.documents(&rtxn, documents_ids).unwrap();
|
||||||
|
let mut all_ids = HashSet::new();
|
||||||
|
for (_docid, obkv) in docs {
|
||||||
|
let id = obkv.get(primary_key_id).unwrap();
|
||||||
|
assert!(all_ids.insert(id));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,16 +6,14 @@ use heed::types::{ByteSlice, DecodeIgnore, Str};
|
|||||||
use heed::Database;
|
use heed::Database;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
use super::facet::delete::FacetsDelete;
|
use super::facet::delete::FacetsDelete;
|
||||||
use super::ClearDocuments;
|
use super::ClearDocuments;
|
||||||
use crate::error::{InternalError, UserError};
|
use crate::error::InternalError;
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
||||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
use crate::index::{db_name, main_key};
|
|
||||||
use crate::{
|
use crate::{
|
||||||
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
|
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
|
||||||
SmallString32, BEU32,
|
SmallString32, BEU32,
|
||||||
@ -186,6 +184,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
soft_deleted_docids |= &self.to_delete_docids;
|
soft_deleted_docids |= &self.to_delete_docids;
|
||||||
|
|
||||||
|
// We always soft-delete the documents, even if they will be permanently
|
||||||
|
// deleted immediately after.
|
||||||
|
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
|
||||||
|
|
||||||
// decide for a hard or soft deletion depending on the strategy
|
// decide for a hard or soft deletion depending on the strategy
|
||||||
let soft_deletion = match self.strategy {
|
let soft_deletion = match self.strategy {
|
||||||
DeletionStrategy::Dynamic => {
|
DeletionStrategy::Dynamic => {
|
||||||
@ -214,7 +216,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
if soft_deletion {
|
if soft_deletion {
|
||||||
// Keep the soft-deleted in the DB
|
// Keep the soft-deleted in the DB
|
||||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
|
|
||||||
return Ok(DetailedDocumentDeletionResult {
|
return Ok(DetailedDocumentDeletionResult {
|
||||||
deleted_documents: self.to_delete_docids.len(),
|
deleted_documents: self.to_delete_docids.len(),
|
||||||
remaining_documents: documents_ids.len(),
|
remaining_documents: documents_ids.len(),
|
||||||
@ -222,23 +223,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Erase soft-deleted from DB
|
|
||||||
self.to_delete_docids = soft_deleted_docids;
|
self.to_delete_docids = soft_deleted_docids;
|
||||||
// and we can reset the soft deleted bitmap
|
|
||||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
|
|
||||||
|
|
||||||
let primary_key =
|
|
||||||
self.index.primary_key(self.wtxn)?.ok_or(InternalError::DatabaseMissingEntry {
|
|
||||||
db_name: db_name::MAIN,
|
|
||||||
key: Some(main_key::PRIMARY_KEY_KEY),
|
|
||||||
})?;
|
|
||||||
|
|
||||||
// Since we already checked if the DB was empty, if we can't find the primary key, then
|
|
||||||
// something is wrong, and we must return an error.
|
|
||||||
let id_field = match fields_ids_map.id(primary_key) {
|
|
||||||
Some(field) => field,
|
|
||||||
None => return Err(UserError::MissingPrimaryKey.into()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let Index {
|
let Index {
|
||||||
env: _env,
|
env: _env,
|
||||||
@ -262,33 +247,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
documents,
|
documents,
|
||||||
} = self.index;
|
} = self.index;
|
||||||
|
|
||||||
// Retrieve the words and the external documents ids contained in the documents.
|
// Retrieve the words contained in the documents.
|
||||||
let mut words = Vec::new();
|
let mut words = Vec::new();
|
||||||
let mut external_ids = Vec::new();
|
|
||||||
for docid in &self.to_delete_docids {
|
for docid in &self.to_delete_docids {
|
||||||
// We create an iterator to be able to get the content and delete the document
|
documents.delete(self.wtxn, &BEU32::new(docid))?;
|
||||||
// content itself. It's faster to acquire a cursor to get and delete,
|
|
||||||
// as we avoid traversing the LMDB B-Tree two times but only once.
|
|
||||||
let key = BEU32::new(docid);
|
|
||||||
let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
|
|
||||||
if let Some((_key, obkv)) = iter.next().transpose()? {
|
|
||||||
if let Some(content) = obkv.get(id_field) {
|
|
||||||
let external_id = match serde_json::from_slice(content).unwrap() {
|
|
||||||
Value::String(string) => SmallString32::from(string.as_str()),
|
|
||||||
Value::Number(number) => SmallString32::from(number.to_string()),
|
|
||||||
document_id => {
|
|
||||||
return Err(UserError::InvalidDocumentId { document_id }.into())
|
|
||||||
}
|
|
||||||
};
|
|
||||||
external_ids.push(external_id);
|
|
||||||
}
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
}
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
// We iterate through the words positions of the document id,
|
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
|
||||||
// retrieve the word and delete the positions.
|
// We create an iterator to be able to get the content and delete the key-value itself.
|
||||||
|
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
|
||||||
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
|
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
|
||||||
while let Some(result) = iter.next() {
|
while let Some(result) = iter.next() {
|
||||||
let ((_docid, word), _positions) = result?;
|
let ((_docid, word), _positions) = result?;
|
||||||
@ -298,17 +264,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
unsafe { iter.del_current()? };
|
unsafe { iter.del_current()? };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// We create the FST map of the external ids that we must delete.
|
|
||||||
external_ids.sort_unstable();
|
|
||||||
let external_ids_to_delete = fst::Set::from_iter(external_ids)?;
|
|
||||||
|
|
||||||
// We acquire the current external documents ids map...
|
// We acquire the current external documents ids map...
|
||||||
|
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
|
||||||
let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
|
let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
|
||||||
// ...and remove the to-delete external ids.
|
// We then remove the soft-deleted docids from it
|
||||||
new_external_documents_ids.delete_ids(external_ids_to_delete)?;
|
new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
||||||
|
// and write it back to the main database.
|
||||||
// We write the new external ids into the main database.
|
|
||||||
let new_external_documents_ids = new_external_documents_ids.into_static();
|
let new_external_documents_ids = new_external_documents_ids.into_static();
|
||||||
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
|
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
|
||||||
|
|
||||||
@ -545,6 +506,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
&self.to_delete_docids,
|
&self.to_delete_docids,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
|
||||||
|
|
||||||
Ok(DetailedDocumentDeletionResult {
|
Ok(DetailedDocumentDeletionResult {
|
||||||
deleted_documents: self.to_delete_docids.len(),
|
deleted_documents: self.to_delete_docids.len(),
|
||||||
remaining_documents: documents_ids.len(),
|
remaining_documents: documents_ids.len(),
|
||||||
@ -1125,14 +1088,16 @@ mod tests {
|
|||||||
id
|
id
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
// get internal docids from deleted external document ids
|
// get internal docids from deleted external document ids
|
||||||
let results = index.external_documents_ids(&wtxn).unwrap();
|
let results = index.external_documents_ids(&rtxn).unwrap();
|
||||||
for id in deleted_external_ids {
|
for id in deleted_external_ids {
|
||||||
assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
|
assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
|
||||||
}
|
}
|
||||||
|
drop(rtxn);
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
|
db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user