557: Fasten documents deletion and update r=Kerollmops a=irevoire

When a document deletion occurs, instead of deleting the document we mark it as deleted in the new “soft deleted” bitmap. It is then removed from the search and all the other endpoints.

I ran the benchmarks against `main`:
```
% ./compare.sh indexing_main_83ad1aaf.json indexing_fasten-document-deletion_abab51fb.json
group                                                                     indexing_fasten-document-deletion_abab51fb    indexing_main_83ad1aaf
-----                                                                     ------------------------------------------    ----------------------
indexing/-geo-delete-facetedNumber-facetedGeo-searchable-                 1.05      2.0±0.40ms        ? ?/sec           1.00  1904.9±190.00µs        ? ?/sec
indexing/-movies-delete-facetedString-facetedNumber-searchable-           1.00     10.3±2.64ms        ? ?/sec           961.61      9.9±0.12s        ? ?/sec
indexing/-movies-delete-facetedString-facetedNumber-searchable-nested-    1.00     15.1±3.90ms        ? ?/sec           554.63      8.4±0.12s        ? ?/sec
indexing/-songs-delete-facetedString-facetedNumber-searchable-            1.00     45.1±7.53ms        ? ?/sec           710.15     32.0±0.10s        ? ?/sec
indexing/-wiki-delete-searchable-                                         1.00    277.8±7.97ms        ? ?/sec           1946.57    540.8±3.15s        ? ?/sec
indexing/Indexing geo_point                                               1.00      12.0±0.20s        ? ?/sec           1.03      12.4±0.19s        ? ?/sec
indexing/Indexing movies in three batches                                 1.00      19.3±0.30s        ? ?/sec           1.01      19.4±0.16s        ? ?/sec
indexing/Indexing movies with default settings                            1.00      18.8±0.09s        ? ?/sec           1.00      18.9±0.10s        ? ?/sec
indexing/Indexing nested movies with default settings                     1.00      25.9±0.19s        ? ?/sec           1.00      25.9±0.12s        ? ?/sec
indexing/Indexing nested movies without any facets                        1.00      24.8±0.17s        ? ?/sec           1.00      24.8±0.18s        ? ?/sec
indexing/Indexing songs in three batches with default settings            1.00      65.9±0.96s        ? ?/sec           1.03      67.8±0.82s        ? ?/sec
indexing/Indexing songs with default settings                             1.00      58.8±1.11s        ? ?/sec           1.02      59.9±2.09s        ? ?/sec
indexing/Indexing songs without any facets                                1.00      53.4±0.72s        ? ?/sec           1.01      54.2±0.88s        ? ?/sec
indexing/Indexing songs without faceted numbers                           1.00      57.9±1.17s        ? ?/sec           1.01      58.3±1.20s        ? ?/sec
indexing/Indexing wiki                                                    1.00   1065.2±13.26s        ? ?/sec           1.00   1065.8±12.66s        ? ?/sec
indexing/Indexing wiki in three batches                                   1.00    1182.4±6.20s        ? ?/sec           1.01    1190.8±8.48s        ? ?/sec
```

Most things do not change; we lost ~0.1ms on the indexing of geo points (I don’t get why), and we are between roughly 500 and 1900 times faster when deleting documents.


Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
bors[bot] 2022-07-05 14:14:38 +00:00 committed by GitHub
commit dd1e606f13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 415 additions and 284 deletions

View File

@ -83,6 +83,8 @@ pub enum FieldIdMapMissingEntry {
#[derive(Error, Debug)]
pub enum UserError {
#[error("A soft deleted internal document id have been used: `{document_id}`.")]
AccessingSoftDeletedDocument { document_id: DocumentId },
#[error("A document cannot contain more than 65,535 fields.")]
AttributeLimitReached,
#[error(transparent)]

View File

@ -5,26 +5,30 @@ use std::{fmt, str};
use fst::map::IndexedValue;
use fst::{IntoStreamer, Streamer};
use roaring::RoaringBitmap;
/// Sentinel value: an external id mapped to this value in the fst is treated as deleted.
const DELETED_ID: u64 = u64::MAX;
/// Maps external document ids (as bytes) to internal document ids, split across a
/// compacted `hard` fst and an incremental `soft` fst.
pub struct ExternalDocumentsIds<'a> {
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
// Internal ids of soft-deleted documents; lookups ignore any id contained in here.
soft_deleted_docids: RoaringBitmap,
}
impl<'a> ExternalDocumentsIds<'a> {
pub fn new(
hard: fst::Map<Cow<'a, [u8]>>,
soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
) -> ExternalDocumentsIds<'a> {
ExternalDocumentsIds { hard, soft }
ExternalDocumentsIds { hard, soft, soft_deleted_docids }
}
pub fn into_static(self) -> ExternalDocumentsIds<'static> {
ExternalDocumentsIds {
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft_deleted_docids: self.soft_deleted_docids,
}
}
@ -36,7 +40,9 @@ impl<'a> ExternalDocumentsIds<'a> {
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
let external_id = external_id.as_ref();
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()),
Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
Some(id.try_into().unwrap())
}
_otherwise => None,
}
}
@ -134,6 +140,7 @@ impl Default for ExternalDocumentsIds<'static> {
ExternalDocumentsIds {
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft_deleted_docids: RoaringBitmap::new(),
}
}
}

View File

@ -32,6 +32,7 @@ pub mod main_key {
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids";
pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields";
pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
@ -254,6 +255,29 @@ impl Index {
Ok(count.unwrap_or_default())
}
/* deleted documents ids */
/// Stores `docids` as the set of soft-deleted document ids, under the
/// dedicated key of the main database.
pub(crate) fn put_soft_deleted_documents_ids(
    &self,
    wtxn: &mut RwTxn,
    docids: &RoaringBitmap,
) -> heed::Result<()> {
    let key = main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY;
    self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, key, docids)
}
/// Retrieves the set of soft-deleted document ids from the main database,
/// returning an empty bitmap when the key has never been written.
pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
    let key = main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY;
    let docids = self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, key)?;
    Ok(docids.unwrap_or_default())
}
/* primary key */
/// Writes the documents primary key, this is the field name that is used to store the id.
@ -280,7 +304,7 @@ impl Index {
wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>,
) -> heed::Result<()> {
let ExternalDocumentsIds { hard, soft } = external_documents_ids;
let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
let hard = hard.as_fst().as_bytes();
let soft = soft.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>(
@ -311,7 +335,8 @@ impl Index {
Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?,
None => fst::Map::default().map_data(Cow::Owned)?,
};
Ok(ExternalDocumentsIds::new(hard, soft))
let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids))
}
/* fields ids map */
@ -929,9 +954,13 @@ impl Index {
rtxn: &'t RoTxn,
ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
let mut documents = Vec::new();
for id in ids {
if soft_deleted_documents.contains(id) {
return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?;
}
let kv = self
.documents
.get(rtxn, &BEU32::new(id))?
@ -947,11 +976,16 @@ impl Index {
&self,
rtxn: &'t RoTxn,
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
Ok(self
.documents
.iter(rtxn)?
// we cast the BEU32 to a DocumentId
.map(|document| document.map(|(id, obkv)| (id.get(), obkv))))
.map(|document| document.map(|(id, obkv)| (id.get(), obkv)))
.filter(move |document| {
document.as_ref().map_or(true, |(id, _)| !soft_deleted_docids.contains(*id))
}))
}
pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {

View File

@ -10,9 +10,7 @@ use roaring::RoaringBitmap;
use super::FacetNumberRange;
use crate::error::{Error, UserError};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
};
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::{
distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result,
};
@ -266,11 +264,12 @@ impl<'a> Filter<'a> {
fn evaluate_operator(
rtxn: &heed::RoTxn,
index: &Index,
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
strings_db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
field_id: FieldId,
operator: &Condition<'a>,
) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
// Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the
// field id and the level.
@ -309,9 +308,7 @@ impl<'a> Filter<'a> {
let all_numbers_ids = index.number_faceted_documents_ids(rtxn, field_id)?;
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Condition::Equal(val.clone());
let docids = Self::evaluate_operator(
rtxn, index, numbers_db, strings_db, field_id, &operator,
)?;
let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?;
return Ok((all_numbers_ids | all_strings_ids) - docids);
}
};
@ -342,17 +339,27 @@ impl<'a> Filter<'a> {
}
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
let filterable_fields = index.filterable_fields(rtxn)?;
// and finally we delete all the soft_deleted_documents, again, only once at the very end
self.inner_evaluate(rtxn, index, &filterable_fields)
.map(|result| result - soft_deleted_documents)
}
fn inner_evaluate(
&self,
rtxn: &heed::RoTxn,
index: &Index,
filterable_fields: &HashSet<String>,
) -> Result<RoaringBitmap> {
match &self.condition {
FilterCondition::Condition { fid, op } => {
let filterable_fields = index.filterable_fields(rtxn)?;
if crate::is_faceted(fid.value(), &filterable_fields) {
if crate::is_faceted(fid.value(), filterable_fields) {
let field_ids_map = index.fields_ids_map(rtxn)?;
if let Some(fid) = field_ids_map.id(fid.value()) {
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op)
Self::evaluate_operator(rtxn, index, fid, &op)
} else {
return Ok(RoaringBitmap::new());
}
@ -371,7 +378,7 @@ impl<'a> Filter<'a> {
return Err(fid.as_external_error(
FilterError::AttributeNotFilterable {
attribute,
filterable_fields,
filterable_fields: filterable_fields.clone(),
},
))?;
}
@ -379,17 +386,39 @@ impl<'a> Filter<'a> {
}
}
FilterCondition::Or(lhs, rhs) => {
let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?;
let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?;
let lhs = Self::inner_evaluate(
&(lhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
let rhs = Self::inner_evaluate(
&(rhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
Ok(lhs | rhs)
}
FilterCondition::And(lhs, rhs) => {
let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?;
let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?;
let lhs = Self::inner_evaluate(
&(lhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
if lhs.is_empty() {
return Ok(lhs);
}
let rhs = Self::inner_evaluate(
&(rhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
Ok(lhs & rhs)
}
FilterCondition::GeoLowerThan { point, radius } => {
let filterable_fields = index.filterable_fields(rtxn)?;
if filterable_fields.contains("_geo") {
let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?];
if !(-90.0..=90.0).contains(&base_point[0]) {
@ -422,16 +451,17 @@ impl<'a> Filter<'a> {
} else {
return Err(point[0].as_external_error(FilterError::AttributeNotFilterable {
attribute: "_geo",
filterable_fields,
filterable_fields: filterable_fields.clone(),
}))?;
}
}
FilterCondition::GeoGreaterThan { point, radius } => {
let result = Self::evaluate(
let result = Self::inner_evaluate(
&FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() }
.into(),
rtxn,
index,
filterable_fields,
)?;
let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?;
Ok(geo_faceted_doc_ids - result)

View File

@ -214,7 +214,7 @@ impl<'a> Search<'a> {
) -> Result<SearchResult> {
let mut offset = self.offset;
let mut initial_candidates = RoaringBitmap::new();
let mut excluded_candidates = RoaringBitmap::new();
let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?;
let mut documents_ids = Vec::new();
while let Some(FinalResult { candidates, bucket_candidates, .. }) =

View File

@ -8,11 +8,16 @@ pub struct AvailableDocumentsIds {
}
impl AvailableDocumentsIds {
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
match docids.max() {
pub fn from_documents_ids(
docids: &RoaringBitmap,
soft_deleted_docids: &RoaringBitmap,
) -> AvailableDocumentsIds {
let used_docids = docids | soft_deleted_docids;
match used_docids.max() {
Some(last_id) => {
let mut available = RoaringBitmap::from_iter(0..last_id);
available -= docids;
available -= used_docids;
let iter = match last_id.checked_add(1) {
Some(id) => id..=u32::max_value(),
@ -44,7 +49,7 @@ mod tests {
#[test]
fn empty() {
let base = RoaringBitmap::new();
let left = AvailableDocumentsIds::from_documents_ids(&base);
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let right = 0..=u32::max_value();
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
@ -57,8 +62,28 @@ mod tests {
base.insert(100);
base.insert(405);
let left = AvailableDocumentsIds::from_documents_ids(&base);
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
#[test]
fn soft_deleted() {
    // Four ids in use and four soft-deleted ids: none of the eight may
    // ever be handed out as an available document id.
    let base: RoaringBitmap = [0u32, 10, 100, 405].iter().copied().collect();
    let soft_deleted: RoaringBitmap = [1u32, 11, 101, 406].iter().copied().collect();

    let available = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
    let expected = (0..=u32::max_value())
        .filter(|n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(n));
    for (got, want) in available.zip(expected).take(500) {
        assert_eq!(got, want);
    }
}
}

View File

@ -35,6 +35,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
documents,
} = self.index;
let empty_roaring = RoaringBitmap::default();
// We retrieve the number of documents ids that we are deleting.
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
@ -43,16 +45,16 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
// We clean all the faceted documents ids.
let empty = RoaringBitmap::default();
for field_id in faceted_fields {
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?;
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?;
}
// Clear the other databases.

View File

@ -1,5 +1,4 @@
use std::collections::btree_map::Entry;
use std::collections::HashMap;
use fst::IntoStreamer;
use heed::types::{ByteSlice, Str};
@ -17,15 +16,19 @@ use crate::heed_codec::facet::{
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key};
use crate::{
DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32,
BEU32,
DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result,
RoaringBitmapCodec, SmallString32, BEU32,
};
/// The threshold we use to determine after which number of documents we want to clear the
/// soft-deleted database and delete documents for real.
const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000;
pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
external_documents_ids: ExternalDocumentsIds<'static>,
documents_ids: RoaringBitmap,
to_delete_docids: RoaringBitmap,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@ -45,16 +48,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
wtxn,
index,
external_documents_ids,
documents_ids: RoaringBitmap::new(),
to_delete_docids: RoaringBitmap::new(),
})
}
pub fn delete_document(&mut self, docid: u32) {
self.documents_ids.insert(docid);
self.to_delete_docids.insert(docid);
}
pub fn delete_documents(&mut self, docids: &RoaringBitmap) {
self.documents_ids |= docids;
self.to_delete_docids |= docids;
}
pub fn delete_external_id(&mut self, external_id: &str) -> Option<u32> {
@ -63,28 +66,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
Some(docid)
}
pub fn execute(self) -> Result<DocumentDeletionResult> {
pub fn execute(mut self) -> Result<DocumentDeletionResult> {
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
// We retrieve the current documents ids that are in the database.
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?;
let current_documents_ids_len = documents_ids.len();
// We can and must stop removing documents in a database that is empty.
if documents_ids.is_empty() {
return Ok(DocumentDeletionResult {
deleted_documents: 0,
remaining_documents: current_documents_ids_len,
});
// but if there were still documents to delete, we clear the database entirely
if !soft_deleted_docids.is_empty() {
ClearDocuments::new(self.wtxn, self.index).execute()?;
}
return Ok(DocumentDeletionResult { deleted_documents: 0, remaining_documents: 0 });
}
// We remove the documents ids that we want to delete
// from the documents in the database and write them back.
documents_ids -= &self.documents_ids;
documents_ids -= &self.to_delete_docids;
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
// We can execute a ClearDocuments operation when the number of documents
// to delete is exactly the number of documents in the database.
if current_documents_ids_len == self.documents_ids.len() {
if current_documents_ids_len == self.to_delete_docids.len() {
let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?;
return Ok(DocumentDeletionResult {
deleted_documents: current_documents_ids_len,
@ -93,6 +98,50 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
}
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut field_distribution = self.index.field_distribution(self.wtxn)?;
// we update the field distribution
for docid in self.to_delete_docids.iter() {
let key = BEU32::new(docid);
let document =
self.index.documents.get(self.wtxn, &key)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "documents", key: None },
)?;
for (fid, _value) in document.iter() {
let field_name =
fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: fid,
process: "delete documents",
})?;
if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string())
{
match entry.get().checked_sub(1) {
Some(0) | None => entry.remove(),
Some(count) => entry.insert(count),
};
}
}
}
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
soft_deleted_docids |= &self.to_delete_docids;
// if we have fewer documents to delete than the threshold, we simply save them in
// the `soft_deleted_documents_ids` bitmap and early exit.
if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD {
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
return Ok(DocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
});
}
// There are more documents to delete than the threshold: we need to delete them all
self.to_delete_docids = soft_deleted_docids;
// and we can reset the soft deleted bitmap
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| {
InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN,
@ -127,23 +176,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
documents,
} = self.index;
// Number of fields for each document that has been deleted.
let mut fields_ids_distribution_diff = HashMap::new();
// Retrieve the words and the external documents ids contained in the documents.
let mut words = Vec::new();
let mut external_ids = Vec::new();
for docid in &self.documents_ids {
for docid in &self.to_delete_docids {
// We create an iterator to be able to get the content and delete the document
// content itself. It's faster to acquire a cursor to get and delete,
// as we avoid traversing the LMDB B-Tree two times but only once.
let key = BEU32::new(docid);
let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
if let Some((_key, obkv)) = iter.next().transpose()? {
for (field_id, _) in obkv.iter() {
*fields_ids_distribution_diff.entry(field_id).or_default() += 1;
}
if let Some(content) = obkv.get(id_field) {
let external_id = match serde_json::from_slice(content).unwrap() {
Value::String(string) => SmallString32::from(string.as_str()),
@ -171,24 +213,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
}
}
let mut field_distribution = self.index.field_distribution(self.wtxn)?;
// We use pre-calculated number of fields occurrences that needs to be deleted
// to reflect deleted documents.
// If all field occurrences are removed, delete the entry from distribution.
// Otherwise, insert new number of occurrences (current_count - count_diff).
for (field_id, count_diff) in fields_ids_distribution_diff {
let field_name = fields_ids_map.name(field_id).unwrap();
if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) {
match entry.get().checked_sub(count_diff) {
Some(0) | None => entry.remove(),
Some(count) => entry.insert(count),
};
}
}
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
// We create the FST map of the external ids that we must delete.
external_ids.sort_unstable();
let external_ids_to_delete = fst::Set::from_iter(external_ids)?;
@ -214,7 +238,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_docids,
word.as_str(),
must_remove,
&self.documents_ids,
&self.to_delete_docids,
)?;
remove_from_word_docids(
@ -222,7 +246,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
exact_word_docids,
word.as_str(),
must_remove,
&self.documents_ids,
&self.to_delete_docids,
)?;
}
@ -256,12 +280,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
self.index.put_words_fst(self.wtxn, &new_words_fst)?;
let prefixes_to_delete =
remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?;
remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?;
let exact_prefix_to_delete = remove_from_word_prefix_docids(
self.wtxn,
exact_word_prefix_docids,
&self.documents_ids,
&self.to_delete_docids,
)?;
let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union();
@ -293,7 +317,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
while let Some(result) = iter.next() {
let (key, mut docids) = result?;
let previous_len = docids.len();
docids -= &self.documents_ids;
docids -= &self.to_delete_docids;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
@ -314,7 +338,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids -= &self.documents_ids;
docids -= &self.to_delete_docids;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
@ -332,7 +356,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids -= &self.documents_ids;
docids -= &self.to_delete_docids;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
@ -351,7 +375,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids -= &self.documents_ids;
docids -= &self.to_delete_docids;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
@ -368,7 +392,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
while let Some((key, mut docids)) = iter.next().transpose()? {
let previous_len = docids.len();
docids -= &self.documents_ids;
docids -= &self.to_delete_docids;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
@ -386,7 +410,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree
.iter()
.filter(|&point| self.documents_ids.contains(point.data.0))
.filter(|&point| self.to_delete_docids.contains(point.data.0))
.cloned()
.map(|point| (point, point.data.0))
.unzip();
@ -403,46 +427,46 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
remove_docids_from_facet_field_id_number_docids(
self.wtxn,
facet_id_f64_docids,
&self.documents_ids,
&self.to_delete_docids,
)?;
remove_docids_from_facet_field_id_string_docids(
self.wtxn,
facet_id_string_docids,
&self.documents_ids,
&self.to_delete_docids,
)?;
// Remove the documents ids from the faceted documents ids.
for field_id in self.index.faceted_fields_ids(self.wtxn)? {
// Remove docids from the number faceted documents ids
let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?;
docids -= &self.documents_ids;
docids -= &self.to_delete_docids;
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?;
remove_docids_from_field_id_docid_facet_value(
self.wtxn,
field_id_docid_facet_f64s,
field_id,
&self.documents_ids,
&self.to_delete_docids,
|(_fid, docid, _value)| docid,
)?;
// Remove docids from the string faceted documents ids
let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?;
docids -= &self.documents_ids;
docids -= &self.to_delete_docids;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?;
remove_docids_from_field_id_docid_facet_value(
self.wtxn,
field_id_docid_facet_strings,
field_id,
&self.documents_ids,
&self.to_delete_docids,
|(_fid, docid, _value)| docid,
)?;
}
Ok(DocumentDeletionResult {
deleted_documents: self.documents_ids.len(),
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
})
}
@ -741,26 +765,26 @@ mod tests {
builder.execute(|_| ()).unwrap();
let content = documents!([
{"docid":"1_4","label":"sign"},
{"docid":"1_5","label":"letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern"},
{"docid":"1_36","label":"drawing,painting,pattern"},
{"docid":"1_37","label":"art,drawing,outdoor"},
{"docid":"1_38","label":"aquarium,art,drawing"},
{"docid":"1_39","label":"abstract"},
{"docid":"1_40","label":"cartoon"},
{"docid":"1_41","label":"art,drawing"},
{"docid":"1_42","label":"art,pattern"},
{"docid":"1_43","label":"abstract,art,drawing,pattern"},
{"docid":"1_44","label":"drawing"},
{"docid":"1_45","label":"art"},
{"docid":"1_46","label":"abstract,colorfulness,pattern"},
{"docid":"1_47","label":"abstract,pattern"},
{"docid":"1_52","label":"abstract,cartoon"},
{"docid":"1_57","label":"abstract,drawing,pattern"},
{"docid":"1_58","label":"abstract,art,cartoon"},
{"docid":"1_68","label":"design"},
{"docid":"1_69","label":"geometry"}
{ "docid": "1_4", "label": "sign" },
{ "docid": "1_5", "label": "letter" },
{ "docid": "1_7", "label": "abstract,cartoon,design,pattern" },
{ "docid": "1_36", "label": "drawing,painting,pattern" },
{ "docid": "1_37", "label": "art,drawing,outdoor" },
{ "docid": "1_38", "label": "aquarium,art,drawing" },
{ "docid": "1_39", "label": "abstract" },
{ "docid": "1_40", "label": "cartoon" },
{ "docid": "1_41", "label": "art,drawing" },
{ "docid": "1_42", "label": "art,pattern" },
{ "docid": "1_43", "label": "abstract,art,drawing,pattern" },
{ "docid": "1_44", "label": "drawing" },
{ "docid": "1_45", "label": "art" },
{ "docid": "1_46", "label": "abstract,colorfulness,pattern" },
{ "docid": "1_47", "label": "abstract,pattern" },
{ "docid": "1_52", "label": "abstract,cartoon" },
{ "docid": "1_57", "label": "abstract,drawing,pattern" },
{ "docid": "1_58", "label": "abstract,art,cartoon" },
{ "docid": "1_68", "label": "design" },
{ "docid": "1_69", "label": "geometry" }
]);
insert_documents(&mut wtxn, &index, content);
@ -788,26 +812,26 @@ mod tests {
builder.execute(|_| ()).unwrap();
let content = documents!([
{"docid":"1_4","label":"sign"},
{"docid":"1_5","label":"letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern"},
{"docid":"1_36","label":"drawing,painting,pattern"},
{"docid":"1_37","label":"art,drawing,outdoor"},
{"docid":"1_38","label":"aquarium,art,drawing"},
{"docid":"1_39","label":"abstract"},
{"docid":"1_40","label":"cartoon"},
{"docid":"1_41","label":"art,drawing"},
{"docid":"1_42","label":"art,pattern"},
{"docid":"1_43","label":"abstract,art,drawing,pattern"},
{"docid":"1_44","label":"drawing"},
{"docid":"1_45","label":"art"},
{"docid":"1_46","label":"abstract,colorfulness,pattern"},
{"docid":"1_47","label":"abstract,pattern"},
{"docid":"1_52","label":"abstract,cartoon"},
{"docid":"1_57","label":"abstract,drawing,pattern"},
{"docid":"1_58","label":"abstract,art,cartoon"},
{"docid":"1_68","label":"design"},
{"docid":"1_69","label":"geometry"}
{ "docid": "1_4", "label": "sign" },
{ "docid": "1_5", "label": "letter" },
{ "docid": "1_7", "label": "abstract,cartoon,design,pattern" },
{ "docid": "1_36", "label": "drawing,painting,pattern" },
{ "docid": "1_37", "label": "art,drawing,outdoor" },
{ "docid": "1_38", "label": "aquarium,art,drawing" },
{ "docid": "1_39", "label": "abstract" },
{ "docid": "1_40", "label": "cartoon" },
{ "docid": "1_41", "label": "art,drawing" },
{ "docid": "1_42", "label": "art,pattern" },
{ "docid": "1_43", "label": "abstract,art,drawing,pattern" },
{ "docid": "1_44", "label": "drawing" },
{ "docid": "1_45", "label": "art" },
{ "docid": "1_46", "label": "abstract,colorfulness,pattern" },
{ "docid": "1_47", "label": "abstract,pattern" },
{ "docid": "1_52", "label": "abstract,cartoon" },
{ "docid": "1_57", "label": "abstract,drawing,pattern" },
{ "docid": "1_58", "label": "abstract,art,cartoon" },
{ "docid": "1_68", "label": "design" },
{ "docid": "1_69", "label": "geometry" }
]);
insert_documents(&mut wtxn, &index, content);
@ -841,26 +865,26 @@ mod tests {
builder.execute(|_| ()).unwrap();
let content = documents!([
{"docid":"1_4","label":"sign"},
{"docid":"1_5","label":"letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern"},
{"docid":"1_36","label":"drawing,painting,pattern"},
{"docid":"1_37","label":"art,drawing,outdoor"},
{"docid":"1_38","label":"aquarium,art,drawing"},
{"docid":"1_39","label":"abstract"},
{"docid":"1_40","label":"cartoon"},
{"docid":"1_41","label":"art,drawing"},
{"docid":"1_42","label":"art,pattern"},
{"docid":"1_43","label":"abstract,art,drawing,pattern"},
{"docid":"1_44","label":"drawing"},
{"docid":"1_45","label":"art"},
{"docid":"1_46","label":"abstract,colorfulness,pattern"},
{"docid":"1_47","label":"abstract,pattern"},
{"docid":"1_52","label":"abstract,cartoon"},
{"docid":"1_57","label":"abstract,drawing,pattern"},
{"docid":"1_58","label":"abstract,art,cartoon"},
{"docid":"1_68","label":"design"},
{"docid":"1_69","label":"geometry"}
{"docid": "1_4", "label": "sign"},
{"docid": "1_5", "label": "letter"},
{"docid": "1_7", "label": "abstract,cartoon,design,pattern"},
{"docid": "1_36","label": "drawing,painting,pattern"},
{"docid": "1_37","label": "art,drawing,outdoor"},
{"docid": "1_38","label": "aquarium,art,drawing"},
{"docid": "1_39","label": "abstract"},
{"docid": "1_40","label": "cartoon"},
{"docid": "1_41","label": "art,drawing"},
{"docid": "1_42","label": "art,pattern"},
{"docid": "1_43","label": "abstract,art,drawing,pattern"},
{"docid": "1_44","label": "drawing"},
{"docid": "1_45","label": "art"},
{"docid": "1_46","label": "abstract,colorfulness,pattern"},
{"docid": "1_47","label": "abstract,pattern"},
{"docid": "1_52","label": "abstract,cartoon"},
{"docid": "1_57","label": "abstract,drawing,pattern"},
{"docid": "1_58","label": "abstract,art,cartoon"},
{"docid": "1_68","label": "design"},
{"docid": "1_69","label": "geometry"}
]);
insert_documents(&mut wtxn, &index, content);
@ -896,26 +920,26 @@ mod tests {
builder.execute(|_| ()).unwrap();
let content = documents!([
{"id":"1","city":"Lille", "_geo": { "lat": 50.6299 as f32, "lng": 3.0569 as f32 } },
{"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.6415 as f32, "lng": 3.1106 as f32 } },
{"id":"3","city":"Hellemmes", "_geo": { "lat": 50.6312 as f32, "lng": 3.1106 as f32 } },
{"id":"4","city":"Villeneuve-d'Ascq", "_geo": { "lat": 50.6224 as f32, "lng": 3.1476 as f32 } },
{"id":"5","city":"Hem", "_geo": { "lat": 50.6552 as f32, "lng": 3.1897 as f32 } },
{"id":"6","city":"Roubaix", "_geo": { "lat": 50.6924 as f32, "lng": 3.1763 as f32 } },
{"id":"7","city":"Tourcoing", "_geo": { "lat": 50.7263 as f32, "lng": 3.1541 as f32 } },
{"id":"8","city":"Mouscron", "_geo": { "lat": 50.7453 as f32, "lng": 3.2206 as f32 } },
{"id":"9","city":"Tournai", "_geo": { "lat": 50.6053 as f32, "lng": 3.3758 as f32 } },
{"id":"10","city":"Ghent", "_geo": { "lat": 51.0537 as f32, "lng": 3.6957 as f32 } },
{"id":"11","city":"Brussels", "_geo": { "lat": 50.8466 as f32, "lng": 4.3370 as f32 } },
{"id":"12","city":"Charleroi", "_geo": { "lat": 50.4095 as f32, "lng": 4.4347 as f32 } },
{"id":"13","city":"Mons", "_geo": { "lat": 50.4502 as f32, "lng": 3.9623 as f32 } },
{"id":"14","city":"Valenciennes", "_geo": { "lat": 50.3518 as f32, "lng": 3.5326 as f32 } },
{"id":"15","city":"Arras", "_geo": { "lat": 50.2844 as f32, "lng": 2.7637 as f32 } },
{"id":"16","city":"Cambrai", "_geo": { "lat": 50.1793 as f32, "lng": 3.2189 as f32 } },
{"id":"17","city":"Bapaume", "_geo": { "lat": 50.1112 as f32, "lng": 2.8547 as f32 } },
{"id":"18","city":"Amiens", "_geo": { "lat": 49.9314 as f32, "lng": 2.2710 as f32 } },
{"id":"19","city":"Compiègne", "_geo": { "lat": 49.4449 as f32, "lng": 2.7913 as f32 } },
{"id":"20","city":"Paris", "_geo": { "lat": 48.9021 as f32, "lng": 2.3708 as f32 } }
{ "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } },
{ "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } },
{ "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } },
{ "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
{ "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } },
{ "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } },
{ "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } },
{ "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } },
{ "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } },
{ "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } },
{ "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } },
{ "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } },
{ "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } },
{ "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } },
{ "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } },
{ "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } },
{ "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } },
{ "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } },
{ "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } },
{ "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } }
]);
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
@ -951,26 +975,26 @@ mod tests {
builder.execute(|_| ()).unwrap();
let content = documents!([
{"docid":"1_4","label":"sign"},
{"docid":"1_5","label":"letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern"},
{"docid":"1_36","label":"drawing,painting,pattern"},
{"docid":"1_37","label":"art,drawing,outdoor"},
{"docid":"1_38","label":"aquarium,art,drawing"},
{"docid":"1_39","label":"abstract"},
{"docid":"1_40","label":"cartoon"},
{"docid":"1_41","label":"art,drawing"},
{"docid":"1_42","label":"art,pattern"},
{"docid":"1_43","label":"abstract,art,drawing,pattern"},
{"docid":"1_44","label":"drawing"},
{"docid":"1_45","label":"art"},
{"docid":"1_46","label":"abstract,colorfulness,pattern"},
{"docid":"1_47","label":"abstract,pattern"},
{"docid":"1_52","label":"abstract,cartoon"},
{"docid":"1_57","label":"abstract,drawing,pattern"},
{"docid":"1_58","label":"abstract,art,cartoon"},
{"docid":"1_68","label":"design"},
{"docid":"1_69","label":"geometry"}
{ "docid": "1_4", "label": "sign" },
{ "docid": "1_5", "label": "letter" },
{ "docid": "1_7", "label": "abstract,cartoon,design,pattern" },
{ "docid": "1_36", "label": "drawing,painting,pattern" },
{ "docid": "1_37", "label": "art,drawing,outdoor" },
{ "docid": "1_38", "label": "aquarium,art,drawing" },
{ "docid": "1_39", "label": "abstract" },
{ "docid": "1_40", "label": "cartoon" },
{ "docid": "1_41", "label": "art,drawing" },
{ "docid": "1_42", "label": "art,pattern" },
{ "docid": "1_43", "label": "abstract,art,drawing,pattern" },
{ "docid": "1_44", "label": "drawing" },
{ "docid": "1_45", "label": "art" },
{ "docid": "1_46", "label": "abstract,colorfulness,pattern" },
{ "docid": "1_47", "label": "abstract,pattern" },
{ "docid": "1_52", "label": "abstract,cartoon" },
{ "docid": "1_57", "label": "abstract,drawing,pattern" },
{ "docid": "1_58", "label": "abstract,art,cartoon" },
{ "docid": "1_68", "label": "design" },
{ "docid": "1_69", "label": "geometry" }
]);
insert_documents(&mut wtxn, &index, content);
@ -1021,26 +1045,26 @@ mod tests {
builder.execute(|_| ()).unwrap();
let content = documents!([
{"docid":"1_4","label":"sign"},
{"docid":"1_5","label":"letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern", "title": "Mickey Mouse"},
{"docid":"1_36","label":"drawing,painting,pattern"},
{"docid":"1_37","label":"art,drawing,outdoor"},
{"docid":"1_38","label":"aquarium,art,drawing", "title": "Nemo"},
{"docid":"1_39","label":"abstract"},
{"docid":"1_40","label":"cartoon"},
{"docid":"1_41","label":"art,drawing"},
{"docid":"1_42","label":"art,pattern"},
{"docid":"1_43","label":"abstract,art,drawing,pattern", "number": 32i32},
{"docid":"1_44","label":"drawing", "number": 44i32},
{"docid":"1_45","label":"art"},
{"docid":"1_46","label":"abstract,colorfulness,pattern"},
{"docid":"1_47","label":"abstract,pattern"},
{"docid":"1_52","label":"abstract,cartoon"},
{"docid":"1_57","label":"abstract,drawing,pattern"},
{"docid":"1_58","label":"abstract,art,cartoon"},
{"docid":"1_68","label":"design"},
{"docid":"1_69","label":"geometry"}
{ "docid": "1_4", "label": "sign"},
{ "docid": "1_5", "label": "letter"},
{ "docid": "1_7", "label": "abstract,cartoon,design,pattern", "title": "Mickey Mouse"},
{ "docid": "1_36", "label": "drawing,painting,pattern"},
{ "docid": "1_37", "label": "art,drawing,outdoor"},
{ "docid": "1_38", "label": "aquarium,art,drawing", "title": "Nemo"},
{ "docid": "1_39", "label": "abstract"},
{ "docid": "1_40", "label": "cartoon"},
{ "docid": "1_41", "label": "art,drawing"},
{ "docid": "1_42", "label": "art,pattern"},
{ "docid": "1_43", "label": "abstract,art,drawing,pattern", "number": 32i32},
{ "docid": "1_44", "label": "drawing", "number": 44i32},
{ "docid": "1_45", "label": "art"},
{ "docid": "1_46", "label": "abstract,colorfulness,pattern"},
{ "docid": "1_47", "label": "abstract,pattern"},
{ "docid": "1_52", "label": "abstract,cartoon"},
{ "docid": "1_57", "label": "abstract,drawing,pattern"},
{ "docid": "1_58", "label": "abstract,art,cartoon"},
{ "docid": "1_68", "label": "design"},
{ "docid": "1_69", "label": "geometry"}
]);
insert_documents(&mut wtxn, &index, content);

View File

@ -719,10 +719,11 @@ mod tests {
assert_eq!(count, 1);
// Check that we get only one document from the database.
let docs = index.documents(&rtxn, Some(0)).unwrap();
// Since the document has been deleted and re-inserted, its internal docid has been incremented to 1
let docs = index.documents(&rtxn, Some(1)).unwrap();
assert_eq!(docs.len(), 1);
let (id, doc) = docs[0];
assert_eq!(id, 0);
assert_eq!(id, 1);
// Check that this document is equal to the last one sent.
let mut doc_iter = doc.iter();
@ -809,11 +810,12 @@ mod tests {
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
let (kevin_id, _) =
docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
let (id, doc) = docs[*kevin_id as usize];
assert_eq!(id, *kevin_id);
// the document 0 has been deleted and reinserted with the id 3
let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap();
let kevin_position =
docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
assert_eq!(kevin_position, 2);
let (_, doc) = docs[kevin_position];
// Check that this document is equal to the last
// one sent and that an UUID has been generated.

View File

@ -4,7 +4,6 @@ use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use byteorder::ReadBytesExt;
use fxhash::FxHashMap;
use heed::RoTxn;
use itertools::Itertools;
@ -57,7 +56,7 @@ pub struct Transform<'a, 'i> {
flattened_sorter: grenad::Sorter<MergeFn>,
replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
// To increase the cache locality and the heap usage we use smartstring.
// To increase the cache locality and decrease the heap usage we use compact smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize,
}
@ -130,13 +129,17 @@ impl<'a, 'i> Transform<'a, 'i> {
indexer_settings.max_memory.map(|mem| mem / 2),
);
let documents_ids = index.documents_ids(wtxn)?;
let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?;
Ok(Transform {
index,
fields_ids_map: index.fields_ids_map(wtxn)?,
indexer_settings,
autogenerate_docids,
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
available_documents_ids: AvailableDocumentsIds::from_documents_ids(
&documents_ids,
&soft_deleted_documents_ids,
),
original_sorter,
flattened_sorter,
index_documents_method,
@ -248,45 +251,39 @@ impl<'a, 'i> Transform<'a, 'i> {
writer.insert(*k, v)?;
}
let (docid, should_insert_original_document) =
match external_documents_ids.get(&*external_id) {
// if the document is in the db but has already been inserted
// (ie: already exists in the list of replaced documents ids),
// we should not add the original document a second time.
Some(docid) => (docid, !self.replaced_documents_ids.contains(docid)),
None => {
// if the document has already been inserted in this
// batch we need to get its docid
match self.new_external_documents_ids_builder.entry(external_id.into()) {
Entry::Occupied(entry) => (*entry.get() as u32, false),
// if the document has never been encountered we give it a new docid
// and push this new docid to the external documents ids builder
Entry::Vacant(entry) => {
let new_docid = self
.available_documents_ids
.next()
.ok_or(UserError::DocumentLimitReached)?;
entry.insert(new_docid as u64);
(new_docid, false)
}
}
let mut original_docid = None;
let docid = match self.new_external_documents_ids_builder.entry(external_id.into()) {
Entry::Occupied(entry) => *entry.get() as u32,
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a replaced document.
// It'll be deleted later. We keep its original docid to insert it in the grenad.
if let Some(docid) = external_documents_ids.get(entry.key()) {
self.replaced_documents_ids.insert(docid);
original_docid = Some(docid);
}
};
let docid = self
.available_documents_ids
.next()
.ok_or(UserError::DocumentLimitReached)?;
entry.insert(docid as u64);
docid
}
};
if should_insert_original_document {
self.replaced_documents_ids.insert(docid);
let key = BEU32::new(docid);
if let Some(original_docid) = original_docid {
let original_key = BEU32::new(original_docid);
let base_obkv = self
.index
.documents
.remap_data_type::<heed::types::ByteSlice>()
.get(wtxn, &key)?
.get(wtxn, &original_key)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;
// we associate the base document with the new key, everything will get merged later.
self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
@ -506,6 +503,39 @@ impl<'a, 'i> Transform<'a, 'i> {
Ok(())
}
fn remove_deleted_documents_from_field_distribution(
&self,
rtxn: &RoTxn,
field_distribution: &mut FieldDistribution,
) -> Result<()> {
for deleted_docid in self.replaced_documents_ids.iter() {
let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
for (key, _) in obkv.iter() {
let name =
self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: key,
process: "Computing field distribution in transform.",
})?;
// We checked that the document was in the db earlier. If we can't find it it means
// there is an inconsistency between the field distribution and the field id map.
let field =
field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: key,
process: "Accessing field distribution in transform.",
})?;
*field -= 1;
if *field == 0 {
// since we were able to get the field right before it's safe to unwrap here
field_distribution.remove(name).unwrap();
}
}
}
Ok(())
}
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
/// id for the user side and the value must be an obkv where keys are valid fields ids.
@ -532,9 +562,14 @@ impl<'a, 'i> Transform<'a, 'i> {
tempfile::tempfile()?,
);
// Once we have all the documents in the sorter, we write the documents
// in the writer. We also generate the field distribution.
// To compute the field distribution we need to;
// 1. Remove all the deleted documents from the field distribution
// 2. Add all the new documents to the field distribution
let mut field_distribution = self.index.field_distribution(wtxn)?;
self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?;
// Here we are going to do the document count + field distribution + `write_into_stream_writer`
let mut iter = self.original_sorter.into_stream_merger_iter()?;
// used only for the callback
let mut documents_count = 0;
@ -547,36 +582,6 @@ impl<'a, 'i> Transform<'a, 'i> {
total_documents: self.documents_count,
});
let u32_key = key.clone().read_u32::<byteorder::BigEndian>()?;
// if the document was already in the db we remove all of its field
// from the field distribution.
if self.replaced_documents_ids.contains(u32_key) {
let obkv = self.index.documents.get(wtxn, &BEU32::new(u32_key))?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
for (key, _) in obkv.iter() {
let name =
self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: key,
process: "Computing field distribution in transform.",
})?;
// We checked that the document was in the db earlier. If we can't find it it means
// there is an inconsistency between the field distribution and the field id map.
let field = field_distribution.get_mut(name).ok_or(
FieldIdMapMissingEntry::FieldId {
field_id: key,
process: "Accessing field distribution in transform.",
},
)?;
*field -= 1;
if *field == 0 {
// since we were able to get the field right before it's safe to unwrap here
field_distribution.remove(name).unwrap();
}
}
}
// We increment all the field of the current document in the field distribution.
let obkv = KvReader::new(val);