diff --git a/milli/src/error.rs b/milli/src/error.rs index caabb96fc..57ae1c85a 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -83,6 +83,8 @@ pub enum FieldIdMapMissingEntry { #[derive(Error, Debug)] pub enum UserError { + #[error("A soft deleted internal document id have been used: `{document_id}`.")] + AccessingSoftDeletedDocument { document_id: DocumentId }, #[error("A document cannot contain more than 65,535 fields.")] AttributeLimitReached, #[error(transparent)] diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 3dce18b00..6029722af 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -5,26 +5,30 @@ use std::{fmt, str}; use fst::map::IndexedValue; use fst::{IntoStreamer, Streamer}; +use roaring::RoaringBitmap; const DELETED_ID: u64 = u64::MAX; pub struct ExternalDocumentsIds<'a> { pub(crate) hard: fst::Map>, pub(crate) soft: fst::Map>, + soft_deleted_docids: RoaringBitmap, } impl<'a> ExternalDocumentsIds<'a> { pub fn new( hard: fst::Map>, soft: fst::Map>, + soft_deleted_docids: RoaringBitmap, ) -> ExternalDocumentsIds<'a> { - ExternalDocumentsIds { hard, soft } + ExternalDocumentsIds { hard, soft, soft_deleted_docids } } pub fn into_static(self) -> ExternalDocumentsIds<'static> { ExternalDocumentsIds { hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), + soft_deleted_docids: self.soft_deleted_docids, } } @@ -36,7 +40,9 @@ impl<'a> ExternalDocumentsIds<'a> { pub fn get>(&self, external_id: A) -> Option { let external_id = external_id.as_ref(); match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { - Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()), + Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => { + Some(id.try_into().unwrap()) + } _otherwise => None, } } @@ -134,6 +140,7 @@ impl Default for ExternalDocumentsIds<'static> { ExternalDocumentsIds { hard: fst::Map::default().map_data(Cow::Owned).unwrap(), soft: fst::Map::default().map_data(Cow::Owned).unwrap(), + soft_deleted_docids: RoaringBitmap::new(), } } } diff --git a/milli/src/index.rs b/milli/src/index.rs index d89246dd8..9637b4103 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -32,6 +32,7 @@ pub mod main_key { pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; + pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids"; pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; @@ -254,6 +255,29 @@ impl Index { Ok(count.unwrap_or_default()) } + /* deleted documents ids */ + + /// Writes the soft deleted documents ids. + pub(crate) fn put_soft_deleted_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY, + docids, + ) + } + + /// Returns the soft deleted documents ids. + pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)? + .unwrap_or_default()) + } + /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. @@ -280,7 +304,7 @@ impl Index { wtxn: &mut RwTxn, external_documents_ids: &ExternalDocumentsIds<'a>, ) -> heed::Result<()> { - let ExternalDocumentsIds { hard, soft } = external_documents_ids; + let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids; let hard = hard.as_fst().as_bytes(); let soft = soft.as_fst().as_bytes(); self.main.put::<_, Str, ByteSlice>( @@ -311,7 +335,8 @@ impl Index { Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, None => fst::Map::default().map_data(Cow::Owned)?, }; - Ok(ExternalDocumentsIds::new(hard, soft)) + let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; + Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids)) } /* fields ids map */ @@ -929,9 +954,13 @@ impl Index { rtxn: &'t RoTxn, ids: impl IntoIterator, ) -> Result)>> { + let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; let mut documents = Vec::new(); for id in ids { + if soft_deleted_documents.contains(id) { + return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?; + } let kv = self .documents .get(rtxn, &BEU32::new(id))? @@ -947,11 +976,16 @@ impl Index { &self, rtxn: &'t RoTxn, ) -> Result)>>> { + let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; + Ok(self .documents .iter(rtxn)? // we cast the BEU32 to a DocumentId - .map(|document| document.map(|(id, obkv)| (id.get(), obkv)))) + .map(|document| document.map(|(id, obkv)| (id.get(), obkv))) + .filter(move |document| { + document.as_ref().map_or(true, |(id, _)| !soft_deleted_docids.contains(*id)) + })) } pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 91caa0171..d89413f62 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -10,9 +10,7 @@ use roaring::RoaringBitmap; use super::FacetNumberRange; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, -}; +use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{ distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, }; @@ -266,11 +264,12 @@ impl<'a> Filter<'a> { fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, - numbers_db: heed::Database, - strings_db: heed::Database, field_id: FieldId, operator: &Condition<'a>, ) -> Result { + let numbers_db = index.facet_id_f64_docids; + let strings_db = index.facet_id_string_docids; + // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. @@ -309,9 +308,7 @@ impl<'a> Filter<'a> { let all_numbers_ids = index.number_faceted_documents_ids(rtxn, field_id)?; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator( - rtxn, index, numbers_db, strings_db, field_id, &operator, - )?; + let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; return Ok((all_numbers_ids | all_strings_ids) - docids); } }; @@ -342,17 +339,27 @@ impl<'a> Filter<'a> { } pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { - let numbers_db = index.facet_id_f64_docids; - let strings_db = index.facet_id_string_docids; + // to avoid doing this for each recursive call we're going to do it ONCE ahead of time + let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; + // and finally we delete all the soft_deleted_documents, again, only once at the very end + self.inner_evaluate(rtxn, index, &filterable_fields) + .map(|result| result - soft_deleted_documents) + } + + fn inner_evaluate( + &self, + rtxn: &heed::RoTxn, + index: &Index, + filterable_fields: &HashSet, + ) -> Result { match &self.condition { FilterCondition::Condition { fid, op } => { - let filterable_fields = index.filterable_fields(rtxn)?; - - if crate::is_faceted(fid.value(), &filterable_fields) { + if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) + Self::evaluate_operator(rtxn, index, fid, &op) } else { return Ok(RoaringBitmap::new()); } @@ -371,7 +378,7 @@ impl<'a> Filter<'a> { return Err(fid.as_external_error( FilterError::AttributeNotFilterable { attribute, - filterable_fields, + filterable_fields: filterable_fields.clone(), }, ))?; } @@ -379,17 +386,39 @@ impl<'a> Filter<'a> { } } FilterCondition::Or(lhs, rhs) => { - let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?; - let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; + let lhs = Self::inner_evaluate( + &(lhs.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; + let rhs = Self::inner_evaluate( + &(rhs.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; Ok(lhs | rhs) } FilterCondition::And(lhs, rhs) => { - let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?; - let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; + let lhs = Self::inner_evaluate( + &(lhs.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; + if lhs.is_empty() { + return Ok(lhs); + } + let rhs = Self::inner_evaluate( + &(rhs.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; Ok(lhs & rhs) } FilterCondition::GeoLowerThan { point, radius } => { - let filterable_fields = index.filterable_fields(rtxn)?; if filterable_fields.contains("_geo") { let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?]; if !(-90.0..=90.0).contains(&base_point[0]) { @@ -422,16 +451,17 @@ impl<'a> Filter<'a> { } else { return Err(point[0].as_external_error(FilterError::AttributeNotFilterable { attribute: "_geo", - filterable_fields, + filterable_fields: filterable_fields.clone(), }))?; } } FilterCondition::GeoGreaterThan { point, radius } => { - let result = Self::evaluate( + let result = Self::inner_evaluate( &FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() } .into(), rtxn, index, + filterable_fields, )?; let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; Ok(geo_faceted_doc_ids - result) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 447ba4984..1930091ef 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -214,7 +214,7 @@ impl<'a> Search<'a> { ) -> Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); - let mut excluded_candidates = RoaringBitmap::new(); + let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?; let mut documents_ids = Vec::new(); while let Some(FinalResult { candidates, bucket_candidates, .. }) = diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 653bc7dd2..3e4ec5600 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -8,11 +8,16 @@ pub struct AvailableDocumentsIds { } impl AvailableDocumentsIds { - pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { - match docids.max() { + pub fn from_documents_ids( + docids: &RoaringBitmap, + soft_deleted_docids: &RoaringBitmap, + ) -> AvailableDocumentsIds { + let used_docids = docids | soft_deleted_docids; + + match used_docids.max() { Some(last_id) => { let mut available = RoaringBitmap::from_iter(0..last_id); - available -= docids; + available -= used_docids; let iter = match last_id.checked_add(1) { Some(id) => id..=u32::max_value(), @@ -44,7 +49,7 @@ mod tests { #[test] fn empty() { let base = RoaringBitmap::new(); - let left = AvailableDocumentsIds::from_documents_ids(&base); + let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); let right = 0..=u32::max_value(); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } @@ -57,8 +62,28 @@ mod tests { base.insert(100); base.insert(405); - let left = AvailableDocumentsIds::from_documents_ids(&base); + let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } + + #[test] + fn soft_deleted() { + let mut base = RoaringBitmap::new(); + base.insert(0); + base.insert(10); + base.insert(100); + base.insert(405); + + let mut soft_deleted = RoaringBitmap::new(); + soft_deleted.insert(1); + soft_deleted.insert(11); + soft_deleted.insert(101); + soft_deleted.insert(406); + + let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted); + let right = + (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n)); + left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); + } } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index f93ba60fa..d1939df7b 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -35,6 +35,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { documents, } = self.index; + let empty_roaring = RoaringBitmap::default(); + // We retrieve the number of documents ids that we are deleting. let number_of_documents = self.index.number_of_documents(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; @@ -43,16 +45,16 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; - self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; + self.index.put_documents_ids(self.wtxn, &empty_roaring)?; + self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; // We clean all the faceted documents ids. - let empty = RoaringBitmap::default(); for field_id in faceted_fields { - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?; + self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; + self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; } // Clear the other databases. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index ef4f849cc..3b519c101 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,5 +1,4 @@ use std::collections::btree_map::Entry; -use std::collections::HashMap; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; @@ -17,15 +16,19 @@ use crate::heed_codec::facet::{ use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32, - BEU32, + DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, + RoaringBitmapCodec, SmallString32, BEU32, }; +/// The threshold we use to determine after which number of documents we want to clear the +/// soft-deleted database and delete documents for real. +const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000; + pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, - documents_ids: RoaringBitmap, + to_delete_docids: RoaringBitmap, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -45,16 +48,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { wtxn, index, external_documents_ids, - documents_ids: RoaringBitmap::new(), + to_delete_docids: RoaringBitmap::new(), }) } pub fn delete_document(&mut self, docid: u32) { - self.documents_ids.insert(docid); + self.to_delete_docids.insert(docid); } pub fn delete_documents(&mut self, docids: &RoaringBitmap) { - self.documents_ids |= docids; + self.to_delete_docids |= docids; } pub fn delete_external_id(&mut self, external_id: &str) -> Option { @@ -63,28 +66,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { Some(docid) } - pub fn execute(self) -> Result { + pub fn execute(mut self) -> Result { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; + let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; let current_documents_ids_len = documents_ids.len(); // We can and must stop removing documents in a database that is empty. if documents_ids.is_empty() { - return Ok(DocumentDeletionResult { - deleted_documents: 0, - remaining_documents: current_documents_ids_len, - }); + // but if there was still documents to delete we clear the database entirely + if !soft_deleted_docids.is_empty() { + ClearDocuments::new(self.wtxn, self.index).execute()?; + } + return Ok(DocumentDeletionResult { deleted_documents: 0, remaining_documents: 0 }); } // We remove the documents ids that we want to delete // from the documents in the database and write them back. - documents_ids -= &self.documents_ids; + documents_ids -= &self.to_delete_docids; self.index.put_documents_ids(self.wtxn, &documents_ids)?; // We can execute a ClearDocuments operation when the number of documents // to delete is exactly the number of documents in the database. - if current_documents_ids_len == self.documents_ids.len() { + if current_documents_ids_len == self.to_delete_docids.len() { let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; return Ok(DocumentDeletionResult { deleted_documents: current_documents_ids_len, @@ -93,6 +98,50 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let mut field_distribution = self.index.field_distribution(self.wtxn)?; + + // we update the field distribution + for docid in self.to_delete_docids.iter() { + let key = BEU32::new(docid); + let document = + self.index.documents.get(self.wtxn, &key)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: "documents", key: None }, + )?; + for (fid, _value) in document.iter() { + let field_name = + fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "delete documents", + })?; + if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) + { + match entry.get().checked_sub(1) { + Some(0) | None => entry.remove(), + Some(count) => entry.insert(count), + }; + } + } + } + + self.index.put_field_distribution(self.wtxn, &field_distribution)?; + + soft_deleted_docids |= &self.to_delete_docids; + + // if we have less documents to delete than the threshold we simply save them in + // the `soft_deleted_documents_ids` bitmap and early exit. + if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD { + self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; + return Ok(DocumentDeletionResult { + deleted_documents: self.to_delete_docids.len(), + remaining_documents: documents_ids.len(), + }); + } + + // There is more than documents to delete than the threshold we needs to delete them all + self.to_delete_docids = soft_deleted_docids; + // and we can reset the soft deleted bitmap + self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; + let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| { InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, @@ -127,23 +176,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { documents, } = self.index; - // Number of fields for each document that has been deleted. - let mut fields_ids_distribution_diff = HashMap::new(); - // Retrieve the words and the external documents ids contained in the documents. let mut words = Vec::new(); let mut external_ids = Vec::new(); - for docid in &self.documents_ids { + for docid in &self.to_delete_docids { // We create an iterator to be able to get the content and delete the document // content itself. It's faster to acquire a cursor to get and delete, // as we avoid traversing the LMDB B-Tree two times but only once. let key = BEU32::new(docid); let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; if let Some((_key, obkv)) = iter.next().transpose()? { - for (field_id, _) in obkv.iter() { - *fields_ids_distribution_diff.entry(field_id).or_default() += 1; - } - if let Some(content) = obkv.get(id_field) { let external_id = match serde_json::from_slice(content).unwrap() { Value::String(string) => SmallString32::from(string.as_str()), @@ -171,24 +213,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } - let mut field_distribution = self.index.field_distribution(self.wtxn)?; - - // We use pre-calculated number of fields occurrences that needs to be deleted - // to reflect deleted documents. - // If all field occurrences are removed, delete the entry from distribution. - // Otherwise, insert new number of occurrences (current_count - count_diff). - for (field_id, count_diff) in fields_ids_distribution_diff { - let field_name = fields_ids_map.name(field_id).unwrap(); - if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) { - match entry.get().checked_sub(count_diff) { - Some(0) | None => entry.remove(), - Some(count) => entry.insert(count), - }; - } - } - - self.index.put_field_distribution(self.wtxn, &field_distribution)?; - // We create the FST map of the external ids that we must delete. external_ids.sort_unstable(); let external_ids_to_delete = fst::Set::from_iter(external_ids)?; @@ -214,7 +238,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_docids, word.as_str(), must_remove, - &self.documents_ids, + &self.to_delete_docids, )?; remove_from_word_docids( @@ -222,7 +246,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { exact_word_docids, word.as_str(), must_remove, - &self.documents_ids, + &self.to_delete_docids, )?; } @@ -256,12 +280,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_words_fst(self.wtxn, &new_words_fst)?; let prefixes_to_delete = - remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?; + remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?; let exact_prefix_to_delete = remove_from_word_prefix_docids( self.wtxn, exact_word_prefix_docids, - &self.documents_ids, + &self.to_delete_docids, )?; let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); @@ -293,7 +317,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (key, mut docids) = result?; let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -314,7 +338,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -332,7 +356,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -351,7 +375,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -368,7 +392,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; while let Some((key, mut docids)) = iter.next().transpose()? { let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -386,7 +410,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree .iter() - .filter(|&point| self.documents_ids.contains(point.data.0)) + .filter(|&point| self.to_delete_docids.contains(point.data.0)) .cloned() .map(|point| (point, point.data.0)) .unzip(); @@ -403,46 +427,46 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { remove_docids_from_facet_field_id_number_docids( self.wtxn, facet_id_f64_docids, - &self.documents_ids, + &self.to_delete_docids, )?; remove_docids_from_facet_field_id_string_docids( self.wtxn, facet_id_string_docids, - &self.documents_ids, + &self.to_delete_docids, )?; // Remove the documents ids from the faceted documents ids. for field_id in self.index.faceted_fields_ids(self.wtxn)? { // Remove docids from the number faceted documents ids let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; remove_docids_from_field_id_docid_facet_value( self.wtxn, field_id_docid_facet_f64s, field_id, - &self.documents_ids, + &self.to_delete_docids, |(_fid, docid, _value)| docid, )?; // Remove docids from the string faceted documents ids let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; remove_docids_from_field_id_docid_facet_value( self.wtxn, field_id_docid_facet_strings, field_id, - &self.documents_ids, + &self.to_delete_docids, |(_fid, docid, _value)| docid, )?; } Ok(DocumentDeletionResult { - deleted_documents: self.documents_ids.len(), + deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), }) } @@ -741,26 +765,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern"}, - {"docid":"1_44","label":"drawing"}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + { "docid": "1_4", "label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } ]); insert_documents(&mut wtxn, &index, content); @@ -788,26 +812,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern"}, - {"docid":"1_44","label":"drawing"}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + { "docid": "1_4", "label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } ]); insert_documents(&mut wtxn, &index, content); @@ -841,26 +865,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern"}, - {"docid":"1_44","label":"drawing"}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + {"docid": "1_4", "label": "sign"}, + {"docid": "1_5", "label": "letter"}, + {"docid": "1_7", "label": "abstract,cartoon,design,pattern"}, + {"docid": "1_36","label": "drawing,painting,pattern"}, + {"docid": "1_37","label": "art,drawing,outdoor"}, + {"docid": "1_38","label": "aquarium,art,drawing"}, + {"docid": "1_39","label": "abstract"}, + {"docid": "1_40","label": "cartoon"}, + {"docid": "1_41","label": "art,drawing"}, + {"docid": "1_42","label": "art,pattern"}, + {"docid": "1_43","label": "abstract,art,drawing,pattern"}, + {"docid": "1_44","label": "drawing"}, + {"docid": "1_45","label": "art"}, + {"docid": "1_46","label": "abstract,colorfulness,pattern"}, + {"docid": "1_47","label": "abstract,pattern"}, + {"docid": "1_52","label": "abstract,cartoon"}, + {"docid": "1_57","label": "abstract,drawing,pattern"}, + {"docid": "1_58","label": "abstract,art,cartoon"}, + {"docid": "1_68","label": "design"}, + {"docid": "1_69","label": "geometry"} ]); insert_documents(&mut wtxn, &index, content); @@ -896,26 +920,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"id":"1","city":"Lille", "_geo": { "lat": 50.6299 as f32, "lng": 3.0569 as f32 } }, - {"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.6415 as f32, "lng": 3.1106 as f32 } }, - {"id":"3","city":"Hellemmes", "_geo": { "lat": 50.6312 as f32, "lng": 3.1106 as f32 } }, - {"id":"4","city":"Villeneuve-d'Ascq", "_geo": { "lat": 50.6224 as f32, "lng": 3.1476 as f32 } }, - {"id":"5","city":"Hem", "_geo": { "lat": 50.6552 as f32, "lng": 3.1897 as f32 } }, - {"id":"6","city":"Roubaix", "_geo": { "lat": 50.6924 as f32, "lng": 3.1763 as f32 } }, - {"id":"7","city":"Tourcoing", "_geo": { "lat": 50.7263 as f32, "lng": 3.1541 as f32 } }, - {"id":"8","city":"Mouscron", "_geo": { "lat": 50.7453 as f32, "lng": 3.2206 as f32 } }, - {"id":"9","city":"Tournai", "_geo": { "lat": 50.6053 as f32, "lng": 3.3758 as f32 } }, - {"id":"10","city":"Ghent", "_geo": { "lat": 51.0537 as f32, "lng": 3.6957 as f32 } }, - {"id":"11","city":"Brussels", "_geo": { "lat": 50.8466 as f32, "lng": 4.3370 as f32 } }, - {"id":"12","city":"Charleroi", "_geo": { "lat": 50.4095 as f32, "lng": 4.4347 as f32 } }, - {"id":"13","city":"Mons", "_geo": { "lat": 50.4502 as f32, "lng": 3.9623 as f32 } }, - {"id":"14","city":"Valenciennes", "_geo": { "lat": 50.3518 as f32, "lng": 3.5326 as f32 } }, - {"id":"15","city":"Arras", "_geo": { "lat": 50.2844 as f32, "lng": 2.7637 as f32 } }, - {"id":"16","city":"Cambrai", "_geo": { "lat": 50.1793 as f32, "lng": 3.2189 as f32 } }, - {"id":"17","city":"Bapaume", "_geo": { "lat": 50.1112 as f32, "lng": 2.8547 as f32 } }, - {"id":"18","city":"Amiens", "_geo": { "lat": 49.9314 as f32, "lng": 2.2710 as f32 } }, - {"id":"19","city":"Compiègne", "_geo": { "lat": 49.4449 as f32, "lng": 2.7913 as f32 } }, - {"id":"20","city":"Paris", "_geo": { "lat": 48.9021 as f32, "lng": 2.3708 as f32 } } + { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, + { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, + { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, + { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, + { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, + { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, + { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, + { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, + { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } }, + { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, + { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, + { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, + { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, + { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, + { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, + { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, + { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } }, + { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, + { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, + { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } ]); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; @@ -951,26 +975,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern"}, - {"docid":"1_44","label":"drawing"}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + { "docid": "1_4", "label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } ]); insert_documents(&mut wtxn, &index, content); @@ -1021,26 +1045,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing", "title": "Nemo"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern", "number": 32i32}, - {"docid":"1_44","label":"drawing", "number": 44i32}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + { "docid": "1_4", "label": "sign"}, + { "docid": "1_5", "label": "letter"}, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, + { "docid": "1_36", "label": "drawing,painting,pattern"}, + { "docid": "1_37", "label": "art,drawing,outdoor"}, + { "docid": "1_38", "label": "aquarium,art,drawing", "title": "Nemo"}, + { "docid": "1_39", "label": "abstract"}, + { "docid": "1_40", "label": "cartoon"}, + { "docid": "1_41", "label": "art,drawing"}, + { "docid": "1_42", "label": "art,pattern"}, + { "docid": "1_43", "label": "abstract,art,drawing,pattern", "number": 32i32}, + { "docid": "1_44", "label": "drawing", "number": 44i32}, + { "docid": "1_45", "label": "art"}, + { "docid": "1_46", "label": "abstract,colorfulness,pattern"}, + { "docid": "1_47", "label": "abstract,pattern"}, + { "docid": "1_52", "label": "abstract,cartoon"}, + { "docid": "1_57", "label": "abstract,drawing,pattern"}, + { "docid": "1_58", "label": "abstract,art,cartoon"}, + { "docid": "1_68", "label": "design"}, + { "docid": "1_69", "label": "geometry"} ]); insert_documents(&mut wtxn, &index, content); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5b6af12ae..ba428f078 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -719,10 +719,11 @@ mod tests { assert_eq!(count, 1); // Check that we get only one document from the database. - let docs = index.documents(&rtxn, Some(0)).unwrap(); + // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1 + let docs = index.documents(&rtxn, Some(1)).unwrap(); assert_eq!(docs.len(), 1); let (id, doc) = docs[0]; - assert_eq!(id, 0); + assert_eq!(id, 1); // Check that this document is equal to the last one sent. let mut doc_iter = doc.iter(); @@ -809,11 +810,12 @@ mod tests { let count = index.number_of_documents(&rtxn).unwrap(); assert_eq!(count, 3); - let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); - let (kevin_id, _) = - docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); - let (id, doc) = docs[*kevin_id as usize]; - assert_eq!(id, *kevin_id); + // the document 0 has been deleted and reinserted with the id 3 + let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap(); + let kevin_position = + docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); + assert_eq!(kevin_position, 2); + let (_, doc) = docs[kevin_position]; // Check that this document is equal to the last // one sent and that an UUID has been generated. diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 08d450578..7ddf8765a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -4,7 +4,6 @@ use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; -use byteorder::ReadBytesExt; use fxhash::FxHashMap; use heed::RoTxn; use itertools::Itertools; @@ -57,7 +56,7 @@ pub struct Transform<'a, 'i> { flattened_sorter: grenad::Sorter, replaced_documents_ids: RoaringBitmap, new_documents_ids: RoaringBitmap, - // To increase the cache locality and the heap usage we use smartstring. + // To increase the cache locality and decrease the heap usage we use compact smartstring. new_external_documents_ids_builder: FxHashMap, u64>, documents_count: usize, } @@ -130,13 +129,17 @@ impl<'a, 'i> Transform<'a, 'i> { indexer_settings.max_memory.map(|mem| mem / 2), ); let documents_ids = index.documents_ids(wtxn)?; + let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?; Ok(Transform { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, autogenerate_docids, - available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), + available_documents_ids: AvailableDocumentsIds::from_documents_ids( + &documents_ids, + &soft_deleted_documents_ids, + ), original_sorter, flattened_sorter, index_documents_method, @@ -248,45 +251,39 @@ impl<'a, 'i> Transform<'a, 'i> { writer.insert(*k, v)?; } - let (docid, should_insert_original_document) = - match external_documents_ids.get(&*external_id) { - // if the document is in the db but has already been inserted - // (ie: already exists in the list of replaced documents ids), - // we should not add the original document a second time. - Some(docid) => (docid, !self.replaced_documents_ids.contains(docid)), - None => { - // if the document has already been inserted in this - // batch we need to get its docid - match self.new_external_documents_ids_builder.entry(external_id.into()) { - Entry::Occupied(entry) => (*entry.get() as u32, false), - // if the document has never been encountered we give it a new docid - // and push this new docid to the external documents ids builder - Entry::Vacant(entry) => { - let new_docid = self - .available_documents_ids - .next() - .ok_or(UserError::DocumentLimitReached)?; - entry.insert(new_docid as u64); - (new_docid, false) - } - } + let mut original_docid = None; + + let docid = match self.new_external_documents_ids_builder.entry(external_id.into()) { + Entry::Occupied(entry) => *entry.get() as u32, + Entry::Vacant(entry) => { + // If the document was already in the db we mark it as a replaced document. + // It'll be deleted later. We keep its original docid to insert it in the grenad. + if let Some(docid) = external_documents_ids.get(entry.key()) { + self.replaced_documents_ids.insert(docid); + original_docid = Some(docid); } - }; + let docid = self + .available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?; + entry.insert(docid as u64); + docid + } + }; - if should_insert_original_document { - self.replaced_documents_ids.insert(docid); - - let key = BEU32::new(docid); + if let Some(original_docid) = original_docid { + let original_key = BEU32::new(original_docid); let base_obkv = self .index .documents .remap_data_type::() - .get(wtxn, &key)? + .get(wtxn, &original_key)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None, })?; + // we associate the base document with the new key, everything will get merged later. self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, @@ -506,6 +503,39 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(()) } + fn remove_deleted_documents_from_field_distribution( + &self, + rtxn: &RoTxn, + field_distribution: &mut FieldDistribution, + ) -> Result<()> { + for deleted_docid in self.replaced_documents_ids.iter() { + let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; + + for (key, _) in obkv.iter() { + let name = + self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + })?; + // We checked that the document was in the db earlier. If we can't find it it means + // there is an inconsistency between the field distribution and the field id map. + let field = + field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Accessing field distribution in transform.", + })?; + *field -= 1; + if *field == 0 { + // since we were able to get the field right before it's safe to unwrap here + field_distribution.remove(name).unwrap(); + } + } + } + Ok(()) + } + /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. @@ -532,9 +562,14 @@ impl<'a, 'i> Transform<'a, 'i> { tempfile::tempfile()?, ); - // Once we have all the documents in the sorter, we write the documents - // in the writer. We also generate the field distribution. + // To compute the field distribution we need to; + // 1. Remove all the deleted documents from the field distribution + // 2. Add all the new documents to the field distribution let mut field_distribution = self.index.field_distribution(wtxn)?; + + self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?; + + // Here we are going to do the document count + field distribution + `write_into_stream_writer` let mut iter = self.original_sorter.into_stream_merger_iter()?; // used only for the callback let mut documents_count = 0; @@ -547,36 +582,6 @@ impl<'a, 'i> Transform<'a, 'i> { total_documents: self.documents_count, }); - let u32_key = key.clone().read_u32::()?; - // if the document was already in the db we remove all of its field - // from the field distribution. - if self.replaced_documents_ids.contains(u32_key) { - let obkv = self.index.documents.get(wtxn, &BEU32::new(u32_key))?.ok_or( - InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, - )?; - - for (key, _) in obkv.iter() { - let name = - self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Computing field distribution in transform.", - })?; - // We checked that the document was in the db earlier. If we can't find it it means - // there is an inconsistency between the field distribution and the field id map. - let field = field_distribution.get_mut(name).ok_or( - FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Accessing field distribution in transform.", - }, - )?; - *field -= 1; - if *field == 0 { - // since we were able to get the field right before it's safe to unwrap here - field_distribution.remove(name).unwrap(); - } - } - } - // We increment all the field of the current document in the field distribution. let obkv = KvReader::new(val);