From 602ad98cb8ceeb90a13067f94821381434722655 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 22 May 2023 11:15:14 +0200 Subject: [PATCH] improve the way we handle the fsts --- milli/Cargo.toml | 4 ++- milli/src/external_documents_ids.rs | 36 +++++++++++-------- milli/src/update/index_documents/mod.rs | 2 -- milli/src/update/index_documents/transform.rs | 12 +++---- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ea48e008c..f708edc73 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -56,7 +56,6 @@ itertools = "0.10.5" log = "0.4.17" logging_timer = "1.1.0" csv = "1.2.1" -fastrand = "1.9.0" [dev-dependencies] mimalloc = { version = "0.1.29", default-features = false } @@ -65,7 +64,10 @@ insta = "1.29.0" maplit = "1.0.2" md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } + +# fuzzing arbitrary = { version = "1.3.0", features = ["derive"] } +fastrand = "1.9.0" [target.'cfg(fuzzing)'.dev-dependencies] fuzzcheck = "0.12.1" diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 2cecd1abe..36b147336 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -106,22 +106,30 @@ impl<'a> ExternalDocumentsIds<'a> { map } + /// Return an fst of the combined hard and soft deleted ID. + pub fn to_fst<'b>(&'b self) -> fst::Result>>> { + if self.soft.is_empty() { + return Ok(Cow::Borrowed(&self.hard)); + } + let union_op = self.hard.op().add(&self.soft).r#union(); + + let mut iter = union_op.into_stream(); + let mut new_hard_builder = fst::MapBuilder::memory(); + while let Some((external_id, marked_docids)) = iter.next() { + let value = indexed_last_value(marked_docids).unwrap(); + if value != DELETED_ID { + new_hard_builder.insert(external_id, value)?; + } + } + + drop(iter); + + Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?)) + } + fn merge_soft_into_hard(&mut self) -> fst::Result<()> { if self.soft.len() >= self.hard.len() / 2 { - let union_op = self.hard.op().add(&self.soft).r#union(); - - let mut iter = union_op.into_stream(); - let mut new_hard_builder = fst::MapBuilder::memory(); - while let Some((external_id, marked_docids)) = iter.next() { - let value = indexed_last_value(marked_docids).unwrap(); - if value != DELETED_ID { - new_hard_builder.insert(external_id, value)?; - } - } - - drop(iter); - - self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; + self.hard = self.to_fst()?.into_owned(); self.soft = fst::Map::default().map_data(Cow::Owned)?; } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 406bfb0c9..70ec377aa 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -198,7 +198,6 @@ where let number_of_documents = self.index.number_of_documents(self.wtxn)?; return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); } - let output = self .transform .take() @@ -221,7 +220,6 @@ where } let indexed_documents = output.documents_count as u64; - let number_of_documents = self.execute_raw(output)?; Ok(DocumentAdditionResult { indexed_documents, number_of_documents }) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e2a260391..ee6831be5 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -57,8 +57,8 @@ pub struct Transform<'a, 'i> { original_sorter: grenad::Sorter, flattened_sorter: grenad::Sorter, - pub replaced_documents_ids: RoaringBitmap, - pub new_documents_ids: RoaringBitmap, + replaced_documents_ids: RoaringBitmap, + new_documents_ids: RoaringBitmap, // To increase the cache locality and decrease the heap usage we use compact smartstring. new_external_documents_ids_builder: FxHashMap, u64>, documents_count: usize, @@ -653,9 +653,7 @@ impl<'a, 'i> Transform<'a, 'i> { primary_key, fields_ids_map: self.fields_ids_map, field_distribution, - new_external_documents_ids: new_external_documents_ids - .map_data(|c| Cow::Owned(c)) - .unwrap(), + new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(), new_documents_ids: self.new_documents_ids, replaced_documents_ids: self.replaced_documents_ids, documents_count: self.documents_count, @@ -689,8 +687,8 @@ impl<'a, 'i> Transform<'a, 'i> { let new_external_documents_ids = { let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; - // it is safe to get the hard document IDs - external_documents_ids.into_static().hard + // This call should be free and can't fail since the previous method merged both fsts. + external_documents_ids.into_static().to_fst()?.into_owned() }; let documents_ids = self.index.documents_ids(wtxn)?;