mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 05:54:30 +01:00
improve the way we handle the fsts
This commit is contained in:
parent
7f619ff0e4
commit
602ad98cb8
@ -56,7 +56,6 @@ itertools = "0.10.5"
|
||||
log = "0.4.17"
|
||||
logging_timer = "1.1.0"
|
||||
csv = "1.2.1"
|
||||
fastrand = "1.9.0"
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.29", default-features = false }
|
||||
@ -65,7 +64,10 @@ insta = "1.29.0"
|
||||
maplit = "1.0.2"
|
||||
md5 = "0.7.0"
|
||||
rand = {version = "0.8.5", features = ["small_rng"] }
|
||||
|
||||
# fuzzing
|
||||
arbitrary = { version = "1.3.0", features = ["derive"] }
|
||||
fastrand = "1.9.0"
|
||||
|
||||
[target.'cfg(fuzzing)'.dev-dependencies]
|
||||
fuzzcheck = "0.12.1"
|
||||
|
@ -106,22 +106,30 @@ impl<'a> ExternalDocumentsIds<'a> {
|
||||
map
|
||||
}
|
||||
|
||||
/// Return an fst of the combined hard and soft deleted ID.
|
||||
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
|
||||
if self.soft.is_empty() {
|
||||
return Ok(Cow::Borrowed(&self.hard));
|
||||
}
|
||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||
|
||||
let mut iter = union_op.into_stream();
|
||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
||||
while let Some((external_id, marked_docids)) = iter.next() {
|
||||
let value = indexed_last_value(marked_docids).unwrap();
|
||||
if value != DELETED_ID {
|
||||
new_hard_builder.insert(external_id, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
|
||||
}
|
||||
|
||||
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
|
||||
if self.soft.len() >= self.hard.len() / 2 {
|
||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||
|
||||
let mut iter = union_op.into_stream();
|
||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
||||
while let Some((external_id, marked_docids)) = iter.next() {
|
||||
let value = indexed_last_value(marked_docids).unwrap();
|
||||
if value != DELETED_ID {
|
||||
new_hard_builder.insert(external_id, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
|
||||
self.hard = self.to_fst()?.into_owned();
|
||||
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
||||
}
|
||||
|
||||
|
@ -198,7 +198,6 @@ where
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
||||
}
|
||||
|
||||
let output = self
|
||||
.transform
|
||||
.take()
|
||||
@ -221,7 +220,6 @@ where
|
||||
}
|
||||
|
||||
let indexed_documents = output.documents_count as u64;
|
||||
|
||||
let number_of_documents = self.execute_raw(output)?;
|
||||
|
||||
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
|
||||
|
@ -57,8 +57,8 @@ pub struct Transform<'a, 'i> {
|
||||
original_sorter: grenad::Sorter<MergeFn>,
|
||||
flattened_sorter: grenad::Sorter<MergeFn>,
|
||||
|
||||
pub replaced_documents_ids: RoaringBitmap,
|
||||
pub new_documents_ids: RoaringBitmap,
|
||||
replaced_documents_ids: RoaringBitmap,
|
||||
new_documents_ids: RoaringBitmap,
|
||||
// To increase the cache locality and decrease the heap usage we use compact smartstring.
|
||||
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
|
||||
documents_count: usize,
|
||||
@ -653,9 +653,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
primary_key,
|
||||
fields_ids_map: self.fields_ids_map,
|
||||
field_distribution,
|
||||
new_external_documents_ids: new_external_documents_ids
|
||||
.map_data(|c| Cow::Owned(c))
|
||||
.unwrap(),
|
||||
new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(),
|
||||
new_documents_ids: self.new_documents_ids,
|
||||
replaced_documents_ids: self.replaced_documents_ids,
|
||||
documents_count: self.documents_count,
|
||||
@ -689,8 +687,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
let new_external_documents_ids = {
|
||||
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
||||
// it is safe to get the hard document IDs
|
||||
external_documents_ids.into_static().hard
|
||||
// This call should be free and can't fail since the previous method merged both fsts.
|
||||
external_documents_ids.into_static().to_fst()?.into_owned()
|
||||
};
|
||||
|
||||
let documents_ids = self.index.documents_ids(wtxn)?;
|
||||
|
Loading…
x
Reference in New Issue
Block a user