mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 05:14:27 +01:00
improve the way we handle the fsts
This commit is contained in:
parent
7f619ff0e4
commit
602ad98cb8
@ -56,7 +56,6 @@ itertools = "0.10.5"
|
|||||||
log = "0.4.17"
|
log = "0.4.17"
|
||||||
logging_timer = "1.1.0"
|
logging_timer = "1.1.0"
|
||||||
csv = "1.2.1"
|
csv = "1.2.1"
|
||||||
fastrand = "1.9.0"
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
mimalloc = { version = "0.1.29", default-features = false }
|
mimalloc = { version = "0.1.29", default-features = false }
|
||||||
@ -65,7 +64,10 @@ insta = "1.29.0"
|
|||||||
maplit = "1.0.2"
|
maplit = "1.0.2"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
rand = {version = "0.8.5", features = ["small_rng"] }
|
rand = {version = "0.8.5", features = ["small_rng"] }
|
||||||
|
|
||||||
|
# fuzzing
|
||||||
arbitrary = { version = "1.3.0", features = ["derive"] }
|
arbitrary = { version = "1.3.0", features = ["derive"] }
|
||||||
|
fastrand = "1.9.0"
|
||||||
|
|
||||||
[target.'cfg(fuzzing)'.dev-dependencies]
|
[target.'cfg(fuzzing)'.dev-dependencies]
|
||||||
fuzzcheck = "0.12.1"
|
fuzzcheck = "0.12.1"
|
||||||
|
@ -106,22 +106,30 @@ impl<'a> ExternalDocumentsIds<'a> {
|
|||||||
map
|
map
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return an fst of the combined hard and soft deleted ID.
|
||||||
|
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
|
||||||
|
if self.soft.is_empty() {
|
||||||
|
return Ok(Cow::Borrowed(&self.hard));
|
||||||
|
}
|
||||||
|
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||||
|
|
||||||
|
let mut iter = union_op.into_stream();
|
||||||
|
let mut new_hard_builder = fst::MapBuilder::memory();
|
||||||
|
while let Some((external_id, marked_docids)) = iter.next() {
|
||||||
|
let value = indexed_last_value(marked_docids).unwrap();
|
||||||
|
if value != DELETED_ID {
|
||||||
|
new_hard_builder.insert(external_id, value)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(iter);
|
||||||
|
|
||||||
|
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
|
||||||
|
}
|
||||||
|
|
||||||
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
|
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
|
||||||
if self.soft.len() >= self.hard.len() / 2 {
|
if self.soft.len() >= self.hard.len() / 2 {
|
||||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
self.hard = self.to_fst()?.into_owned();
|
||||||
|
|
||||||
let mut iter = union_op.into_stream();
|
|
||||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
|
||||||
while let Some((external_id, marked_docids)) = iter.next() {
|
|
||||||
let value = indexed_last_value(marked_docids).unwrap();
|
|
||||||
if value != DELETED_ID {
|
|
||||||
new_hard_builder.insert(external_id, value)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
|
|
||||||
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -198,7 +198,6 @@ where
|
|||||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||||
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
||||||
}
|
}
|
||||||
|
|
||||||
let output = self
|
let output = self
|
||||||
.transform
|
.transform
|
||||||
.take()
|
.take()
|
||||||
@ -221,7 +220,6 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
let indexed_documents = output.documents_count as u64;
|
let indexed_documents = output.documents_count as u64;
|
||||||
|
|
||||||
let number_of_documents = self.execute_raw(output)?;
|
let number_of_documents = self.execute_raw(output)?;
|
||||||
|
|
||||||
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
|
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
|
||||||
|
@ -57,8 +57,8 @@ pub struct Transform<'a, 'i> {
|
|||||||
original_sorter: grenad::Sorter<MergeFn>,
|
original_sorter: grenad::Sorter<MergeFn>,
|
||||||
flattened_sorter: grenad::Sorter<MergeFn>,
|
flattened_sorter: grenad::Sorter<MergeFn>,
|
||||||
|
|
||||||
pub replaced_documents_ids: RoaringBitmap,
|
replaced_documents_ids: RoaringBitmap,
|
||||||
pub new_documents_ids: RoaringBitmap,
|
new_documents_ids: RoaringBitmap,
|
||||||
// To increase the cache locality and decrease the heap usage we use compact smartstring.
|
// To increase the cache locality and decrease the heap usage we use compact smartstring.
|
||||||
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
|
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
|
||||||
documents_count: usize,
|
documents_count: usize,
|
||||||
@ -653,9 +653,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map: self.fields_ids_map,
|
fields_ids_map: self.fields_ids_map,
|
||||||
field_distribution,
|
field_distribution,
|
||||||
new_external_documents_ids: new_external_documents_ids
|
new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(),
|
||||||
.map_data(|c| Cow::Owned(c))
|
|
||||||
.unwrap(),
|
|
||||||
new_documents_ids: self.new_documents_ids,
|
new_documents_ids: self.new_documents_ids,
|
||||||
replaced_documents_ids: self.replaced_documents_ids,
|
replaced_documents_ids: self.replaced_documents_ids,
|
||||||
documents_count: self.documents_count,
|
documents_count: self.documents_count,
|
||||||
@ -689,8 +687,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let new_external_documents_ids = {
|
let new_external_documents_ids = {
|
||||||
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||||
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
|
||||||
// it is safe to get the hard document IDs
|
// This call should be free and can't fail since the previous method merged both fsts.
|
||||||
external_documents_ids.into_static().hard
|
external_documents_ids.into_static().to_fst()?.into_owned()
|
||||||
};
|
};
|
||||||
|
|
||||||
let documents_ids = self.index.documents_ids(wtxn)?;
|
let documents_ids = self.index.documents_ids(wtxn)?;
|
||||||
|
Loading…
Reference in New Issue
Block a user