2020-11-22 14:48:42 +01:00
|
|
|
use std::borrow::Cow;
|
2021-06-30 10:43:12 +02:00
|
|
|
use std::collections::HashMap;
|
2020-11-22 14:48:42 +01:00
|
|
|
use std::convert::TryInto;
|
2021-06-30 10:43:12 +02:00
|
|
|
use std::{fmt, str};
|
2021-06-16 18:33:33 +02:00
|
|
|
|
2021-06-30 11:22:57 +02:00
|
|
|
use fst::map::IndexedValue;
|
2021-06-16 18:33:33 +02:00
|
|
|
use fst::{IntoStreamer, Streamer};
|
2022-06-13 17:59:34 +02:00
|
|
|
use roaring::RoaringBitmap;
|
2020-11-22 14:48:42 +01:00
|
|
|
|
2021-06-30 10:43:12 +02:00
|
|
|
/// Sentinel value stored in the FST maps in place of an internal document id
/// to mark an external id as deleted. Entries carrying this value are treated
/// as absent by lookups and are skipped when the maps are merged or rebuilt.
const DELETED_ID: u64 = u64::MAX;
|
|
|
|
|
2020-11-22 14:48:42 +01:00
|
|
|
pub struct ExternalDocumentsIds<'a> {
|
|
|
|
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
|
|
|
|
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
|
2022-06-13 17:59:34 +02:00
|
|
|
soft_deleted_docids: RoaringBitmap,
|
2020-11-22 14:48:42 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
impl<'a> ExternalDocumentsIds<'a> {
|
2021-06-16 18:33:33 +02:00
|
|
|
pub fn new(
|
|
|
|
hard: fst::Map<Cow<'a, [u8]>>,
|
|
|
|
soft: fst::Map<Cow<'a, [u8]>>,
|
2022-06-13 17:59:34 +02:00
|
|
|
soft_deleted_docids: RoaringBitmap,
|
2021-06-16 18:33:33 +02:00
|
|
|
) -> ExternalDocumentsIds<'a> {
|
2022-06-13 17:59:34 +02:00
|
|
|
ExternalDocumentsIds { hard, soft, soft_deleted_docids }
|
2020-11-22 14:48:42 +01:00
|
|
|
}
|
|
|
|
|
2020-11-22 17:28:41 +01:00
|
|
|
pub fn into_static(self) -> ExternalDocumentsIds<'static> {
|
|
|
|
ExternalDocumentsIds {
|
|
|
|
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
|
|
|
|
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
|
2022-06-13 17:59:34 +02:00
|
|
|
soft_deleted_docids: self.soft_deleted_docids,
|
2020-11-22 17:28:41 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-01 09:07:16 +02:00
|
|
|
/// Returns `true` if hard and soft external documents lists are empty.
|
|
|
|
pub fn is_empty(&self) -> bool {
|
|
|
|
self.hard.is_empty() && self.soft.is_empty()
|
|
|
|
}
|
|
|
|
|
2020-11-22 17:28:41 +01:00
|
|
|
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
|
2020-11-22 14:48:42 +01:00
|
|
|
let external_id = external_id.as_ref();
|
|
|
|
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
|
2022-06-13 17:59:34 +02:00
|
|
|
Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
|
|
|
|
Some(id.try_into().unwrap())
|
|
|
|
}
|
2021-06-16 18:33:33 +02:00
|
|
|
_otherwise => None,
|
2020-11-22 14:48:42 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-12-06 11:38:15 +01:00
|
|
|
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
|
|
|
|
/// don't contain any soft deleted document id.
|
|
|
|
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
|
|
|
|
let mut new_hard_builder = fst::MapBuilder::memory();
|
|
|
|
|
|
|
|
let union_op = self.hard.op().add(&self.soft).r#union();
|
|
|
|
let mut iter = union_op.into_stream();
|
|
|
|
while let Some((external_id, docids)) = iter.next() {
|
|
|
|
// prefer selecting the ids from soft, always
|
|
|
|
let id = indexed_last_value(docids).unwrap();
|
|
|
|
if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
|
|
|
|
new_hard_builder.insert(external_id, id)?;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
drop(iter);
|
|
|
|
|
|
|
|
// Delete soft map completely
|
|
|
|
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
|
|
|
// We save the new map as the new hard map.
|
|
|
|
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2020-11-22 14:48:42 +01:00
|
|
|
pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
|
|
|
|
let union_op = self.soft.op().add(other).r#union();
|
|
|
|
|
|
|
|
let mut new_soft_builder = fst::MapBuilder::memory();
|
|
|
|
let mut iter = union_op.into_stream();
|
2021-06-30 11:22:57 +02:00
|
|
|
while let Some((external_id, marked_docids)) = iter.next() {
|
|
|
|
let id = indexed_last_value(marked_docids).unwrap();
|
2020-11-22 14:48:42 +01:00
|
|
|
new_soft_builder.insert(external_id, id)?;
|
|
|
|
}
|
|
|
|
|
|
|
|
drop(iter);
|
|
|
|
|
|
|
|
// We save the new map as the new soft map.
|
|
|
|
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
|
|
|
|
self.merge_soft_into_hard()
|
|
|
|
}
|
|
|
|
|
2021-06-30 10:43:12 +02:00
|
|
|
/// An helper function to debug this type, returns an `HashMap` of both,
|
|
|
|
/// soft and hard fst maps, combined.
|
|
|
|
pub fn to_hash_map(&self) -> HashMap<String, u32> {
|
|
|
|
let mut map = HashMap::new();
|
|
|
|
|
|
|
|
let union_op = self.hard.op().add(&self.soft).r#union();
|
|
|
|
let mut iter = union_op.into_stream();
|
|
|
|
while let Some((external_id, marked_docids)) = iter.next() {
|
2021-06-30 11:22:57 +02:00
|
|
|
let id = indexed_last_value(marked_docids).unwrap();
|
2021-06-30 10:43:12 +02:00
|
|
|
if id != DELETED_ID {
|
|
|
|
let external_id = str::from_utf8(external_id).unwrap();
|
|
|
|
map.insert(external_id.to_owned(), id.try_into().unwrap());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
map
|
|
|
|
}
|
|
|
|
|
2023-05-22 11:15:14 +02:00
|
|
|
/// Return an fst of the combined hard and soft deleted ID.
|
|
|
|
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
|
|
|
|
if self.soft.is_empty() {
|
|
|
|
return Ok(Cow::Borrowed(&self.hard));
|
|
|
|
}
|
|
|
|
let union_op = self.hard.op().add(&self.soft).r#union();
|
|
|
|
|
|
|
|
let mut iter = union_op.into_stream();
|
|
|
|
let mut new_hard_builder = fst::MapBuilder::memory();
|
|
|
|
while let Some((external_id, marked_docids)) = iter.next() {
|
|
|
|
let value = indexed_last_value(marked_docids).unwrap();
|
|
|
|
if value != DELETED_ID {
|
|
|
|
new_hard_builder.insert(external_id, value)?;
|
2020-11-22 14:48:42 +01:00
|
|
|
}
|
2023-05-22 11:15:14 +02:00
|
|
|
}
|
2020-11-22 14:48:42 +01:00
|
|
|
|
2023-05-22 11:15:14 +02:00
|
|
|
drop(iter);
|
2020-11-22 14:48:42 +01:00
|
|
|
|
2023-05-22 11:15:14 +02:00
|
|
|
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
|
|
|
|
}
|
|
|
|
|
|
|
|
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
|
|
|
|
if self.soft.len() >= self.hard.len() / 2 {
|
|
|
|
self.hard = self.to_fst()?.into_owned();
|
2020-11-22 14:48:42 +01:00
|
|
|
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-30 10:43:12 +02:00
|
|
|
impl fmt::Debug for ExternalDocumentsIds<'_> {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
|
|
f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-11-22 14:48:42 +01:00
|
|
|
impl Default for ExternalDocumentsIds<'static> {
|
|
|
|
fn default() -> Self {
|
|
|
|
ExternalDocumentsIds {
|
|
|
|
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
|
|
|
|
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
|
2022-06-13 17:59:34 +02:00
|
|
|
soft_deleted_docids: RoaringBitmap::new(),
|
2020-11-22 14:48:42 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-30 11:22:57 +02:00
|
|
|
/// Returns the value of the `IndexedValue` with the highest _index_.
|
|
|
|
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
|
|
|
|
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
|
|
|
|
}
|