diff --git a/src/external_documents_ids.rs b/src/external_documents_ids.rs
new file mode 100644
index 000000000..f8765b57e
--- /dev/null
+++ b/src/external_documents_ids.rs
@@ -0,0 +1,236 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+
+use fst::map::IndexedValue;
+use fst::{IntoStreamer, Streamer};
+
+// NOTE(review): the generic arguments of this patch were lost when it was extracted;
+// they are reconstructed here (`Cow<'a, [u8]>` map data, `A: AsRef<[u8]>` keys, `u32`
+// internal ids) and must be confirmed against the original commit.
+
+/// Value stored in the soft map to mark an external id as deleted.
+const DELETED_ID: u64 = u64::MAX;
+
+/// Maps external document ids (byte-string keys) to internal document ids.
+///
+/// Two fst maps are kept: `hard` holds the bulk of the entries while `soft` holds the
+/// most recent changes, including deletions (stored as `u64::MAX`). Lookups consult
+/// `soft` first; `soft` is folded into `hard` once it grows to half the size of `hard`.
+pub struct ExternalDocumentsIds<'a> {
+    pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
+    pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
+}
+
+impl<'a> ExternalDocumentsIds<'a> {
+    /// Builds the structure from an existing `hard` map and its `soft` overlay.
+    pub fn new(
+        hard: fst::Map<Cow<'a, [u8]>>,
+        soft: fst::Map<Cow<'a, [u8]>>,
+    ) -> ExternalDocumentsIds<'a> {
+        ExternalDocumentsIds { hard, soft }
+    }
+
+    /// Returns the internal id associated with `external_id`, or `None` when the id is
+    /// unknown or has been deleted.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the stored id does not fit in a `u32`.
+    pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
+        let external_id = external_id.as_ref();
+        match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
+            // A tombstone in the soft map shadows whatever the hard map contains.
+            Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()),
+            _otherwise => None,
+        }
+    }
+
+    /// Marks every external id of `other` as deleted.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if one of the fst maps cannot be rebuilt.
+    pub fn delete_ids<A: AsRef<[u8]>>(&mut self, other: fst::Set<A>) -> fst::Result<()> {
+        let other = fst::Map::from(other.into_fst());
+        let union_op = self.soft.op().add(&other).r#union();
+
+        let mut iter = union_op.into_stream();
+        let mut new_soft_builder = fst::MapBuilder::memory();
+        while let Some((external_id, docids)) = iter.next() {
+            if docids.iter().any(|v| v.index == 1) {
+                // If the `other` set returns a value here it means
+                // that it must be marked as deleted.
+                new_soft_builder.insert(external_id, DELETED_ID)?;
+            } else {
+                // Only the soft map (index 0) contains this key.
+                new_soft_builder.insert(external_id, docids[0].value)?;
+            }
+        }
+
+        drop(iter);
+
+        // We save this new map as the new soft map.
+        self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
+        self.merge_soft_into_hard()
+    }
+
+    /// Associates every external id of `other` with its new internal id, replacing any
+    /// previous association or deletion mark.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if one of the fst maps cannot be rebuilt.
+    pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
+        let union_op = self.soft.op().add(other).r#union();
+
+        let mut new_soft_builder = fst::MapBuilder::memory();
+        let mut iter = union_op.into_stream();
+        while let Some((external_id, docids)) = iter.next() {
+            // `other` (index 1) wins over the current soft map (index 0).
+            let id = indexed_last_value(docids).unwrap();
+            new_soft_builder.insert(external_id, id)?;
+        }
+
+        drop(iter);
+
+        // We save the new map as the new soft map.
+        self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
+        self.merge_soft_into_hard()
+    }
+
+    /// Folds the soft map into the hard map once it reaches half the size of the hard
+    /// map, dropping deleted entries, and leaves the soft map empty.
+    fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
+        if self.soft.len() >= self.hard.len() / 2 {
+            let union_op = self.hard.op().add(&self.soft).r#union();
+
+            let mut iter = union_op.into_stream();
+            let mut new_hard_builder = fst::MapBuilder::memory();
+            while let Some((external_id, docids)) = iter.next() {
+                // The soft map (index 1) wins over the hard map (index 0). Tombstones are
+                // never written to the hard map, even when the key only exists in soft.
+                let id = indexed_last_value(docids).unwrap();
+                if id != DELETED_ID {
+                    new_hard_builder.insert(external_id, id)?;
+                }
+            }
+
+            drop(iter);
+
+            self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
+            self.soft = fst::Map::default().map_data(Cow::Owned)?;
+        }
+
+        Ok(())
+    }
+}
+
+impl Default for ExternalDocumentsIds<'static> {
+    /// Creates an instance whose `hard` and `soft` maps are both empty.
+    fn default() -> Self {
+        ExternalDocumentsIds {
+            hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
+            soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
+        }
+    }
+}
+
+/// Returns the value contributed by the operand with the highest index.
+///
+/// The values a union yields for one key must be selected through their `index` field:
+/// the slice position is not guaranteed to follow the order the operands were added in.
+fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
+    indexed_values.iter().max_by_key(|iv| iv.index).map(|iv| iv.value)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn simple_insert_delete_ids() {
+        let mut external_documents_ids = ExternalDocumentsIds::default();
+
+        let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap();
+        external_documents_ids.insert_ids(&new_ids).unwrap();
+
+        assert_eq!(external_documents_ids.get("a"), Some(1));
+        assert_eq!(external_documents_ids.get("b"), Some(2));
+        assert_eq!(external_documents_ids.get("c"), Some(3));
+        assert_eq!(external_documents_ids.get("d"), Some(4));
+
+        let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap();
+        external_documents_ids.insert_ids(&new_ids).unwrap();
+
+        assert_eq!(external_documents_ids.get("a"), Some(1));
+        assert_eq!(external_documents_ids.get("b"), Some(2));
+        assert_eq!(external_documents_ids.get("c"), Some(3));
+        assert_eq!(external_documents_ids.get("d"), Some(4));
+        assert_eq!(external_documents_ids.get("e"), Some(5));
+        assert_eq!(external_documents_ids.get("f"), Some(6));
+        assert_eq!(external_documents_ids.get("g"), Some(7));
+
+        let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap();
+        external_documents_ids.delete_ids(del_ids).unwrap();
+
+        assert_eq!(external_documents_ids.get("a"), None);
+        assert_eq!(external_documents_ids.get("b"), Some(2));
+        assert_eq!(external_documents_ids.get("c"), None);
+        assert_eq!(external_documents_ids.get("d"), Some(4));
+        assert_eq!(external_documents_ids.get("e"), Some(5));
+        assert_eq!(external_documents_ids.get("f"), None);
+        assert_eq!(external_documents_ids.get("g"), Some(7));
+
+        let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap();
+        external_documents_ids.insert_ids(&new_ids).unwrap();
+
+        assert_eq!(external_documents_ids.get("a"), Some(5));
+        assert_eq!(external_documents_ids.get("b"), Some(6));
+        assert_eq!(external_documents_ids.get("c"), None);
+        assert_eq!(external_documents_ids.get("d"), Some(4));
+        assert_eq!(external_documents_ids.get("e"), Some(5));
+        assert_eq!(external_documents_ids.get("f"), None);
+        assert_eq!(external_documents_ids.get("g"), Some(7));
+        assert_eq!(external_documents_ids.get("h"), Some(8));
+    }
+
+    #[test]
+    fn reinsert_after_unmerged_delete() {
+        let mut external_documents_ids = ExternalDocumentsIds::default();
+
+        let keys = vec![("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5), ("f", 6)];
+        let new_ids = fst::Map::from_iter(keys).unwrap();
+        external_documents_ids.insert_ids(&new_ids).unwrap();
+
+        // One deletion is too small to trigger a merge: the tombstone stays in soft.
+        let del_ids = fst::Set::from_iter(vec!["a"]).unwrap();
+        external_documents_ids.delete_ids(del_ids).unwrap();
+        assert_eq!(external_documents_ids.get("a"), None);
+
+        // The new id must replace the tombstone even though it is numerically smaller.
+        let new_ids = fst::Map::from_iter(vec![("a", 7)]).unwrap();
+        external_documents_ids.insert_ids(&new_ids).unwrap();
+        assert_eq!(external_documents_ids.get("a"), Some(7));
+
+        // Re-mapping existing keys to smaller ids must survive the merge into hard.
+        let new_ids = fst::Map::from_iter(vec![("b", 0), ("c", 1)]).unwrap();
+        external_documents_ids.insert_ids(&new_ids).unwrap();
+        assert_eq!(external_documents_ids.soft.len(), 0);
+        assert_eq!(external_documents_ids.get("a"), Some(7));
+        assert_eq!(external_documents_ids.get("b"), Some(0));
+        assert_eq!(external_documents_ids.get("c"), Some(1));
+        assert_eq!(external_documents_ids.get("d"), Some(4));
+    }
+
+    #[test]
+    fn deleting_unknown_id_leaves_no_tombstone_in_hard() {
+        let mut external_documents_ids = ExternalDocumentsIds::default();
+
+        let del_ids = fst::Set::from_iter(vec!["z"]).unwrap();
+        external_documents_ids.delete_ids(del_ids).unwrap();
+
+        assert_eq!(external_documents_ids.get("z"), None);
+        assert_eq!(external_documents_ids.hard.len(), 0);
+        assert_eq!(external_documents_ids.soft.len(), 0);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 808c54a4a..fd0156bfa 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 mod criterion;
+mod external_documents_ids;
 mod fields_ids_map;
 mod index;
 mod mdfs;