2024-11-07 15:05:20 +01:00
|
|
|
use std::cell::RefCell;
|
2024-09-04 12:17:13 +02:00
|
|
|
|
2024-10-03 18:08:09 +02:00
|
|
|
use hashbrown::HashSet;
|
2024-09-02 10:42:19 +02:00
|
|
|
use heed::types::Bytes;
|
2024-09-24 17:24:50 +02:00
|
|
|
use heed::{Database, RoTxn};
|
2024-11-07 15:05:20 +01:00
|
|
|
use memmap2::Mmap;
|
2024-10-09 11:35:45 +02:00
|
|
|
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
2024-09-02 10:42:19 +02:00
|
|
|
use roaring::RoaringBitmap;
|
|
|
|
|
2024-09-05 10:56:22 +02:00
|
|
|
use super::channel::*;
|
2024-10-09 11:35:45 +02:00
|
|
|
use super::extract::{
|
|
|
|
merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind,
|
2024-11-07 15:05:20 +01:00
|
|
|
GeoExtractorData,
|
2024-10-09 11:35:45 +02:00
|
|
|
};
|
2024-11-07 15:05:20 +01:00
|
|
|
use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
|
2024-09-02 10:42:19 +02:00
|
|
|
|
2024-11-07 15:05:20 +01:00
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
|
|
|
pub fn merge_and_send_rtree<'extractor, MSP>(
|
|
|
|
datastore: impl IntoIterator<Item = RefCell<GeoExtractorData<'extractor>>>,
|
|
|
|
rtxn: &RoTxn,
|
|
|
|
index: &Index,
|
|
|
|
geo_sender: GeoSender<'_>,
|
|
|
|
must_stop_processing: &MSP,
|
|
|
|
) -> Result<()>
|
|
|
|
where
|
|
|
|
MSP: Fn() -> bool + Sync,
|
|
|
|
{
|
|
|
|
let mut rtree = index.geo_rtree(rtxn)?.unwrap_or_default();
|
|
|
|
let mut faceted = index.geo_faceted_documents_ids(rtxn)?;
|
2024-09-12 18:01:02 +02:00
|
|
|
|
2024-11-07 15:05:20 +01:00
|
|
|
for data in datastore {
|
|
|
|
if must_stop_processing() {
|
|
|
|
return Err(InternalError::AbortedIndexation.into());
|
2024-09-12 18:01:02 +02:00
|
|
|
}
|
|
|
|
|
2024-11-07 15:05:20 +01:00
|
|
|
let mut frozen = data.into_inner().freeze()?;
|
|
|
|
for result in frozen.iter_and_clear_removed() {
|
|
|
|
let extracted_geo_point = result?;
|
|
|
|
debug_assert!(rtree.remove(&GeoPoint::from(extracted_geo_point)).is_some());
|
|
|
|
debug_assert!(faceted.remove(extracted_geo_point.docid));
|
2024-09-12 18:01:02 +02:00
|
|
|
}
|
|
|
|
|
2024-11-07 15:05:20 +01:00
|
|
|
for result in frozen.iter_and_clear_inserted() {
|
|
|
|
let extracted_geo_point = result?;
|
|
|
|
rtree.insert(GeoPoint::from(extracted_geo_point));
|
|
|
|
debug_assert!(faceted.insert(extracted_geo_point.docid));
|
2024-09-12 18:01:02 +02:00
|
|
|
}
|
|
|
|
}
|
2024-11-07 15:05:20 +01:00
|
|
|
|
|
|
|
let mut file = tempfile::tempfile()?;
|
|
|
|
/// manage error
|
2024-11-13 14:15:42 +01:00
|
|
|
bincode::serialize_into(&mut file, &rtree).unwrap();
|
2024-11-07 15:05:20 +01:00
|
|
|
file.sync_all()?;
|
|
|
|
|
|
|
|
let rtree_mmap = unsafe { Mmap::map(&file)? };
|
|
|
|
geo_sender.set_rtree(rtree_mmap).unwrap();
|
|
|
|
geo_sender.set_geo_faceted(&faceted).unwrap();
|
|
|
|
|
|
|
|
Ok(())
|
2024-09-12 18:01:02 +02:00
|
|
|
}
|
|
|
|
|
2024-09-05 17:36:19 +02:00
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
2024-11-07 11:23:49 +01:00
|
|
|
pub fn merge_and_send_docids<'extractor, MSP>(
|
2024-10-09 11:35:45 +02:00
|
|
|
mut caches: Vec<BalancedCaches<'extractor>>,
|
2024-09-04 12:17:13 +02:00
|
|
|
database: Database<Bytes, Bytes>,
|
2024-10-09 11:35:45 +02:00
|
|
|
index: &Index,
|
|
|
|
docids_sender: impl DocidsSender + Sync,
|
2024-11-07 11:23:49 +01:00
|
|
|
must_stop_processing: &MSP,
|
|
|
|
) -> Result<()>
|
|
|
|
where
|
|
|
|
MSP: Fn() -> bool + Sync,
|
|
|
|
{
|
2024-10-09 11:35:45 +02:00
|
|
|
transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| {
|
|
|
|
let rtxn = index.read_txn()?;
|
|
|
|
let mut buffer = Vec::new();
|
2024-11-07 11:23:49 +01:00
|
|
|
if must_stop_processing() {
|
|
|
|
return Err(InternalError::AbortedIndexation.into());
|
|
|
|
}
|
2024-10-09 11:35:45 +02:00
|
|
|
merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
|
|
|
let current = database.get(&rtxn, key)?;
|
|
|
|
match merge_cbo_bitmaps(current, del, add)? {
|
|
|
|
Operation::Write(bitmap) => {
|
|
|
|
let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
|
|
|
|
docids_sender.write(key, value).unwrap();
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
Operation::Delete => {
|
|
|
|
docids_sender.delete(key).unwrap();
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
Operation::Ignore => Ok(()),
|
2024-09-04 12:17:13 +02:00
|
|
|
}
|
2024-10-09 11:35:45 +02:00
|
|
|
})
|
|
|
|
})
|
2024-09-04 12:17:13 +02:00
|
|
|
}
|
|
|
|
|
2024-09-16 09:34:10 +02:00
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
2024-11-05 16:46:43 +01:00
|
|
|
pub fn merge_and_send_facet_docids<'extractor>(
|
2024-10-09 11:35:45 +02:00
|
|
|
mut caches: Vec<BalancedCaches<'extractor>>,
|
2024-09-16 09:34:10 +02:00
|
|
|
database: FacetDatabases,
|
2024-10-09 11:35:45 +02:00
|
|
|
index: &Index,
|
|
|
|
docids_sender: impl DocidsSender + Sync,
|
2024-11-05 16:46:43 +01:00
|
|
|
) -> Result<FacetFieldIdsDelta> {
|
2024-10-09 11:35:45 +02:00
|
|
|
transpose_and_freeze_caches(&mut caches)?
|
|
|
|
.into_par_iter()
|
|
|
|
.map(|frozen| {
|
|
|
|
let mut facet_field_ids_delta = FacetFieldIdsDelta::default();
|
|
|
|
let rtxn = index.read_txn()?;
|
|
|
|
let mut buffer = Vec::new();
|
|
|
|
merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
|
|
|
let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?;
|
|
|
|
match merge_cbo_bitmaps(current, del, add)? {
|
|
|
|
Operation::Write(bitmap) => {
|
|
|
|
facet_field_ids_delta.register_from_key(key);
|
|
|
|
let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
|
|
|
|
docids_sender.write(key, value).unwrap();
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
Operation::Delete => {
|
|
|
|
facet_field_ids_delta.register_from_key(key);
|
|
|
|
docids_sender.delete(key).unwrap();
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
Operation::Ignore => Ok(()),
|
|
|
|
}
|
|
|
|
})?;
|
|
|
|
|
2024-11-05 16:46:43 +01:00
|
|
|
Ok(facet_field_ids_delta)
|
2024-10-09 11:35:45 +02:00
|
|
|
})
|
2024-11-05 16:46:43 +01:00
|
|
|
.reduce(|| Ok(FacetFieldIdsDelta::default()), |lhs, rhs| Ok(lhs?.merge(rhs?)))
|
2024-09-16 09:34:10 +02:00
|
|
|
}
|
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
pub struct FacetDatabases<'a> {
|
2024-10-01 16:13:08 +02:00
|
|
|
index: &'a Index,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'a> FacetDatabases<'a> {
|
2024-10-09 11:35:45 +02:00
|
|
|
pub fn new(index: &'a Index) -> Self {
|
2024-10-01 16:13:08 +02:00
|
|
|
Self { index }
|
|
|
|
}
|
|
|
|
|
|
|
|
fn get_cbo_roaring_bytes_value<'t>(
|
|
|
|
&self,
|
|
|
|
rtxn: &'t RoTxn<'_>,
|
|
|
|
key: &[u8],
|
|
|
|
) -> heed::Result<Option<&'t [u8]>> {
|
|
|
|
let (facet_kind, key) = FacetKind::extract_from_key(key);
|
|
|
|
|
|
|
|
let value =
|
|
|
|
super::channel::Database::from(facet_kind).database(self.index).get(rtxn, key)?;
|
|
|
|
match facet_kind {
|
|
|
|
// skip level group size
|
|
|
|
FacetKind::String | FacetKind::Number => Ok(value.map(|v| &v[1..])),
|
|
|
|
_ => Ok(value),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-10-09 11:35:45 +02:00
|
|
|
#[derive(Debug, Default)]
|
2024-10-01 16:13:08 +02:00
|
|
|
pub struct FacetFieldIdsDelta {
|
|
|
|
/// The field ids that have been modified
|
|
|
|
modified_facet_string_ids: HashSet<FieldId>,
|
|
|
|
modified_facet_number_ids: HashSet<FieldId>,
|
2024-09-16 09:34:10 +02:00
|
|
|
}
|
|
|
|
|
2024-10-01 16:13:08 +02:00
|
|
|
impl FacetFieldIdsDelta {
|
|
|
|
fn register_facet_string_id(&mut self, field_id: FieldId) {
|
|
|
|
self.modified_facet_string_ids.insert(field_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
fn register_facet_number_id(&mut self, field_id: FieldId) {
|
|
|
|
self.modified_facet_number_ids.insert(field_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
fn register_from_key(&mut self, key: &[u8]) {
|
|
|
|
let (facet_kind, field_id) = self.extract_key_data(key);
|
2024-09-16 09:34:10 +02:00
|
|
|
match facet_kind {
|
2024-10-01 16:13:08 +02:00
|
|
|
FacetKind::Number => self.register_facet_number_id(field_id),
|
|
|
|
FacetKind::String => self.register_facet_string_id(field_id),
|
|
|
|
_ => (),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-10-10 22:42:37 +02:00
|
|
|
fn extract_key_data(&self, key: &[u8]) -> (FacetKind, FieldId) {
|
2024-10-01 16:13:08 +02:00
|
|
|
let facet_kind = FacetKind::from(key[0]);
|
|
|
|
let field_id = FieldId::from_be_bytes([key[1], key[2]]);
|
|
|
|
(facet_kind, field_id)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn modified_facet_string_ids(&self) -> Option<Vec<FieldId>> {
|
|
|
|
if self.modified_facet_string_ids.is_empty() {
|
|
|
|
None
|
|
|
|
} else {
|
|
|
|
Some(self.modified_facet_string_ids.iter().copied().collect())
|
2024-09-16 09:34:10 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-10-01 16:13:08 +02:00
|
|
|
pub fn modified_facet_number_ids(&self) -> Option<Vec<FieldId>> {
|
|
|
|
if self.modified_facet_number_ids.is_empty() {
|
|
|
|
None
|
|
|
|
} else {
|
|
|
|
Some(self.modified_facet_number_ids.iter().copied().collect())
|
|
|
|
}
|
2024-09-16 09:34:10 +02:00
|
|
|
}
|
2024-10-09 11:35:45 +02:00
|
|
|
|
|
|
|
pub fn merge(mut self, rhs: Self) -> Self {
|
|
|
|
let Self { modified_facet_number_ids, modified_facet_string_ids } = rhs;
|
|
|
|
modified_facet_number_ids.into_iter().for_each(|fid| {
|
|
|
|
self.modified_facet_number_ids.insert(fid);
|
|
|
|
});
|
|
|
|
modified_facet_string_ids.into_iter().for_each(|fid| {
|
|
|
|
self.modified_facet_string_ids.insert(fid);
|
|
|
|
});
|
|
|
|
self
|
|
|
|
}
|
2024-09-16 09:34:10 +02:00
|
|
|
}
|
|
|
|
|
2024-09-02 10:42:19 +02:00
|
|
|
enum Operation {
|
|
|
|
Write(RoaringBitmap),
|
|
|
|
Delete,
|
|
|
|
Ignore,
|
|
|
|
}
|
|
|
|
|
|
|
|
/// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap.
|
|
|
|
fn merge_cbo_bitmaps(
|
|
|
|
current: Option<&[u8]>,
|
2024-10-09 11:35:45 +02:00
|
|
|
del: Option<RoaringBitmap>,
|
|
|
|
add: Option<RoaringBitmap>,
|
2024-09-02 10:42:19 +02:00
|
|
|
) -> Result<Operation> {
|
|
|
|
let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
|
|
|
|
match (current, del, add) {
|
|
|
|
(None, None, None) => Ok(Operation::Ignore), // but it's strange
|
|
|
|
(None, None, Some(add)) => Ok(Operation::Write(add)),
|
|
|
|
(None, Some(_del), None) => Ok(Operation::Ignore), // but it's strange
|
|
|
|
(None, Some(_del), Some(add)) => Ok(Operation::Write(add)),
|
|
|
|
(Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange
|
|
|
|
(Some(current), None, Some(add)) => Ok(Operation::Write(current | add)),
|
|
|
|
(Some(current), Some(del), add) => {
|
|
|
|
let output = match add {
|
2024-10-01 16:13:08 +02:00
|
|
|
Some(add) => (¤t - del) | add,
|
|
|
|
None => ¤t - del,
|
2024-09-02 10:42:19 +02:00
|
|
|
};
|
|
|
|
if output.is_empty() {
|
|
|
|
Ok(Operation::Delete)
|
2024-10-01 16:13:08 +02:00
|
|
|
} else if current == output {
|
|
|
|
Ok(Operation::Ignore)
|
2024-09-02 10:42:19 +02:00
|
|
|
} else {
|
|
|
|
Ok(Operation::Write(output))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-04 09:59:19 +02:00
|
|
|
/// TODO Return the slice directly from the serialize_into method
|
|
|
|
fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) -> &'b [u8] {
|
2024-09-02 10:42:19 +02:00
|
|
|
buffer.clear();
|
|
|
|
CboRoaringBitmapCodec::serialize_into(bitmap, buffer);
|
|
|
|
buffer.as_slice()
|
|
|
|
}
|