2024-09-04 12:17:13 +02:00
|
|
|
use std::fs::File;
|
2024-09-12 18:01:02 +02:00
|
|
|
use std::io::{self, BufWriter};
|
2024-09-04 12:17:13 +02:00
|
|
|
|
2024-09-12 18:01:02 +02:00
|
|
|
use bincode::ErrorKind;
|
2024-09-04 09:59:19 +02:00
|
|
|
use fst::{Set, SetBuilder};
|
2024-09-04 12:17:13 +02:00
|
|
|
use grenad::Merger;
|
2024-09-02 10:42:19 +02:00
|
|
|
use heed::types::Bytes;
|
2024-09-12 18:01:02 +02:00
|
|
|
use heed::{BoxedError, Database, RoTxn};
|
2024-09-04 09:59:19 +02:00
|
|
|
use memmap2::Mmap;
|
2024-09-02 10:42:19 +02:00
|
|
|
use roaring::RoaringBitmap;
|
2024-09-04 09:59:19 +02:00
|
|
|
use tempfile::tempfile;
|
2024-09-02 10:42:19 +02:00
|
|
|
|
2024-09-05 10:56:22 +02:00
|
|
|
use super::channel::*;
|
2024-09-12 18:01:02 +02:00
|
|
|
use super::{Deletion, DocumentChange, Insertion, KvReaderDelAdd, KvReaderFieldId, Update};
|
2024-09-02 10:42:19 +02:00
|
|
|
use crate::update::del_add::DelAdd;
|
|
|
|
use crate::update::new::channel::MergerOperation;
|
2024-09-04 12:17:13 +02:00
|
|
|
use crate::update::MergeDeladdCboRoaringBitmaps;
|
2024-09-12 18:01:02 +02:00
|
|
|
use crate::{
|
|
|
|
CboRoaringBitmapCodec, Error, GeoPoint, GlobalFieldsIdsMap, Index, InternalError, Result,
|
|
|
|
};
|
2024-09-02 10:42:19 +02:00
|
|
|
|
|
|
|
/// TODO We must return some infos/stats
|
2024-09-05 17:36:19 +02:00
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")]
|
2024-09-02 10:42:19 +02:00
|
|
|
pub fn merge_grenad_entries(
|
|
|
|
receiver: MergerReceiver,
|
|
|
|
sender: MergerSender,
|
|
|
|
rtxn: &RoTxn,
|
|
|
|
index: &Index,
|
2024-09-12 18:01:02 +02:00
|
|
|
mut global_fields_ids_map: GlobalFieldsIdsMap<'_>,
|
2024-09-02 10:42:19 +02:00
|
|
|
) -> Result<()> {
|
|
|
|
let mut buffer = Vec::new();
|
2024-09-04 09:59:19 +02:00
|
|
|
let mut documents_ids = index.documents_ids(rtxn)?;
|
2024-09-12 18:01:02 +02:00
|
|
|
let mut geo_extractor = GeoExtractor::new(rtxn, index)?;
|
2024-09-02 10:42:19 +02:00
|
|
|
|
|
|
|
for merger_operation in receiver {
|
|
|
|
match merger_operation {
|
2024-09-05 10:56:22 +02:00
|
|
|
MergerOperation::ExactWordDocidsMerger(merger) => {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span =
|
|
|
|
tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
|
|
|
|
let _entered = span.enter();
|
2024-09-05 10:56:22 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
merger,
|
|
|
|
/// TODO do a MergerOperation::database(&Index) -> Database<Bytes, Bytes>.
|
|
|
|
index.exact_word_docids.remap_types(),
|
|
|
|
rtxn,
|
|
|
|
&mut buffer,
|
|
|
|
sender.docids::<ExactWordDocids>(),
|
|
|
|
|_key| Ok(()),
|
|
|
|
|_key| Ok(()),
|
|
|
|
)?;
|
|
|
|
}
|
|
|
|
MergerOperation::FidWordCountDocidsMerger(merger) => {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
|
|
|
|
let _entered = span.enter();
|
2024-09-05 10:56:22 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
merger,
|
|
|
|
index.field_id_word_count_docids.remap_types(),
|
|
|
|
rtxn,
|
|
|
|
&mut buffer,
|
|
|
|
sender.docids::<FidWordCountDocids>(),
|
|
|
|
|_key| Ok(()),
|
|
|
|
|_key| Ok(()),
|
|
|
|
)?;
|
|
|
|
}
|
2024-09-03 11:02:39 +02:00
|
|
|
MergerOperation::WordDocidsMerger(merger) => {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span =
|
|
|
|
tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
|
|
|
|
let _entered = span.enter();
|
2024-09-12 11:48:00 +02:00
|
|
|
let mut add_words_fst = SetBuilder::new(BufWriter::new(tempfile()?))?;
|
|
|
|
let mut del_words_fst = SetBuilder::new(BufWriter::new(tempfile()?))?;
|
2024-09-02 10:42:19 +02:00
|
|
|
|
2024-09-04 12:17:13 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
merger,
|
|
|
|
index.word_docids.remap_types(),
|
|
|
|
rtxn,
|
|
|
|
&mut buffer,
|
|
|
|
sender.docids::<WordDocids>(),
|
|
|
|
|key| add_words_fst.insert(key),
|
|
|
|
|key| del_words_fst.insert(key),
|
|
|
|
)?;
|
2024-09-04 09:59:19 +02:00
|
|
|
|
|
|
|
// Move that into a dedicated function
|
|
|
|
let words_fst = index.words_fst(rtxn)?;
|
2024-09-04 14:30:09 +02:00
|
|
|
let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?;
|
|
|
|
sender.main().write_words_fst(mmap).unwrap();
|
2024-09-04 09:59:19 +02:00
|
|
|
}
|
2024-09-05 10:56:22 +02:00
|
|
|
MergerOperation::WordFidDocidsMerger(merger) => {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span =
|
|
|
|
tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
|
|
|
|
let _entered = span.enter();
|
2024-09-04 12:17:13 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
merger,
|
2024-09-05 10:56:22 +02:00
|
|
|
index.word_fid_docids.remap_types(),
|
2024-09-04 12:17:13 +02:00
|
|
|
rtxn,
|
|
|
|
&mut buffer,
|
2024-09-05 10:56:22 +02:00
|
|
|
sender.docids::<WordFidDocids>(),
|
2024-09-04 12:17:13 +02:00
|
|
|
|_key| Ok(()),
|
|
|
|
|_key| Ok(()),
|
|
|
|
)?;
|
|
|
|
}
|
2024-09-05 10:56:22 +02:00
|
|
|
MergerOperation::WordPairProximityDocidsMerger(merger) => {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids");
|
|
|
|
let _entered = span.enter();
|
2024-09-04 12:17:13 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
merger,
|
2024-09-05 10:56:22 +02:00
|
|
|
index.word_pair_proximity_docids.remap_types(),
|
2024-09-04 12:17:13 +02:00
|
|
|
rtxn,
|
|
|
|
&mut buffer,
|
2024-09-05 10:56:22 +02:00
|
|
|
sender.docids::<WordPairProximityDocids>(),
|
2024-09-04 12:17:13 +02:00
|
|
|
|_key| Ok(()),
|
|
|
|
|_key| Ok(()),
|
|
|
|
)?;
|
|
|
|
}
|
|
|
|
MergerOperation::WordPositionDocidsMerger(merger) => {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
|
|
|
|
let _entered = span.enter();
|
2024-09-04 12:17:13 +02:00
|
|
|
merge_and_send_docids(
|
|
|
|
merger,
|
|
|
|
index.word_position_docids.remap_types(),
|
|
|
|
rtxn,
|
|
|
|
&mut buffer,
|
|
|
|
sender.docids::<WordPositionDocids>(),
|
|
|
|
|_key| Ok(()),
|
|
|
|
|_key| Ok(()),
|
|
|
|
)?;
|
2024-09-04 11:39:53 +02:00
|
|
|
}
|
2024-09-04 09:59:19 +02:00
|
|
|
MergerOperation::InsertDocument { docid, document } => {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span =
|
|
|
|
tracing::trace_span!(target: "indexing::documents::merge", "insert_document");
|
|
|
|
let _entered = span.enter();
|
2024-09-04 09:59:19 +02:00
|
|
|
documents_ids.insert(docid);
|
|
|
|
sender.documents().uncompressed(docid, &document).unwrap();
|
2024-09-12 18:01:02 +02:00
|
|
|
|
|
|
|
if let Some(geo_extractor) = geo_extractor.as_mut() {
|
|
|
|
let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
|
|
|
|
let current: Option<&KvReaderFieldId> = current.map(Into::into);
|
|
|
|
let change = match current {
|
|
|
|
Some(current) => {
|
|
|
|
DocumentChange::Update(Update::create(docid, current.boxed(), document))
|
|
|
|
}
|
|
|
|
None => DocumentChange::Insertion(Insertion::create(docid, document)),
|
|
|
|
};
|
|
|
|
geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
|
|
|
|
}
|
2024-09-04 09:59:19 +02:00
|
|
|
}
|
|
|
|
MergerOperation::DeleteDocument { docid } => {
|
2024-09-05 17:36:19 +02:00
|
|
|
let span =
|
|
|
|
tracing::trace_span!(target: "indexing::documents::merge", "delete_document");
|
|
|
|
let _entered = span.enter();
|
2024-09-04 09:59:19 +02:00
|
|
|
if !documents_ids.remove(docid) {
|
|
|
|
unreachable!("Tried deleting a document that we do not know about");
|
|
|
|
}
|
|
|
|
sender.documents().delete(docid).unwrap();
|
2024-09-12 18:01:02 +02:00
|
|
|
|
|
|
|
if let Some(geo_extractor) = geo_extractor.as_mut() {
|
|
|
|
let current = index.document(rtxn, docid)?;
|
|
|
|
let change = DocumentChange::Deletion(Deletion::create(docid, current.boxed()));
|
|
|
|
geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
MergerOperation::FinishedDocument => {
|
|
|
|
// send the rtree
|
2024-09-02 10:42:19 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-05 17:36:19 +02:00
|
|
|
{
|
|
|
|
let span = tracing::trace_span!(target: "indexing::documents::merge", "documents_ids");
|
|
|
|
let _entered = span.enter();
|
|
|
|
|
|
|
|
// Send the documents ids unionized with the current one
|
|
|
|
/// TODO return the slice of bytes directly
|
|
|
|
serialize_bitmap_into_vec(&documents_ids, &mut buffer);
|
|
|
|
sender.send_documents_ids(&buffer).unwrap();
|
|
|
|
}
|
2024-09-04 09:59:19 +02:00
|
|
|
|
|
|
|
// ...
|
|
|
|
|
2024-09-02 10:42:19 +02:00
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2024-09-12 18:01:02 +02:00
|
|
|
pub struct GeoExtractor {
|
|
|
|
rtree: Option<rstar::RTree<GeoPoint>>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl GeoExtractor {
|
|
|
|
pub fn new(rtxn: &RoTxn, index: &Index) -> Result<Option<Self>> {
|
|
|
|
let is_sortable = index.sortable_fields(rtxn)?.contains("_geo");
|
|
|
|
let is_filterable = index.filterable_fields(rtxn)?.contains("_geo");
|
|
|
|
if is_sortable || is_filterable {
|
|
|
|
Ok(Some(GeoExtractor { rtree: index.geo_rtree(rtxn)? }))
|
|
|
|
} else {
|
|
|
|
Ok(None)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn manage_change(
|
|
|
|
&mut self,
|
|
|
|
fidmap: &mut GlobalFieldsIdsMap,
|
|
|
|
change: &DocumentChange,
|
|
|
|
) -> Result<()> {
|
|
|
|
match change {
|
|
|
|
DocumentChange::Deletion(_) => todo!(),
|
|
|
|
DocumentChange::Update(_) => todo!(),
|
|
|
|
DocumentChange::Insertion(_) => todo!(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn serialize_rtree<W: io::Write>(self, writer: &mut W) -> Result<bool> {
|
|
|
|
match self.rtree {
|
|
|
|
Some(rtree) => {
|
|
|
|
// TODO What should I do?
|
|
|
|
bincode::serialize_into(writer, &rtree).map(|_| true).map_err(|e| match *e {
|
|
|
|
ErrorKind::Io(e) => Error::IoError(e),
|
|
|
|
ErrorKind::InvalidUtf8Encoding(_) => todo!(),
|
|
|
|
ErrorKind::InvalidBoolEncoding(_) => todo!(),
|
|
|
|
ErrorKind::InvalidCharEncoding => todo!(),
|
|
|
|
ErrorKind::InvalidTagEncoding(_) => todo!(),
|
|
|
|
ErrorKind::DeserializeAnyNotSupported => todo!(),
|
|
|
|
ErrorKind::SizeLimit => todo!(),
|
|
|
|
ErrorKind::SequenceMustHaveLength => todo!(),
|
|
|
|
ErrorKind::Custom(_) => todo!(),
|
|
|
|
})
|
|
|
|
}
|
|
|
|
None => Ok(false),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-04 14:30:09 +02:00
|
|
|
fn compute_new_words_fst(
|
2024-09-12 11:48:00 +02:00
|
|
|
add_words_fst: SetBuilder<BufWriter<File>>,
|
|
|
|
del_words_fst: SetBuilder<BufWriter<File>>,
|
2024-09-04 14:30:09 +02:00
|
|
|
words_fst: Set<std::borrow::Cow<'_, [u8]>>,
|
|
|
|
) -> Result<Mmap> {
|
|
|
|
let add_words_fst_file = add_words_fst.into_inner()?;
|
2024-09-12 11:48:00 +02:00
|
|
|
let add_words_fst_mmap = unsafe { Mmap::map(&add_words_fst_file.into_inner().unwrap())? };
|
2024-09-04 14:30:09 +02:00
|
|
|
let add_words_fst = Set::new(&add_words_fst_mmap)?;
|
|
|
|
|
|
|
|
let del_words_fst_file = del_words_fst.into_inner()?;
|
2024-09-12 11:48:00 +02:00
|
|
|
let del_words_fst_mmap = unsafe { Mmap::map(&del_words_fst_file.into_inner().unwrap())? };
|
2024-09-04 14:30:09 +02:00
|
|
|
let del_words_fst = Set::new(&del_words_fst_mmap)?;
|
|
|
|
|
|
|
|
let diff = words_fst.op().add(&del_words_fst).difference();
|
|
|
|
let stream = add_words_fst.op().add(diff).union();
|
|
|
|
|
|
|
|
let mut words_fst = SetBuilder::new(tempfile()?)?;
|
|
|
|
words_fst.extend_stream(stream)?;
|
|
|
|
let words_fst_file = words_fst.into_inner()?;
|
|
|
|
let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? };
|
|
|
|
|
|
|
|
Ok(words_fst_mmap)
|
|
|
|
}
|
|
|
|
|
2024-09-05 17:36:19 +02:00
|
|
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
2024-09-04 12:17:13 +02:00
|
|
|
fn merge_and_send_docids<D: DatabaseType>(
|
|
|
|
merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
|
|
|
|
database: Database<Bytes, Bytes>,
|
|
|
|
rtxn: &RoTxn<'_>,
|
|
|
|
buffer: &mut Vec<u8>,
|
|
|
|
word_docids_sender: DocidsSender<'_, D>,
|
|
|
|
mut add_key: impl FnMut(&[u8]) -> fst::Result<()>,
|
|
|
|
mut del_key: impl FnMut(&[u8]) -> fst::Result<()>,
|
|
|
|
) -> Result<()> {
|
|
|
|
let mut merger_iter = merger.into_stream_merger_iter().unwrap();
|
|
|
|
while let Some((key, deladd)) = merger_iter.next().unwrap() {
|
|
|
|
let current = database.get(rtxn, key)?;
|
|
|
|
let deladd: &KvReaderDelAdd = deladd.into();
|
|
|
|
let del = deladd.get(DelAdd::Deletion);
|
|
|
|
let add = deladd.get(DelAdd::Addition);
|
|
|
|
|
|
|
|
match merge_cbo_bitmaps(current, del, add)? {
|
|
|
|
Operation::Write(bitmap) => {
|
|
|
|
let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer);
|
|
|
|
word_docids_sender.write(key, value).unwrap();
|
|
|
|
add_key(key)?;
|
|
|
|
}
|
|
|
|
Operation::Delete => {
|
|
|
|
word_docids_sender.delete(key).unwrap();
|
|
|
|
del_key(key)?;
|
|
|
|
}
|
|
|
|
Operation::Ignore => (),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2024-09-02 10:42:19 +02:00
|
|
|
enum Operation {
|
|
|
|
Write(RoaringBitmap),
|
|
|
|
Delete,
|
|
|
|
Ignore,
|
|
|
|
}
|
|
|
|
|
|
|
|
/// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap.
|
|
|
|
fn merge_cbo_bitmaps(
|
|
|
|
current: Option<&[u8]>,
|
|
|
|
del: Option<&[u8]>,
|
|
|
|
add: Option<&[u8]>,
|
|
|
|
) -> Result<Operation> {
|
|
|
|
let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
|
|
|
|
let del = del.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
|
|
|
|
let add = add.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
|
|
|
|
|
|
|
|
match (current, del, add) {
|
|
|
|
(None, None, None) => Ok(Operation::Ignore), // but it's strange
|
|
|
|
(None, None, Some(add)) => Ok(Operation::Write(add)),
|
|
|
|
(None, Some(_del), None) => Ok(Operation::Ignore), // but it's strange
|
|
|
|
(None, Some(_del), Some(add)) => Ok(Operation::Write(add)),
|
|
|
|
(Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange
|
|
|
|
(Some(current), None, Some(add)) => Ok(Operation::Write(current | add)),
|
|
|
|
(Some(current), Some(del), add) => {
|
|
|
|
let output = match add {
|
|
|
|
Some(add) => (current - del) | add,
|
|
|
|
None => current - del,
|
|
|
|
};
|
|
|
|
if output.is_empty() {
|
|
|
|
Ok(Operation::Delete)
|
|
|
|
} else {
|
|
|
|
Ok(Operation::Write(output))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-09-04 09:59:19 +02:00
|
|
|
/// TODO Return the slice directly from the serialize_into method
|
|
|
|
fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) -> &'b [u8] {
|
2024-09-02 10:42:19 +02:00
|
|
|
buffer.clear();
|
|
|
|
CboRoaringBitmapCodec::serialize_into(bitmap, buffer);
|
|
|
|
buffer.as_slice()
|
|
|
|
}
|
2024-09-04 09:59:19 +02:00
|
|
|
|
|
|
|
/// TODO Return the slice directly from the serialize_into method
|
2024-09-04 17:03:09 +02:00
|
|
|
fn serialize_bitmap_into_vec(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) {
|
2024-09-04 09:59:19 +02:00
|
|
|
buffer.clear();
|
|
|
|
bitmap.serialize_into(buffer).unwrap();
|
|
|
|
// buffer.as_slice()
|
|
|
|
}
|