WIP: reset documents in TypedChunk::Documents

This commit is contained in:
Louis Dureuil 2023-10-24 14:26:49 +02:00
parent cda6ca1ee6
commit 946c762d28
No known key found for this signature in database
2 changed files with 25 additions and 18 deletions

View File

@ -35,7 +35,7 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError}; use crate::error::{Error, InternalError, UserError};
pub use crate::update::index_documents::helpers::CursorClonableMmap; pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{ use crate::update::{
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
}; };
use crate::{CboRoaringBitmapCodec, Index, Result}; use crate::{CboRoaringBitmapCodec, Index, Result};
@ -374,17 +374,6 @@ where
drop(lmdb_writer_sx) drop(lmdb_writer_sx)
}); });
// We delete the documents that this document addition replaces. This way we are
// able to simply insert all the documents even if they already exist in the database.
if !replaced_documents_ids.is_empty() {
let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?;
deletion_builder.strategy(self.config.deletion_strategy);
debug!("documents to delete {:?}", replaced_documents_ids);
deletion_builder.delete_documents(&replaced_documents_ids);
let deleted_documents_result = deletion_builder.execute_inner()?;
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
}
let index_documents_ids = self.index.documents_ids(self.wtxn)?; let index_documents_ids = self.index.documents_ids(self.wtxn)?;
let index_is_empty = index_documents_ids.is_empty(); let index_is_empty = index_documents_ids.is_empty();
let mut final_documents_ids = RoaringBitmap::new(); let mut final_documents_ids = RoaringBitmap::new();
@ -437,6 +426,7 @@ where
otherwise => otherwise, otherwise => otherwise,
}; };
// FIXME: return newly added as well as newly deleted documents
let (docids, is_merged_database) = let (docids, is_merged_database) =
write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?; write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
if !docids.is_empty() { if !docids.is_empty() {
@ -472,8 +462,9 @@ where
let external_documents_ids = external_documents_ids.into_static(); let external_documents_ids = external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
// FIXME: remove both `new_documents_ids` and `replaced_documents_ids` entirely
let all_documents_ids = index_documents_ids | new_documents_ids; let all_documents_ids = index_documents_ids | new_documents_ids;
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; //self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
// TODO: reactivate prefix DB with diff-indexing // TODO: reactivate prefix DB with diff-indexing
// self.execute_prefix_databases( // self.execute_prefix_databases(

View File

@ -118,23 +118,39 @@ pub(crate) fn write_typed_chunk_into_index(
let mut is_merged_database = false; let mut is_merged_database = false;
match typed_chunk { match typed_chunk {
TypedChunk::Documents(obkv_documents_iter) => { TypedChunk::Documents(obkv_documents_iter) => {
let mut docids = index.documents_ids(wtxn)?;
let mut cursor = obkv_documents_iter.into_cursor()?; let mut cursor = obkv_documents_iter.into_cursor()?;
while let Some((docid, reader)) = cursor.move_on_next()? { while let Some((docid, reader)) = cursor.move_on_next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
let reader: KvReader<FieldId> = KvReader::new(reader); let reader: KvReader<FieldId> = KvReader::new(reader);
let mut written = false;
for (field_id, value) in reader.iter() { for (field_id, value) in reader.iter() {
let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else { let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else {
continue; continue;
}; };
// TODO: replace the `written` flag with a `writer.is_empty()` check, if the writer can expose one
written = true;
writer.insert(field_id, value)?; writer.insert(field_id, value)?;
} }
index.documents.remap_types::<ByteSlice, ByteSlice>().put(
wtxn, let db = index.documents.remap_data_type::<ByteSlice>();
docid, let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap();
&writer.into_inner().unwrap(),
)?; if written {
db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?;
docids.insert(docid);
} else {
db.delete(wtxn, &BEU32::new(docid))?;
// FIXME: this panics like an unwrap; return a proper error instead
if !docids.remove(docid) {
panic!("Attempt to remove a document id that doesn't exist")
} }
} }
}
index.put_documents_ids(wtxn, &docids)?;
}
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
append_entries_into_database( append_entries_into_database(
fid_word_count_docids_iter, fid_word_count_docids_iter,