2024-08-28 18:45:16 +02:00
mod document_change;
// mod extract;
mod channel;
mod items_pool;

// TODO remove this
// mod global_fields_ids_map;

/// Alias for the standard library's two-parameter `Result`, kept distinct
/// from the crate-local `Result` (which fixes the error type).
pub type StdResult<T, E> = std::result::Result<T, E>;
2024-08-28 18:45:16 +02:00
mod indexer {
2024-08-29 12:06:44 +02:00
use std ::borrow ::Cow ;
2024-08-28 18:45:16 +02:00
use std ::collections ::{ BTreeMap , HashMap } ;
use std ::fs ::File ;
use std ::io ::Cursor ;
use std ::os ::unix ::fs ::MetadataExt ;
use std ::sync ::Arc ;
2024-08-29 18:27:02 +02:00
use std ::thread ;
2024-08-28 18:45:16 +02:00
2024-08-29 18:27:02 +02:00
use big_s ::S ;
2024-08-29 12:06:44 +02:00
use heed ::types ::Bytes ;
2024-08-29 18:27:02 +02:00
use heed ::{ RoTxn , RwTxn } ;
2024-08-28 18:45:16 +02:00
use memmap2 ::Mmap ;
2024-08-30 15:07:21 +02:00
use obkv ::KvWriter ;
2024-08-28 18:45:16 +02:00
use rayon ::iter ::{ IntoParallelIterator , ParallelBridge , ParallelIterator } ;
2024-08-30 14:34:24 +02:00
use rayon ::ThreadPool ;
2024-08-28 18:45:16 +02:00
use roaring ::RoaringBitmap ;
use serde_json ::Value ;
2024-08-29 18:27:02 +02:00
use super ::channel ::{
extractors_merger_channels , merger_writer_channels , EntryOperation ,
2024-08-29 19:20:10 +02:00
ExtractorsMergerChannels , MergerReceiver , MergerSender , WriterOperation ,
2024-08-29 18:27:02 +02:00
} ;
2024-08-29 14:08:31 +02:00
use super ::document_change ::{ Deletion , DocumentChange , Insertion , Update } ;
2024-08-28 18:45:16 +02:00
use super ::items_pool ::ItemsPool ;
use crate ::documents ::{
obkv_to_object , DocumentIdExtractionError , DocumentsBatchReader , PrimaryKey ,
} ;
2024-08-30 15:07:21 +02:00
use crate ::update ::concurrent_available_ids ::ConcurrentAvailableIds ;
2024-08-29 17:51:42 +02:00
use crate ::update ::del_add ::DelAdd ;
use crate ::update ::new ::channel ::MergerOperation ;
2024-08-30 15:07:21 +02:00
use crate ::update ::{ AvailableIds , IndexDocumentsMethod , MergeDeladdCboRoaringBitmaps } ;
2024-08-28 18:45:16 +02:00
use crate ::{
2024-08-30 15:07:21 +02:00
all_obkv_to_json , obkv_to_json , CboRoaringBitmapCodec , DocumentId , Error , FieldId ,
FieldsIdsMap , Index , InternalError , Object , Result , UserError ,
2024-08-28 18:45:16 +02:00
} ;
2024-08-29 19:20:10 +02:00
/// An obkv reader keyed by `FieldId`, i.e. one stored document.
pub type KvReaderFieldId = obkv::KvReader<FieldId>;
/// An obkv reader keyed by `DelAdd` (the deletion/addition sides of a change).
pub type KvReaderDelAdd = obkv::KvReader<DelAdd>;
/// An obkv writer keyed by `FieldId`, used to build document payloads.
pub type KvWriterFieldId<W> = obkv::KvWriter<W, FieldId>;
/// An obkv writer keyed by `DelAdd`.
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
2024-08-28 18:45:16 +02:00
/// Accumulates raw document payloads (additions and deletions) and later
/// turns them into a stream of `DocumentChange`s.
pub struct DocumentOperationIndexer {
    /// The payloads, in the order they were pushed.
    operations: Vec<Payload>,
    /// Whether additions replace or update already-indexed documents.
    index_documents_method: IndexDocumentsMethod,
}

/// A whole user payload: either a file of documents to add or a list of
/// external document ids to delete.
enum Payload {
    Addition(File),
    Deletion(Vec<String>),
}

/// Statistics about a single addition payload.
pub struct PayloadStats {
    pub document_count: usize,
    pub bytes: u64,
}

/// A per-document operation extracted from the payloads.
enum DocumentOperation {
    Addition(DocumentOffset),
    Deletion,
}

/// Represents an offset where a document lives
/// in an mmapped grenad reader file.
struct DocumentOffset {
    /// The mmapped grenad reader file.
    pub content: Arc<Mmap>, // grenad::Reader
    /// The offset of the document in the file.
    pub offset: u32,
}
impl DocumentOperationIndexer {
    /// Creates an empty indexer that will apply additions with `method`.
    pub fn new(method: IndexDocumentsMethod) -> Self {
        Self { operations: Default::default(), index_documents_method: method }
    }

    /// TODO please give me a type
    /// The payload is expected to be in the grenad format
    pub fn add_documents(&mut self, payload: File) -> Result<PayloadStats> {
        // The reader is only built here to validate the payload and count documents.
        let reader = DocumentsBatchReader::from_reader(&payload)?;
        let bytes = payload.metadata()?.size();
        let document_count = reader.documents_count() as usize;
        self.operations.push(Payload::Addition(payload));
        Ok(PayloadStats { bytes, document_count })
    }

    /// Registers a deletion payload of external document ids.
    pub fn delete_documents(&mut self, to_delete: Vec<String>) {
        self.operations.push(Payload::Deletion(to_delete))
    }

    /// Consumes the accumulated payloads and returns a parallel iterator of
    /// `DocumentChange`s, one item per distinct external document id.
    ///
    /// Every field name found in the addition payloads is inserted into
    /// `fields_ids_map`. Items yield `Ok(None)` when the merged operations
    /// turn out to be a no-op for that document.
    pub fn document_changes<'a>(
        self,
        index: &'a Index,
        rtxn: &'a RoTxn,
        fields_ids_map: &'a mut FieldsIdsMap,
        primary_key: &'a PrimaryKey<'a>,
    ) -> Result<impl ParallelIterator<Item = Result<Option<DocumentChange>>> + 'a> {
        let documents_ids = index.documents_ids(rtxn)?;
        let mut available_docids = AvailableIds::new(&documents_ids);
        // external id -> (internal docid, ordered list of operations on it)
        let mut docids_version_offsets = HashMap::<String, _>::new();

        for operation in self.operations {
            match operation {
                Payload::Addition(payload) => {
                    // NOTE(review): mmaps the payload file; assumes the file is not
                    // truncated/modified while mapped — confirm callers guarantee this.
                    let content = unsafe { Mmap::map(&payload).map(Arc::new)? };
                    let cursor = Cursor::new(content.as_ref());
                    let reader = DocumentsBatchReader::from_reader(cursor)?;

                    let (mut batch_cursor, batch_index) = reader.into_cursor_and_fields_index();
                    // TODO Fetch all document fields to fill the fields ids map
                    batch_index.iter().for_each(|(_, name)| {
                        fields_ids_map.insert(name);
                    });

                    let mut offset: u32 = 0;
                    while let Some(document) = batch_cursor.next_document()? {
                        // Extract the external id or surface the matching user error.
                        let external_document_id =
                            match primary_key.document_id(document, &batch_index)? {
                                Ok(document_id) => Ok(document_id),
                                Err(DocumentIdExtractionError::InvalidDocumentId(
                                    user_error,
                                )) => Err(user_error),
                                Err(DocumentIdExtractionError::MissingDocumentId) => {
                                    Err(UserError::MissingDocumentId {
                                        primary_key: primary_key.name().to_string(),
                                        document: obkv_to_object(document, &batch_index)?,
                                    })
                                }
                                Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
                                    Err(UserError::TooManyDocumentIds {
                                        primary_key: primary_key.name().to_string(),
                                        document: obkv_to_object(document, &batch_index)?,
                                    })
                                }
                            }?;

                        let content = content.clone();
                        let document_offset = DocumentOffset { content, offset };
                        let document_operation = DocumentOperation::Addition(document_offset);

                        match docids_version_offsets.get_mut(&external_document_id) {
                            None => {
                                // First time we see this external id: reuse its current
                                // internal docid or allocate a fresh one.
                                let docid = match index
                                    .external_documents_ids()
                                    .get(rtxn, &external_document_id)?
                                {
                                    Some(docid) => docid,
                                    None => available_docids.next().ok_or(Error::UserError(
                                        UserError::DocumentLimitReached,
                                    ))?,
                                };

                                docids_version_offsets.insert(
                                    external_document_id,
                                    (docid, vec![document_operation]),
                                );
                            }
                            Some((_, offsets)) => offsets.push(document_operation),
                        }
                        offset += 1;
                    }
                }
                Payload::Deletion(to_delete) => {
                    for external_document_id in to_delete {
                        match docids_version_offsets.get_mut(&external_document_id) {
                            None => {
                                let docid = match index
                                    .external_documents_ids()
                                    .get(rtxn, &external_document_id)?
                                {
                                    Some(docid) => docid,
                                    None => available_docids.next().ok_or(Error::UserError(
                                        UserError::DocumentLimitReached,
                                    ))?,
                                };

                                docids_version_offsets.insert(
                                    external_document_id,
                                    (docid, vec![DocumentOperation::Deletion]),
                                );
                            }
                            Some((_, offsets)) => offsets.push(DocumentOperation::Deletion),
                        }
                    }
                }
            }
        }

        // Merge each document's operation list in parallel, sharing a pool of
        // read transactions between the rayon workers.
        Ok(docids_version_offsets.into_par_iter().map_with(
            Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))),
            move |context_pool, (external_docid, (internal_docid, operations))| {
                context_pool.with(|rtxn| {
                    use IndexDocumentsMethod as Idm;

                    // Pick the merge strategy matching the configured indexing method.
                    let document_merge_function = match self.index_documents_method {
                        Idm::ReplaceDocuments => merge_document_for_replacements,
                        Idm::UpdateDocuments => merge_document_for_updates,
                    };

                    document_merge_function(
                        rtxn,
                        index,
                        fields_ids_map,
                        internal_docid,
                        external_docid,
                        &operations,
                    )
                })
            },
        ))
    }
}
pub struct DeleteDocumentIndexer {
to_delete : RoaringBitmap ,
}
impl DeleteDocumentIndexer {
pub fn new ( ) -> Self {
Self { to_delete : Default ::default ( ) }
}
pub fn delete_documents_by_docids ( & mut self , docids : RoaringBitmap ) {
self . to_delete | = docids ;
}
// let fields = index.fields_ids_map(rtxn)?;
// let primary_key =
// index.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry {
// db_name: db_name::MAIN,
// key: Some(main_key::PRIMARY_KEY_KEY),
// })?;
// let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| {
// InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName {
// field_name: primary_key.to_owned(),
// process: "external_id_of",
// })
// })?;
2024-08-30 14:34:24 +02:00
pub fn document_changes < ' a > (
2024-08-28 18:45:16 +02:00
self ,
index : & ' a Index ,
fields : & ' a FieldsIdsMap ,
primary_key : & ' a PrimaryKey < ' a > ,
2024-08-29 14:08:31 +02:00
) -> Result < impl ParallelIterator < Item = Result < DocumentChange > > + ' a > {
2024-08-28 18:45:16 +02:00
let items = Arc ::new ( ItemsPool ::new ( | | index . read_txn ( ) . map_err ( crate ::Error ::from ) ) ) ;
Ok ( self . to_delete . into_iter ( ) . par_bridge ( ) . map_with ( items , | items , docid | {
items . with ( | rtxn | {
2024-08-29 14:08:31 +02:00
let current = index . document ( rtxn , docid ) ? ;
2024-08-30 14:34:24 +02:00
let external_docid = match primary_key . document_id ( current , fields ) ? {
2024-08-28 18:45:16 +02:00
Ok ( document_id ) = > Ok ( document_id ) as Result < _ > ,
Err ( _ ) = > Err ( InternalError ::DocumentsError (
crate ::documents ::Error ::InvalidDocumentFormat ,
)
. into ( ) ) ,
} ? ;
2024-08-29 12:06:44 +02:00
2024-08-30 10:03:54 +02:00
Ok ( DocumentChange ::Deletion ( Deletion ::create (
docid ,
external_docid ,
current . boxed ( ) ,
) ) )
2024-08-28 18:45:16 +02:00
} )
} ) )
}
}
2024-08-30 15:07:21 +02:00
/// An indexer that creates brand new documents from an iterator of JSON
/// objects, e.g. a chunk of JSON-lines from a dump.
pub struct PartialDumpIndexer<I> {
    /// The iterator of documents to insert.
    iter: I,
}

impl<I> PartialDumpIndexer<I>
where
    I: IntoIterator<Item = Object>,
    I::IntoIter: Send,
    I::Item: Send,
{
    /// Wraps the given iterator of JSON objects.
    pub fn new_from_jsonlines(iter: I) -> Self {
        PartialDumpIndexer { iter }
    }

    /// Note for future self:
    /// - the field ids map must already be valid so you must have to generate it beforehand.
    /// - We should probably expose another method that generates the fields ids map from an iterator of JSON objects.
    /// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items).
    pub fn document_changes<'a>(
        self,
        fields_ids_map: &'a FieldsIdsMap,
        concurrent_available_ids: &'a ConcurrentAvailableIds,
        primary_key: &'a PrimaryKey<'a>,
    ) -> impl ParallelIterator<Item = Result<Option<DocumentChange>>> + 'a
    where
        // I don't like this, it will not fit in the future trait easily
        I::IntoIter: 'a,
    {
        self.iter.into_iter().par_bridge().map(|object| {
            // Allocate a fresh internal docid; fail once the id space is exhausted.
            let docid = match concurrent_available_ids.next() {
                Some(id) => id,
                None => return Err(Error::UserError(UserError::DocumentLimitReached)),
            };

            // Serialize the object into an obkv document keyed by field id.
            // NOTE(review): fields are inserted in the object's iteration order;
            // presumably the writer expects ascending field ids — confirm.
            let mut writer = KvWriterFieldId::memory();
            object.iter().for_each(|(key, value)| {
                let key = fields_ids_map.id(key).unwrap();
                // TODO better error management
                let value = serde_json::to_vec(&value).unwrap();
                writer.insert(key, value).unwrap();
            });

            let document = writer.into_boxed();
            // Extract the external id or surface the matching user error.
            let external_docid = match primary_key.document_id(&document, fields_ids_map)? {
                Ok(document_id) => Ok(document_id),
                Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => {
                    Err(user_error)
                }
                Err(DocumentIdExtractionError::MissingDocumentId) => {
                    Err(UserError::MissingDocumentId {
                        primary_key: primary_key.name().to_string(),
                        document: all_obkv_to_json(&document, fields_ids_map)?,
                    })
                }
                Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
                    Err(UserError::TooManyDocumentIds {
                        primary_key: primary_key.name().to_string(),
                        document: all_obkv_to_json(&document, fields_ids_map)?,
                    })
                }
            }?;

            let insertion = Insertion::create(docid, external_docid, document);
            Ok(Some(DocumentChange::Insertion(insertion)))
        })
    }
}
pub struct UpdateByFunctionIndexer ;
2024-08-29 18:27:02 +02:00
/// Runs the indexing pipeline over `document_changes`: an extraction stage on
/// `pool`, a merger stage on a dedicated thread, and the LMDB writes on the
/// calling thread through `wtxn`.
///
/// TODO return stats
/// TODO take the rayon ThreadPool
pub fn index<PI>(
    wtxn: &mut RwTxn,
    index: &Index,
    pool: &ThreadPool,
    document_changes: PI,
) -> Result<()>
where
    PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send,
    PI::Iter: Clone,
{
    // Channels wiring the pipeline stages together (100 is presumably a
    // capacity bound — confirm in the `channel` module).
    let (merger_sender, writer_receiver) = merger_writer_channels(100);
    let ExtractorsMergerChannels { merger_receiver, deladd_cbo_roaring_bitmap_sender } =
        extractors_merger_channels(100);

    thread::scope(|s| {
        // Extraction stage: currently only drains the changes without
        // extracting anything (placeholder body).
        thread::Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || {
            pool.in_place_scope(|_s| {
                document_changes.into_par_iter().for_each(|_dc| ());
            })
        })?;

        // TODO manage the errors correctly
        // Merger stage: merges extracted entries and forwards write/delete
        // operations to the writer channel.
        thread::Builder::new().name(S("indexer-merger")).spawn_scoped(s, || {
            let rtxn = index.read_txn().unwrap();
            merge_grenad_entries(merger_receiver, merger_sender, &rtxn, index).unwrap()
        })?;

        // TODO Split this code into another function
        // Writer stage (this thread): applies every received operation to LMDB.
        for operation in writer_receiver {
            let database = operation.database(index);
            match operation {
                WriterOperation::WordDocids(operation) => match operation {
                    EntryOperation::Delete(e) => database.delete(wtxn, e.entry()).map(drop)?,
                    EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?,
                },
                WriterOperation::Document(e) => database.put(wtxn, &e.key(), e.content())?,
            }
        }

        Ok(())
    })
}
2024-08-29 17:51:42 +02:00
/// The outcome of merging DelAdd bitmaps with the bitmap currently stored
/// in the database for a key.
enum Operation {
    /// Store this merged (non-empty) bitmap under the key.
    Write(RoaringBitmap),
    /// Remove the key: the merged bitmap ended up empty.
    Delete,
    /// Nothing to do: no current value and nothing to add.
    Ignore,
}
/// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap.
fn merge_cbo_bitmaps (
current : Option < & [ u8 ] > ,
del : Option < & [ u8 ] > ,
add : Option < & [ u8 ] > ,
) -> Result < Operation > {
let bitmap = match current {
Some ( current_bitmap_bytes ) = > {
let bitmap_without_del = match del {
Some ( del_bytes ) = > {
let del_bitmap = CboRoaringBitmapCodec ::deserialize_from ( del_bytes ) ? ;
CboRoaringBitmapCodec ::intersection_with_serialized (
current_bitmap_bytes ,
& del_bitmap ,
) ?
}
None = > CboRoaringBitmapCodec ::deserialize_from ( current_bitmap_bytes ) ? ,
} ;
match add {
Some ( add_bytes ) = > {
let add = CboRoaringBitmapCodec ::deserialize_from ( add_bytes ) ? ;
bitmap_without_del | add
}
None = > bitmap_without_del ,
}
}
None = > match add {
Some ( add_bytes ) = > CboRoaringBitmapCodec ::deserialize_from ( add_bytes ) ? ,
None = > return Ok ( Operation ::Ignore ) ,
} ,
} ;
if bitmap . is_empty ( ) {
Ok ( Operation ::Delete )
} else {
Ok ( Operation ::Write ( bitmap ) )
}
}
/// Return the slice directly from the serialize_into method
fn cbo_serialize_into_vec < ' b > ( bitmap : & RoaringBitmap , buffer : & ' b mut Vec < u8 > ) -> & ' b [ u8 ] {
buffer . clear ( ) ;
CboRoaringBitmapCodec ::serialize_into ( bitmap , buffer ) ;
buffer . as_slice ( )
}
/// Drains the merger channel: for each batch of word-docids cursors, merges
/// the DelAdd bitmaps with the current database content and forwards the
/// resulting write/delete operations to `sender`.
///
/// TODO We must return some infos/stats
fn merge_grenad_entries(
    receiver: MergerReceiver,
    sender: MergerSender,
    rtxn: &RoTxn,
    index: &Index,
) -> Result<()> {
    // Scratch buffer reused for every serialized bitmap.
    let mut buffer = Vec::new();

    for merger_operation in receiver {
        match merger_operation {
            MergerOperation::WordDocidsCursors(cursors) => {
                let sender = sender.word_docids();
                let database = index.word_docids.remap_types::<Bytes, Bytes>();

                let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
                builder.extend(cursors);
                // TODO manage the error correctly
                let mut merger_iter = builder.build().into_stream_merger_iter().unwrap();

                // TODO manage the error correctly
                while let Some((key, deladd)) = merger_iter.next().unwrap() {
                    let current = database.get(rtxn, key)?;
                    let deladd: &KvReaderDelAdd = deladd.into();
                    let del = deladd.get(DelAdd::Deletion);
                    let add = deladd.get(DelAdd::Addition);

                    match merge_cbo_bitmaps(current, del, add)? {
                        Operation::Write(bitmap) => {
                            let value = cbo_serialize_into_vec(&bitmap, &mut buffer);
                            sender.write(key, value).unwrap();
                        }
                        Operation::Delete => sender.delete(key).unwrap(),
                        Operation::Ignore => (),
                    }
                }
            }
        }
    }

    Ok(())
}
2024-08-29 15:07:59 +02:00
2024-08-28 18:45:16 +02:00
/// Reads the previous version of a document from the database, the new versions
/// in the grenad update files and merges them to generate a new boxed obkv.
///
/// This function is only meant to be used when doing an update and not a replacement.
fn merge_document_for_updates(
    rtxn: &RoTxn,
    index: &Index,
    fields_ids_map: &FieldsIdsMap,
    docid: DocumentId,
    external_docid: String,
    operations: &[DocumentOperation],
) -> Result<Option<DocumentChange>> {
    // field id -> value of the document being built, seeded with the
    // currently stored version (if any).
    let mut document = BTreeMap::<_, Cow<_>>::new();
    let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
    let current: Option<&KvReaderFieldId> = current.map(Into::into);

    if let Some(current) = current {
        current.into_iter().for_each(|(k, v)| {
            document.insert(k, v.into());
        });
    }

    // Only the operations after the last deletion matter: a deletion wipes
    // every earlier version of the document.
    let last_deletion = operations
        .iter()
        .rposition(|operation| matches!(operation, DocumentOperation::Deletion));
    let operations = &operations[last_deletion.map_or(0, |i| i + 1)..];

    if operations.is_empty() {
        // No additions survive: the document is deleted if it existed,
        // otherwise there is nothing to do.
        match current {
            Some(current) => {
                return Ok(Some(DocumentChange::Deletion(Deletion::create(
                    docid,
                    external_docid,
                    current.boxed(),
                ))));
            }
            None => return Ok(None),
        }
    }

    // Apply every remaining addition on top, later values overwriting earlier ones.
    for operation in operations {
        let DocumentOffset { content, offset } = match operation {
            DocumentOperation::Addition(offset) => offset,
            DocumentOperation::Deletion => unreachable!("Deletion in document operations"),
        };

        let reader = DocumentsBatchReader::from_reader(Cursor::new(content.as_ref()))?;
        let (mut cursor, batch_index) = reader.into_cursor_and_fields_index();
        let update = cursor.get(*offset)?.expect("must exists");

        update.into_iter().for_each(|(k, v)| {
            // Remap the payload-local field id to the index-wide one.
            let field_name = batch_index.name(k).unwrap();
            let id = fields_ids_map.id(field_name).unwrap();
            document.insert(id, v.to_vec().into());
        });
    }

    // BTreeMap iteration is ordered by field id, as the obkv writer requires.
    let mut writer = KvWriterFieldId::memory();
    document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap());
    let new = writer.into_boxed();

    match current {
        Some(current) => {
            let update = Update::create(docid, external_docid, current.boxed(), new);
            Ok(Some(DocumentChange::Update(update)))
        }
        None => {
            let insertion = Insertion::create(docid, external_docid, new);
            Ok(Some(DocumentChange::Insertion(insertion)))
        }
    }
}
2024-08-29 14:08:31 +02:00
/// Returns only the most recent version of a document based on the updates from the payloads.
///
/// This function is only meant to be used when doing a replacement and not an update.
2024-08-29 15:07:59 +02:00
fn merge_document_for_replacements (
2024-08-29 14:08:31 +02:00
rtxn : & RoTxn ,
index : & Index ,
fields_ids_map : & FieldsIdsMap ,
docid : DocumentId ,
external_docid : String ,
operations : & [ DocumentOperation ] ,
) -> Result < Option < DocumentChange > > {
let current = index . documents . remap_data_type ::< Bytes > ( ) . get ( rtxn , & docid ) ? ;
let current : Option < & KvReaderFieldId > = current . map ( Into ::into ) ;
match operations . last ( ) {
Some ( DocumentOperation ::Addition ( DocumentOffset { content , offset } ) ) = > {
let reader = DocumentsBatchReader ::from_reader ( Cursor ::new ( content . as_ref ( ) ) ) ? ;
let ( mut cursor , batch_index ) = reader . into_cursor_and_fields_index ( ) ;
let update = cursor . get ( * offset ) ? . expect ( " must exists " ) ;
let mut document_entries = Vec ::new ( ) ;
update . into_iter ( ) . for_each ( | ( k , v ) | {
let field_name = batch_index . name ( k ) . unwrap ( ) ;
let id = fields_ids_map . id ( field_name ) . unwrap ( ) ;
document_entries . push ( ( id , v ) ) ;
} ) ;
document_entries . sort_unstable_by_key ( | ( id , _ ) | * id ) ;
let mut writer = KvWriterFieldId ::memory ( ) ;
document_entries
. into_iter ( )
. for_each ( | ( id , value ) | writer . insert ( id , value ) . unwrap ( ) ) ;
2024-08-30 10:03:54 +02:00
let new = writer . into_boxed ( ) ;
2024-08-29 14:08:31 +02:00
match current {
Some ( current ) = > {
2024-08-30 10:03:54 +02:00
let update = Update ::create ( docid , external_docid , current . boxed ( ) , new ) ;
2024-08-29 14:08:31 +02:00
Ok ( Some ( DocumentChange ::Update ( update ) ) )
}
None = > {
let insertion = Insertion ::create ( docid , external_docid , new ) ;
Ok ( Some ( DocumentChange ::Insertion ( insertion ) ) )
}
}
}
2024-08-30 10:03:54 +02:00
Some ( DocumentOperation ::Deletion ) = > match current {
Some ( current ) = > {
let deletion = Deletion ::create ( docid , external_docid , current . boxed ( ) ) ;
Ok ( Some ( DocumentChange ::Deletion ( deletion ) ) )
2024-08-29 14:08:31 +02:00
}
2024-08-30 10:03:54 +02:00
None = > Ok ( None ) ,
} ,
2024-08-29 14:08:31 +02:00
None = > Ok ( None ) ,
}
}
2024-08-28 18:45:16 +02:00
}