Post processing of the merge

This commit is contained in:
Louis Dureuil 2024-11-06 17:50:12 +01:00
parent ee03743355
commit 10f49f0d75
No known key found for this signature in database
12 changed files with 512 additions and 996 deletions

View file

@ -1,73 +1,140 @@
use std::cell::RefCell;
use bumpalo::Bump;
use hashbrown::HashMap;
use super::DelAddRoaringBitmap;
use crate::update::new::channel::DocumentsSender;
use crate::update::new::document::write_to_obkv;
use crate::update::new::document::{write_to_obkv, Document as _};
use crate::update::new::indexer::document_changes::{
DocumentChangeContext, Extractor, FullySend, RefCellExt as _,
};
use crate::update::new::DocumentChange;
use crate::vector::EmbeddingConfigs;
use crate::Result;
pub struct DocumentsExtractor<'a> {
documents_sender: &'a DocumentsSender<'a>,
document_sender: &'a DocumentsSender<'a>,
embedders: &'a EmbeddingConfigs,
}
impl<'a> DocumentsExtractor<'a> {
pub fn new(documents_sender: &'a DocumentsSender<'a>) -> Self {
Self { documents_sender }
pub fn new(document_sender: &'a DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self {
Self { document_sender, embedders }
}
}
#[derive(Default)]
pub struct DocumentExtractorData {
pub docids_delta: DelAddRoaringBitmap,
pub field_distribution_delta: HashMap<String, i64>,
}
impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
type Data = FullySend<RefCell<DelAddRoaringBitmap>>;
type Data = FullySend<RefCell<DocumentExtractorData>>;
fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
Ok(FullySend(RefCell::new(DelAddRoaringBitmap::empty())))
Ok(FullySend(Default::default()))
}
fn process(
fn process<'doc>(
&self,
change: DocumentChange,
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
context: &DocumentChangeContext<Self::Data>,
) -> Result<()> {
let mut document_buffer = Vec::new();
let mut delta_documents_ids = context.data.0.borrow_mut_or_yield();
let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc);
let mut document_extractor_data = context.data.0.borrow_mut_or_yield();
let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield();
let new_fields_ids_map = &*new_fields_ids_map;
let new_fields_ids_map = new_fields_ids_map.local_map();
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
let external_docid = change.external_docid().to_owned();
for change in changes {
let change = change?;
let external_docid = change.external_docid().to_owned();
// document but we need to create a function that collects and compresses documents.
match change {
DocumentChange::Deletion(deletion) => {
let docid = deletion.docid();
self.documents_sender.delete(docid, external_docid).unwrap();
delta_documents_ids.insert_del_u32(docid);
}
/// TODO: change NONE by SOME(vector) when implemented
DocumentChange::Update(update) => {
let docid = update.docid();
let content =
update.new(&context.txn, context.index, &context.db_fields_ids_map)?;
let content =
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
}
DocumentChange::Insertion(insertion) => {
let docid = insertion.docid();
let content = insertion.new();
let content =
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
delta_documents_ids.insert_add_u32(docid);
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
// document but we need to create a function that collects and compresses documents.
match change {
DocumentChange::Deletion(deletion) => {
let docid = deletion.docid();
let content = deletion.current(
&context.txn,
context.index,
&context.db_fields_ids_map,
)?;
for res in content.iter_top_level_fields() {
let (f, _) = res?;
let entry = document_extractor_data
.field_distribution_delta
.entry_ref(f)
.or_default();
*entry -= 1;
}
document_extractor_data.docids_delta.insert_del_u32(docid);
self.document_sender.delete(docid, external_docid).unwrap();
}
DocumentChange::Update(update) => {
let docid = update.docid();
let content =
update.current(&context.txn, context.index, &context.db_fields_ids_map)?;
for res in content.iter_top_level_fields() {
let (f, _) = res?;
let entry = document_extractor_data
.field_distribution_delta
.entry_ref(f)
.or_default();
*entry -= 1;
}
let content = update.updated();
for res in content.iter_top_level_fields() {
let (f, _) = res?;
let entry = document_extractor_data
.field_distribution_delta
.entry_ref(f)
.or_default();
*entry += 1;
}
let content =
update.merged(&context.txn, context.index, &context.db_fields_ids_map)?;
let vector_content = update.merged_vectors(
&context.txn,
context.index,
&context.db_fields_ids_map,
&context.doc_alloc,
self.embedders,
)?;
let content = write_to_obkv(
&content,
vector_content.as_ref(),
&mut new_fields_ids_map,
&mut document_buffer,
)?;
self.document_sender.uncompressed(docid, external_docid, content).unwrap();
}
DocumentChange::Insertion(insertion) => {
let docid = insertion.docid();
let content = insertion.inserted();
for res in content.iter_top_level_fields() {
let (f, _) = res?;
let entry = document_extractor_data
.field_distribution_delta
.entry_ref(f)
.or_default();
*entry += 1;
}
let inserted_vectors =
insertion.inserted_vectors(&context.doc_alloc, self.embedders)?;
let content = write_to_obkv(
&content,
inserted_vectors.as_ref(),
&mut new_fields_ids_map,
&mut document_buffer,
)?;
document_extractor_data.docids_delta.insert_add_u32(docid);
self.document_sender.uncompressed(docid, external_docid, content).unwrap();
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
}
}
}
Ok(())
}
}

View file

@ -228,7 +228,7 @@ impl DocidsExtractor for FacetedDocidsExtractor {
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &mut ThreadLocal<FullySend<Bump>>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
finished_steps: u16,
total_steps: u16,
step_name: &'static str,

View file

@ -14,7 +14,7 @@ pub use vectors::EmbeddingExtractor;
use super::indexer::document_changes::{
DocumentChanges, FullySend, IndexingContext, Progress, ThreadLocal,
};
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
use crate::update::GrenadParameters;
use crate::Result;
pub trait DocidsExtractor {
@ -26,7 +26,7 @@ pub trait DocidsExtractor {
finished_steps: u16,
total_steps: u16,
step_name: &'static str,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync;

View file

@ -11,8 +11,8 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::extract::perm_json_p::contained_in;
use crate::update::new::indexer::document_changes::{
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
IndexingContext, MostlySend, RefCellExt, ThreadLocal,
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
MostlySend, Progress, RefCellExt, ThreadLocal,
};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
@ -218,24 +218,44 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
))))
}
fn process(
fn process<'doc>(
&self,
change: DocumentChange,
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
context: &DocumentChangeContext<Self::Data>,
) -> Result<()> {
WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)
for change in changes {
let change = change?;
WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?;
}
Ok(())
}
}
pub struct WordDocidsExtractors;
impl WordDocidsExtractors {
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
pub fn run_extraction<
'pl,
'fid,
'indexer,
'index,
'extractor,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
) -> Result<WordDocidsCaches<'extractor>> {
finished_steps: u16,
total_steps: u16,
step_name: &'static str,
) -> Result<WordDocidsCaches<'extractor>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let index = indexing_context.index;
let rtxn = index.read_txn()?;
@ -279,12 +299,15 @@ impl WordDocidsExtractors {
buckets: rayon::current_num_threads(),
};
for_each_document_change(
extract(
document_changes,
&extractor,
indexing_context,
extractor_allocs,
&datastore,
finished_steps,
total_steps,
step_name,
)?;
}
@ -358,7 +381,7 @@ impl WordDocidsExtractors {
)
};
document_tokenizer.tokenize_document(
inner.new(rtxn, index, context.db_fields_ids_map)?,
inner.merged(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map,
&mut token_fn,
)?;
@ -375,7 +398,7 @@ impl WordDocidsExtractors {
)
};
document_tokenizer.tokenize_document(
inner.new(),
inner.inserted(),
new_fields_ids_map,
&mut token_fn,
)?;

View file

@ -8,7 +8,7 @@ use super::cache::DelAddRoaringBitmap;
use crate::error::FaultSource;
use crate::prompt::Prompt;
use crate::update::new::channel::EmbeddingSender;
use crate::update::new::indexer::document_changes::{Extractor, FullySend};
use crate::update::new::indexer::document_changes::{Extractor, MostlySend};
use crate::update::new::vector_document::VectorDocument;
use crate::update::new::DocumentChange;
use crate::vector::error::{
@ -36,15 +36,17 @@ impl<'a> EmbeddingExtractor<'a> {
}
}
impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
type Data = FullySend<RefCell<HashMap<String, DelAddRoaringBitmap>>>;
pub struct EmbeddingExtractorData<'extractor>(
pub HashMap<String, DelAddRoaringBitmap, hashbrown::DefaultHashBuilder, &'extractor Bump>,
);
fn init_data<'doc>(
&'doc self,
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
) -> crate::Result<Self::Data> {
/// TODO: use the extractor_alloc in the hashbrown once you merge the branch where it is no longer a RefBump
Ok(FullySend(Default::default()))
unsafe impl MostlySend for EmbeddingExtractorData<'_> {}
impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
type Data = RefCell<EmbeddingExtractorData<'extractor>>;
fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result<Self::Data> {
Ok(RefCell::new(EmbeddingExtractorData(HashMap::new_in(extractor_alloc))))
}
fn process<'doc>(
@ -72,7 +74,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
embedder_id,
embedder_name,
prompt,
&context.data.0,
context.data,
&self.possible_embedding_mistakes,
self.threads,
self.sender,
@ -252,7 +254,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
// Currently this is the case as:
// 1. BVec are inside of the bumaplo
// 2. All other fields are either trivial (u8) or references.
struct Chunks<'a> {
struct Chunks<'a, 'extractor> {
texts: BVec<'a, &'a str>,
ids: BVec<'a, DocumentId>,
@ -261,19 +263,19 @@ struct Chunks<'a> {
embedder_name: &'a str,
prompt: &'a Prompt,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
threads: &'a ThreadPoolNoAbort,
sender: &'a EmbeddingSender<'a>,
}
impl<'a> Chunks<'a> {
impl<'a, 'extractor> Chunks<'a, 'extractor> {
#[allow(clippy::too_many_arguments)]
pub fn new(
embedder: &'a Embedder,
embedder_id: u8,
embedder_name: &'a str,
prompt: &'a Prompt,
user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
threads: &'a ThreadPoolNoAbort,
sender: &'a EmbeddingSender<'a>,
@ -417,7 +419,7 @@ impl<'a> Chunks<'a> {
fn set_regenerate(&self, docid: DocumentId, regenerate: bool) {
let mut user_provided = self.user_provided.borrow_mut();
let user_provided = user_provided.entry_ref(self.embedder_name).or_default();
let user_provided = user_provided.0.entry_ref(self.embedder_name).or_default();
if regenerate {
// regenerate == !user_provided
user_provided.del.get_or_insert(Default::default()).insert(docid);