mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 05:14:27 +01:00
Add progress
This commit is contained in:
parent
a77d5ea8c1
commit
3658f57f93
@ -22,6 +22,7 @@ use std::ffi::OsStr;
|
|||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::BufWriter;
|
use std::io::BufWriter;
|
||||||
|
use std::sync::atomic::{self, AtomicU16, AtomicU32};
|
||||||
|
|
||||||
use bumpalo::collections::CollectIn;
|
use bumpalo::collections::CollectIn;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
@ -30,6 +31,7 @@ use meilisearch_types::error::Code;
|
|||||||
use meilisearch_types::heed::{RoTxn, RwTxn};
|
use meilisearch_types::heed::{RoTxn, RwTxn};
|
||||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey};
|
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey};
|
||||||
use meilisearch_types::milli::heed::CompactionOption;
|
use meilisearch_types::milli::heed::CompactionOption;
|
||||||
|
use meilisearch_types::milli::update::new::indexer::document_changes::Progress;
|
||||||
use meilisearch_types::milli::update::new::indexer::{
|
use meilisearch_types::milli::update::new::indexer::{
|
||||||
self, retrieve_or_guess_primary_key, UpdateByFunction,
|
self, retrieve_or_guess_primary_key, UpdateByFunction,
|
||||||
};
|
};
|
||||||
@ -1221,6 +1223,40 @@ impl IndexScheduler {
|
|||||||
) -> Result<Vec<Task>> {
|
) -> Result<Vec<Task>> {
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
|
|
||||||
|
let last_finished_steps = AtomicU16::new(0);
|
||||||
|
let last_finished_documents = AtomicU32::new(0);
|
||||||
|
|
||||||
|
let send_progress =
|
||||||
|
|Progress { finished_steps, total_steps, step_name, finished_total_documents }| {
|
||||||
|
/*
|
||||||
|
let current = rayon::current_thread_index();
|
||||||
|
|
||||||
|
let last_finished_steps =
|
||||||
|
last_finished_steps.fetch_max(finished_steps, atomic::Ordering::Relaxed);
|
||||||
|
|
||||||
|
if last_finished_steps > finished_steps {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some((finished_documents, total_documents)) = finished_total_documents {
|
||||||
|
if last_finished_steps < finished_steps {
|
||||||
|
last_finished_documents.store(finished_documents, atomic::Ordering::Relaxed);
|
||||||
|
} else {
|
||||||
|
let last_finished_documents = last_finished_documents
|
||||||
|
.fetch_max(finished_documents, atomic::Ordering::Relaxed);
|
||||||
|
if last_finished_documents > finished_documents {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tracing::warn!("Progress from {current:?}: {step_name} ({finished_steps}/{total_steps}), document {finished_documents}/{total_documents}")
|
||||||
|
} else {
|
||||||
|
tracing::warn!(
|
||||||
|
"Progress from {current:?}: {step_name} ({finished_steps}/{total_steps})"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
};
|
||||||
|
|
||||||
match operation {
|
match operation {
|
||||||
IndexOperation::DocumentClear { mut tasks, .. } => {
|
IndexOperation::DocumentClear { mut tasks, .. } => {
|
||||||
let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?;
|
let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?;
|
||||||
@ -1377,6 +1413,8 @@ impl IndexScheduler {
|
|||||||
&pool,
|
&pool,
|
||||||
&document_changes,
|
&document_changes,
|
||||||
embedders,
|
embedders,
|
||||||
|
&|| must_stop_processing.get(),
|
||||||
|
&send_progress,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
@ -1465,6 +1503,7 @@ impl IndexScheduler {
|
|||||||
let document_changes = indexer.into_changes(&primary_key)?;
|
let document_changes = indexer.into_changes(&primary_key)?;
|
||||||
let embedders = index.embedding_configs(index_wtxn)?;
|
let embedders = index.embedding_configs(index_wtxn)?;
|
||||||
let embedders = self.embedders(embedders)?;
|
let embedders = self.embedders(embedders)?;
|
||||||
|
let must_stop_processing = &self.must_stop_processing;
|
||||||
|
|
||||||
indexer::index(
|
indexer::index(
|
||||||
index_wtxn,
|
index_wtxn,
|
||||||
@ -1475,6 +1514,8 @@ impl IndexScheduler {
|
|||||||
&pool,
|
&pool,
|
||||||
&document_changes,
|
&document_changes,
|
||||||
embedders,
|
embedders,
|
||||||
|
&|| must_stop_processing.get(),
|
||||||
|
&send_progress,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
@ -1604,6 +1645,7 @@ impl IndexScheduler {
|
|||||||
let document_changes = indexer.into_changes(&indexer_alloc, primary_key);
|
let document_changes = indexer.into_changes(&indexer_alloc, primary_key);
|
||||||
let embedders = index.embedding_configs(index_wtxn)?;
|
let embedders = index.embedding_configs(index_wtxn)?;
|
||||||
let embedders = self.embedders(embedders)?;
|
let embedders = self.embedders(embedders)?;
|
||||||
|
let must_stop_processing = &self.must_stop_processing;
|
||||||
|
|
||||||
indexer::index(
|
indexer::index(
|
||||||
index_wtxn,
|
index_wtxn,
|
||||||
@ -1614,6 +1656,8 @@ impl IndexScheduler {
|
|||||||
&pool,
|
&pool,
|
||||||
&document_changes,
|
&document_changes,
|
||||||
embedders,
|
embedders,
|
||||||
|
&|| must_stop_processing.get(),
|
||||||
|
&send_progress,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
|
@ -16,8 +16,8 @@ use super::FacetKind;
|
|||||||
use crate::facet::value_encoding::f64_into_bytes;
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
use crate::update::new::extract::DocidsExtractor;
|
use crate::update::new::extract::DocidsExtractor;
|
||||||
use crate::update::new::indexer::document_changes::{
|
use crate::update::new::indexer::document_changes::{
|
||||||
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
|
||||||
IndexingContext, RefCellExt, ThreadLocal,
|
Progress, RefCellExt, ThreadLocal,
|
||||||
};
|
};
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
@ -250,12 +250,19 @@ fn truncate_str(s: &str) -> &str {
|
|||||||
|
|
||||||
impl DocidsExtractor for FacetedDocidsExtractor {
|
impl DocidsExtractor for FacetedDocidsExtractor {
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
||||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||||
grenad_parameters: GrenadParameters,
|
grenad_parameters: GrenadParameters,
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||||
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
finished_steps: u16,
|
||||||
|
total_steps: u16,
|
||||||
|
step_name: &'static str,
|
||||||
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>
|
||||||
|
where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
|
{
|
||||||
let max_memory = grenad_parameters.max_memory_by_thread();
|
let max_memory = grenad_parameters.max_memory_by_thread();
|
||||||
|
|
||||||
let index = indexing_context.index;
|
let index = indexing_context.index;
|
||||||
@ -276,12 +283,15 @@ impl DocidsExtractor for FacetedDocidsExtractor {
|
|||||||
grenad_parameters,
|
grenad_parameters,
|
||||||
max_memory,
|
max_memory,
|
||||||
};
|
};
|
||||||
for_each_document_change(
|
extract(
|
||||||
document_changes,
|
document_changes,
|
||||||
&extractor,
|
&extractor,
|
||||||
indexing_context,
|
indexing_context,
|
||||||
extractor_allocs,
|
extractor_allocs,
|
||||||
&datastore,
|
&datastore,
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
|
@ -13,17 +13,25 @@ use grenad::Merger;
|
|||||||
pub use searchable::*;
|
pub use searchable::*;
|
||||||
pub use vectors::EmbeddingExtractor;
|
pub use vectors::EmbeddingExtractor;
|
||||||
|
|
||||||
use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal};
|
use super::indexer::document_changes::{
|
||||||
|
DocumentChanges, FullySend, IndexingContext, Progress, ThreadLocal,
|
||||||
|
};
|
||||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
pub trait DocidsExtractor {
|
pub trait DocidsExtractor {
|
||||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||||
grenad_parameters: GrenadParameters,
|
grenad_parameters: GrenadParameters,
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||||
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>;
|
finished_steps: u16,
|
||||||
|
total_steps: u16,
|
||||||
|
step_name: &'static str,
|
||||||
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>
|
||||||
|
where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// TODO move in permissive json pointer
|
/// TODO move in permissive json pointer
|
||||||
|
@ -12,8 +12,8 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
|||||||
use crate::update::new::extract::cache::CboCachedSorter;
|
use crate::update::new::extract::cache::CboCachedSorter;
|
||||||
use crate::update::new::extract::perm_json_p::contained_in;
|
use crate::update::new::extract::perm_json_p::contained_in;
|
||||||
use crate::update::new::indexer::document_changes::{
|
use crate::update::new::indexer::document_changes::{
|
||||||
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
|
||||||
IndexingContext, RefCellExt, ThreadLocal,
|
Progress, RefCellExt, ThreadLocal,
|
||||||
};
|
};
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
@ -341,12 +341,19 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
|
|||||||
pub struct WordDocidsExtractors;
|
pub struct WordDocidsExtractors;
|
||||||
|
|
||||||
impl WordDocidsExtractors {
|
impl WordDocidsExtractors {
|
||||||
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||||
grenad_parameters: GrenadParameters,
|
grenad_parameters: GrenadParameters,
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||||
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
) -> Result<WordDocidsMergers> {
|
finished_steps: u16,
|
||||||
|
total_steps: u16,
|
||||||
|
step_name: &'static str,
|
||||||
|
) -> Result<WordDocidsMergers>
|
||||||
|
where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
|
{
|
||||||
let max_memory = grenad_parameters.max_memory_by_thread();
|
let max_memory = grenad_parameters.max_memory_by_thread();
|
||||||
let index = indexing_context.index;
|
let index = indexing_context.index;
|
||||||
|
|
||||||
@ -391,12 +398,15 @@ impl WordDocidsExtractors {
|
|||||||
max_memory,
|
max_memory,
|
||||||
};
|
};
|
||||||
|
|
||||||
for_each_document_change(
|
extract(
|
||||||
document_changes,
|
document_changes,
|
||||||
&extractor,
|
&extractor,
|
||||||
indexing_context,
|
indexing_context,
|
||||||
extractor_allocs,
|
extractor_allocs,
|
||||||
&datastore,
|
&datastore,
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,8 +17,8 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
|||||||
use super::cache::CboCachedSorter;
|
use super::cache::CboCachedSorter;
|
||||||
use super::DocidsExtractor;
|
use super::DocidsExtractor;
|
||||||
use crate::update::new::indexer::document_changes::{
|
use crate::update::new::indexer::document_changes::{
|
||||||
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
|
||||||
IndexingContext, ThreadLocal,
|
Progress, ThreadLocal,
|
||||||
};
|
};
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
@ -69,12 +69,19 @@ impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub trait SearchableExtractor: Sized + Sync {
|
pub trait SearchableExtractor: Sized + Sync {
|
||||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||||
grenad_parameters: GrenadParameters,
|
grenad_parameters: GrenadParameters,
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||||
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
finished_steps: u16,
|
||||||
|
total_steps: u16,
|
||||||
|
step_name: &'static str,
|
||||||
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>
|
||||||
|
where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
|
{
|
||||||
let max_memory = grenad_parameters.max_memory_by_thread();
|
let max_memory = grenad_parameters.max_memory_by_thread();
|
||||||
|
|
||||||
let rtxn = indexing_context.index.read_txn()?;
|
let rtxn = indexing_context.index.read_txn()?;
|
||||||
@ -118,12 +125,15 @@ pub trait SearchableExtractor: Sized + Sync {
|
|||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
for_each_document_change(
|
extract(
|
||||||
document_changes,
|
document_changes,
|
||||||
&extractor_data,
|
&extractor_data,
|
||||||
indexing_context,
|
indexing_context,
|
||||||
extractor_allocs,
|
extractor_allocs,
|
||||||
&datastore,
|
&datastore,
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@ -168,17 +178,27 @@ pub trait SearchableExtractor: Sized + Sync {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T: SearchableExtractor> DocidsExtractor for T {
|
impl<T: SearchableExtractor> DocidsExtractor for T {
|
||||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||||
grenad_parameters: GrenadParameters,
|
grenad_parameters: GrenadParameters,
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||||
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
|
finished_steps: u16,
|
||||||
|
total_steps: u16,
|
||||||
|
step_name: &'static str,
|
||||||
|
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>
|
||||||
|
where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
|
{
|
||||||
Self::run_extraction(
|
Self::run_extraction(
|
||||||
grenad_parameters,
|
grenad_parameters,
|
||||||
document_changes,
|
document_changes,
|
||||||
indexing_context,
|
indexing_context,
|
||||||
extractor_allocs,
|
extractor_allocs,
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,7 +9,7 @@ use rayon::iter::IndexedParallelIterator;
|
|||||||
use super::super::document_change::DocumentChange;
|
use super::super::document_change::DocumentChange;
|
||||||
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
|
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
|
||||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
|
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
|
||||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result};
|
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result};
|
||||||
|
|
||||||
pub trait RefCellExt<T: ?Sized> {
|
pub trait RefCellExt<T: ?Sized> {
|
||||||
fn try_borrow_or_yield(&self) -> std::result::Result<Ref<'_, T>, std::cell::BorrowError>;
|
fn try_borrow_or_yield(&self) -> std::result::Result<Ref<'_, T>, std::cell::BorrowError>;
|
||||||
@ -335,6 +335,12 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload
|
|||||||
|
|
||||||
fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>>;
|
fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>>;
|
||||||
|
|
||||||
|
fn len(&self) -> usize;
|
||||||
|
|
||||||
|
fn is_empty(&self) -> bool {
|
||||||
|
self.len() == 0
|
||||||
|
}
|
||||||
|
|
||||||
fn item_to_document_change<'doc, // lifetime of a single `process` call
|
fn item_to_document_change<'doc, // lifetime of a single `process` call
|
||||||
T: MostlySend>(
|
T: MostlySend>(
|
||||||
&'doc self,
|
&'doc self,
|
||||||
@ -344,22 +350,72 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload
|
|||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
|
||||||
pub struct IndexingContext<
|
pub struct IndexingContext<
|
||||||
'fid, // invariant lifetime of fields ids map
|
'fid, // invariant lifetime of fields ids map
|
||||||
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||||
'index, // covariant lifetime of the index
|
'index, // covariant lifetime of the index
|
||||||
> {
|
MSP,
|
||||||
|
SP,
|
||||||
|
> where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
|
{
|
||||||
pub index: &'index Index,
|
pub index: &'index Index,
|
||||||
pub db_fields_ids_map: &'indexer FieldsIdsMap,
|
pub db_fields_ids_map: &'indexer FieldsIdsMap,
|
||||||
pub new_fields_ids_map: &'fid RwLock<FieldIdMapWithMetadata>,
|
pub new_fields_ids_map: &'fid RwLock<FieldIdMapWithMetadata>,
|
||||||
pub doc_allocs: &'indexer ThreadLocal<FullySend<Cell<Bump>>>,
|
pub doc_allocs: &'indexer ThreadLocal<FullySend<Cell<Bump>>>,
|
||||||
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
|
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
|
||||||
|
pub must_stop_processing: &'indexer MSP,
|
||||||
|
pub send_progress: &'indexer SP,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<
|
||||||
|
'fid, // invariant lifetime of fields ids map
|
||||||
|
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||||
|
'index, // covariant lifetime of the index
|
||||||
|
MSP,
|
||||||
|
SP,
|
||||||
|
> Copy
|
||||||
|
for IndexingContext<
|
||||||
|
'fid, // invariant lifetime of fields ids map
|
||||||
|
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||||
|
'index, // covariant lifetime of the index
|
||||||
|
MSP,
|
||||||
|
SP,
|
||||||
|
>
|
||||||
|
where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<
|
||||||
|
'fid, // invariant lifetime of fields ids map
|
||||||
|
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||||
|
'index, // covariant lifetime of the index
|
||||||
|
MSP,
|
||||||
|
SP,
|
||||||
|
> Clone
|
||||||
|
for IndexingContext<
|
||||||
|
'fid, // invariant lifetime of fields ids map
|
||||||
|
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||||
|
'index, // covariant lifetime of the index
|
||||||
|
MSP,
|
||||||
|
SP,
|
||||||
|
>
|
||||||
|
where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
|
{
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
*self
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const CHUNK_SIZE: usize = 100;
|
const CHUNK_SIZE: usize = 100;
|
||||||
|
|
||||||
pub fn for_each_document_change<
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub fn extract<
|
||||||
'pl, // covariant lifetime of the underlying payload
|
'pl, // covariant lifetime of the underlying payload
|
||||||
'extractor, // invariant lifetime of extractor_alloc
|
'extractor, // invariant lifetime of extractor_alloc
|
||||||
'fid, // invariant lifetime of fields ids map
|
'fid, // invariant lifetime of fields ids map
|
||||||
@ -368,6 +424,8 @@ pub fn for_each_document_change<
|
|||||||
'index, // covariant lifetime of the index
|
'index, // covariant lifetime of the index
|
||||||
EX,
|
EX,
|
||||||
DC: DocumentChanges<'pl>,
|
DC: DocumentChanges<'pl>,
|
||||||
|
MSP,
|
||||||
|
SP,
|
||||||
>(
|
>(
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
extractor: &EX,
|
extractor: &EX,
|
||||||
@ -377,20 +435,29 @@ pub fn for_each_document_change<
|
|||||||
new_fields_ids_map,
|
new_fields_ids_map,
|
||||||
doc_allocs,
|
doc_allocs,
|
||||||
fields_ids_map_store,
|
fields_ids_map_store,
|
||||||
}: IndexingContext<'fid, 'indexer, 'index>,
|
must_stop_processing,
|
||||||
|
send_progress,
|
||||||
|
}: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
extractor_allocs: &'extractor mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
datastore: &'data ThreadLocal<EX::Data>,
|
datastore: &'data ThreadLocal<EX::Data>,
|
||||||
|
finished_steps: u16,
|
||||||
|
total_steps: u16,
|
||||||
|
step_name: &'static str,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
where
|
where
|
||||||
EX: Extractor<'extractor>,
|
EX: Extractor<'extractor>,
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
{
|
{
|
||||||
// Clean up and reuse the extractor allocs
|
// Clean up and reuse the extractor allocs
|
||||||
for extractor_alloc in extractor_allocs.iter_mut() {
|
for extractor_alloc in extractor_allocs.iter_mut() {
|
||||||
extractor_alloc.0.get_mut().reset();
|
extractor_alloc.0.get_mut().reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let total_documents = document_changes.len();
|
||||||
|
|
||||||
let pi = document_changes.iter(CHUNK_SIZE);
|
let pi = document_changes.iter(CHUNK_SIZE);
|
||||||
pi.try_arc_for_each_try_init(
|
pi.enumerate().try_arc_for_each_try_init(
|
||||||
|| {
|
|| {
|
||||||
DocumentChangeContext::new(
|
DocumentChangeContext::new(
|
||||||
index,
|
index,
|
||||||
@ -403,7 +470,19 @@ where
|
|||||||
move |index_alloc| extractor.init_data(index_alloc),
|
move |index_alloc| extractor.init_data(index_alloc),
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
|context, items| {
|
|context, (finished_documents, items)| {
|
||||||
|
if (must_stop_processing)() {
|
||||||
|
return Err(Arc::new(InternalError::AbortedIndexation.into()));
|
||||||
|
}
|
||||||
|
let finished_documents = finished_documents * CHUNK_SIZE;
|
||||||
|
|
||||||
|
(send_progress)(Progress {
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
|
finished_total_documents: Some((finished_documents as u32, total_documents as u32)),
|
||||||
|
});
|
||||||
|
|
||||||
// Clean up and reuse the document-specific allocator
|
// Clean up and reuse the document-specific allocator
|
||||||
context.doc_alloc.reset();
|
context.doc_alloc.reset();
|
||||||
|
|
||||||
@ -419,5 +498,21 @@ where
|
|||||||
|
|
||||||
res
|
res
|
||||||
},
|
},
|
||||||
)
|
)?;
|
||||||
|
|
||||||
|
(send_progress)(Progress {
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
|
finished_total_documents: Some((total_documents as u32, total_documents as u32)),
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct Progress {
|
||||||
|
pub finished_steps: u16,
|
||||||
|
pub total_steps: u16,
|
||||||
|
pub step_name: &'static str,
|
||||||
|
pub finished_total_documents: Option<(u32, u32)>,
|
||||||
}
|
}
|
||||||
|
@ -75,6 +75,10 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
|||||||
|
|
||||||
Ok(Some(DocumentChange::Deletion(Deletion::create(*docid, external_document_id))))
|
Ok(Some(DocumentChange::Deletion(Deletion::create(*docid, external_document_id))))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn len(&self) -> usize {
|
||||||
|
self.to_delete.len()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@ -89,8 +93,7 @@ mod test {
|
|||||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::update::new::indexer::document_changes::{
|
use crate::update::new::indexer::document_changes::{
|
||||||
for_each_document_change, DocumentChangeContext, Extractor, IndexingContext, MostlySend,
|
extract, DocumentChangeContext, Extractor, IndexingContext, MostlySend, ThreadLocal,
|
||||||
ThreadLocal,
|
|
||||||
};
|
};
|
||||||
use crate::update::new::indexer::DocumentDeletion;
|
use crate::update::new::indexer::DocumentDeletion;
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
@ -165,17 +168,22 @@ mod test {
|
|||||||
new_fields_ids_map: &fields_ids_map,
|
new_fields_ids_map: &fields_ids_map,
|
||||||
doc_allocs: &doc_allocs,
|
doc_allocs: &doc_allocs,
|
||||||
fields_ids_map_store: &fields_ids_map_store,
|
fields_ids_map_store: &fields_ids_map_store,
|
||||||
|
must_stop_processing: &(|| false),
|
||||||
|
send_progress: &(|_progress| {}),
|
||||||
};
|
};
|
||||||
|
|
||||||
for _ in 0..3 {
|
for _ in 0..3 {
|
||||||
let datastore = ThreadLocal::new();
|
let datastore = ThreadLocal::new();
|
||||||
|
|
||||||
for_each_document_change(
|
extract(
|
||||||
&changes,
|
&changes,
|
||||||
&deletion_tracker,
|
&deletion_tracker,
|
||||||
context,
|
context,
|
||||||
&mut extractor_allocs,
|
&mut extractor_allocs,
|
||||||
&datastore,
|
&datastore,
|
||||||
|
0,
|
||||||
|
1,
|
||||||
|
"test",
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
@ -241,6 +241,10 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {
|
|||||||
)?;
|
)?;
|
||||||
Ok(change)
|
Ok(change)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn len(&self) -> usize {
|
||||||
|
self.docids_version_offsets.len()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
trait MergeChanges {
|
trait MergeChanges {
|
||||||
|
@ -5,8 +5,8 @@ use std::thread::{self, Builder};
|
|||||||
use big_s::S;
|
use big_s::S;
|
||||||
use bumpalo::Bump;
|
use bumpalo::Bump;
|
||||||
use document_changes::{
|
use document_changes::{
|
||||||
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
|
||||||
IndexingContext, RefCellExt, ThreadLocal,
|
Progress, RefCellExt, ThreadLocal,
|
||||||
};
|
};
|
||||||
pub use document_deletion::DocumentDeletion;
|
pub use document_deletion::DocumentDeletion;
|
||||||
pub use document_operation::DocumentOperation;
|
pub use document_operation::DocumentOperation;
|
||||||
@ -72,7 +72,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
|
|||||||
context: &DocumentChangeContext<Self::Data>,
|
context: &DocumentChangeContext<Self::Data>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut document_buffer = Vec::new();
|
let mut document_buffer = Vec::new();
|
||||||
let mut field_distribution_delta = context.data.0.borrow_mut();
|
let mut field_distribution_delta = context.data.0.borrow_mut_or_yield();
|
||||||
|
|
||||||
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||||
|
|
||||||
@ -155,13 +155,70 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mod steps {
|
||||||
|
pub const STEPS: &[&str] = &[
|
||||||
|
"extracting documents",
|
||||||
|
"extracting facets",
|
||||||
|
"extracting words",
|
||||||
|
"extracting word proximity",
|
||||||
|
"extracting embeddings",
|
||||||
|
"writing to database",
|
||||||
|
"post-processing facets",
|
||||||
|
"post-processing words",
|
||||||
|
"finalizing",
|
||||||
|
];
|
||||||
|
|
||||||
|
const fn step(step: u16) -> (u16, &'static str) {
|
||||||
|
(step, STEPS[step as usize])
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn total_steps() -> u16 {
|
||||||
|
STEPS.len() as u16
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn extract_documents() -> (u16, &'static str) {
|
||||||
|
step(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn extract_facets() -> (u16, &'static str) {
|
||||||
|
step(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn extract_words() -> (u16, &'static str) {
|
||||||
|
step(2)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn extract_word_proximity() -> (u16, &'static str) {
|
||||||
|
step(3)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn extract_embeddings() -> (u16, &'static str) {
|
||||||
|
step(4)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn write_db() -> (u16, &'static str) {
|
||||||
|
step(5)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn post_processing_facets() -> (u16, &'static str) {
|
||||||
|
step(6)
|
||||||
|
}
|
||||||
|
pub const fn post_processing_words() -> (u16, &'static str) {
|
||||||
|
step(7)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub const fn finalizing() -> (u16, &'static str) {
|
||||||
|
step(8)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// This is the main function of this crate.
|
/// This is the main function of this crate.
|
||||||
///
|
///
|
||||||
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
||||||
///
|
///
|
||||||
/// TODO return stats
|
/// TODO return stats
|
||||||
#[allow(clippy::too_many_arguments)] // clippy: 😝
|
#[allow(clippy::too_many_arguments)] // clippy: 😝
|
||||||
pub fn index<'pl, 'indexer, 'index, DC>(
|
pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>(
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index: &'index Index,
|
index: &'index Index,
|
||||||
db_fields_ids_map: &'indexer FieldsIdsMap,
|
db_fields_ids_map: &'indexer FieldsIdsMap,
|
||||||
@ -170,9 +227,13 @@ pub fn index<'pl, 'indexer, 'index, DC>(
|
|||||||
pool: &ThreadPool,
|
pool: &ThreadPool,
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
embedders: EmbeddingConfigs,
|
embedders: EmbeddingConfigs,
|
||||||
|
must_stop_processing: &'indexer MSP,
|
||||||
|
send_progress: &'indexer SP,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
where
|
where
|
||||||
DC: DocumentChanges<'pl>,
|
DC: DocumentChanges<'pl>,
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
{
|
{
|
||||||
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
|
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
|
||||||
// This channel acts as a rendezvous point to ensure that we are one task ahead
|
// This channel acts as a rendezvous point to ensure that we are one task ahead
|
||||||
@ -194,8 +255,12 @@ where
|
|||||||
new_fields_ids_map: &new_fields_ids_map,
|
new_fields_ids_map: &new_fields_ids_map,
|
||||||
doc_allocs: &doc_allocs,
|
doc_allocs: &doc_allocs,
|
||||||
fields_ids_map_store: &fields_ids_map_store,
|
fields_ids_map_store: &fields_ids_map_store,
|
||||||
|
must_stop_processing,
|
||||||
|
send_progress,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let total_steps = steps::total_steps();
|
||||||
|
|
||||||
let mut field_distribution = index.field_distribution(wtxn)?;
|
let mut field_distribution = index.field_distribution(wtxn)?;
|
||||||
|
|
||||||
thread::scope(|s| -> Result<()> {
|
thread::scope(|s| -> Result<()> {
|
||||||
@ -213,7 +278,8 @@ where
|
|||||||
let document_sender = extractor_sender.document_sender();
|
let document_sender = extractor_sender.document_sender();
|
||||||
let document_extractor = DocumentExtractor { document_sender: &document_sender, embedders };
|
let document_extractor = DocumentExtractor { document_sender: &document_sender, embedders };
|
||||||
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
||||||
for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?;
|
let (finished_steps, step_name) = steps::extract_documents();
|
||||||
|
extract(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?;
|
||||||
|
|
||||||
for field_distribution_delta in datastore {
|
for field_distribution_delta in datastore {
|
||||||
let field_distribution_delta = field_distribution_delta.0.into_inner();
|
let field_distribution_delta = field_distribution_delta.0.into_inner();
|
||||||
@ -238,22 +304,29 @@ where
|
|||||||
{
|
{
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
let (finished_steps, step_name) = steps::extract_facets();
|
||||||
extract_and_send_docids::<
|
extract_and_send_docids::<
|
||||||
_,
|
_,
|
||||||
FacetedDocidsExtractor,
|
FacetedDocidsExtractor,
|
||||||
FacetDocids,
|
FacetDocids,
|
||||||
|
_,
|
||||||
|
_
|
||||||
>(
|
>(
|
||||||
grenad_parameters,
|
grenad_parameters,
|
||||||
document_changes,
|
document_changes,
|
||||||
indexing_context,
|
indexing_context,
|
||||||
&mut extractor_allocs,
|
&mut extractor_allocs,
|
||||||
&extractor_sender,
|
&extractor_sender,
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
let (finished_steps, step_name) = steps::extract_words();
|
||||||
|
|
||||||
let WordDocidsMergers {
|
let WordDocidsMergers {
|
||||||
word_fid_docids,
|
word_fid_docids,
|
||||||
@ -261,7 +334,7 @@ where
|
|||||||
exact_word_docids,
|
exact_word_docids,
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
fid_word_count_docids,
|
fid_word_count_docids,
|
||||||
} = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
|
} = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, finished_steps, total_steps, step_name)?;
|
||||||
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
|
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
|
||||||
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
|
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
|
||||||
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
|
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
|
||||||
@ -276,16 +349,24 @@ where
|
|||||||
if proximity_precision == ProximityPrecision::ByWord {
|
if proximity_precision == ProximityPrecision::ByWord {
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
let (finished_steps, step_name) = steps::extract_word_proximity();
|
||||||
|
|
||||||
|
|
||||||
extract_and_send_docids::<
|
extract_and_send_docids::<
|
||||||
_,
|
_,
|
||||||
WordPairProximityDocidsExtractor,
|
WordPairProximityDocidsExtractor,
|
||||||
WordPairProximityDocids,
|
WordPairProximityDocids,
|
||||||
|
_,
|
||||||
|
_
|
||||||
>(
|
>(
|
||||||
grenad_parameters,
|
grenad_parameters,
|
||||||
document_changes,
|
document_changes,
|
||||||
indexing_context,
|
indexing_context,
|
||||||
&mut extractor_allocs,
|
&mut extractor_allocs,
|
||||||
&extractor_sender,
|
&extractor_sender,
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -301,8 +382,10 @@ where
|
|||||||
let embedding_sender = todo!();
|
let embedding_sender = todo!();
|
||||||
let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, &field_distribution, request_threads());
|
let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, &field_distribution, request_threads());
|
||||||
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
||||||
|
let (finished_steps, step_name) = steps::extract_embeddings();
|
||||||
|
|
||||||
for_each_document_change(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore)?;
|
|
||||||
|
extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, finished_steps, total_steps, step_name)?;
|
||||||
|
|
||||||
|
|
||||||
let mut user_provided = HashMap::new();
|
let mut user_provided = HashMap::new();
|
||||||
@ -325,6 +408,8 @@ where
|
|||||||
{
|
{
|
||||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
|
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
let (finished_steps, step_name) = steps::write_db();
|
||||||
|
(indexing_context.send_progress)(Progress { finished_steps, total_steps, step_name, finished_total_documents: None });
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO THIS IS TOO MUCH
|
// TODO THIS IS TOO MUCH
|
||||||
@ -501,15 +586,38 @@ where
|
|||||||
/// TODO handle the panicking threads
|
/// TODO handle the panicking threads
|
||||||
handle.join().unwrap()?;
|
handle.join().unwrap()?;
|
||||||
let merger_result = merger_thread.join().unwrap()?;
|
let merger_result = merger_thread.join().unwrap()?;
|
||||||
|
let (finished_steps, step_name) = steps::post_processing_facets();
|
||||||
|
(indexing_context.send_progress)(Progress {
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
|
finished_total_documents: None,
|
||||||
|
});
|
||||||
|
|
||||||
if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta {
|
if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta {
|
||||||
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
|
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let (finished_steps, step_name) = steps::post_processing_words();
|
||||||
|
(indexing_context.send_progress)(Progress {
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
|
finished_total_documents: None,
|
||||||
|
});
|
||||||
|
|
||||||
if let Some(prefix_delta) = merger_result.prefix_delta {
|
if let Some(prefix_delta) = merger_result.prefix_delta {
|
||||||
compute_prefix_database(index, wtxn, prefix_delta)?;
|
compute_prefix_database(index, wtxn, prefix_delta)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let (finished_steps, step_name) = steps::finalizing();
|
||||||
|
(indexing_context.send_progress)(Progress {
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
|
finished_total_documents: None,
|
||||||
|
});
|
||||||
|
|
||||||
Ok(()) as Result<_>
|
Ok(()) as Result<_>
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
@ -585,6 +693,7 @@ fn compute_facet_level_database(
|
|||||||
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
|
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
|
||||||
/// TODO: manage the errors correctly
|
/// TODO: manage the errors correctly
|
||||||
/// TODO: we must have a single trait that also gives the extractor type
|
/// TODO: we must have a single trait that also gives the extractor type
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
fn extract_and_send_docids<
|
fn extract_and_send_docids<
|
||||||
'pl,
|
'pl,
|
||||||
'fid,
|
'fid,
|
||||||
@ -593,15 +702,31 @@ fn extract_and_send_docids<
|
|||||||
DC: DocumentChanges<'pl>,
|
DC: DocumentChanges<'pl>,
|
||||||
E: DocidsExtractor,
|
E: DocidsExtractor,
|
||||||
D: MergerOperationType,
|
D: MergerOperationType,
|
||||||
|
MSP,
|
||||||
|
SP,
|
||||||
>(
|
>(
|
||||||
grenad_parameters: GrenadParameters,
|
grenad_parameters: GrenadParameters,
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||||
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||||
sender: &ExtractorSender,
|
sender: &ExtractorSender,
|
||||||
) -> Result<()> {
|
finished_steps: u16,
|
||||||
let merger =
|
total_steps: u16,
|
||||||
E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?;
|
step_name: &'static str,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
MSP: Fn() -> bool + Sync,
|
||||||
|
SP: Fn(Progress) + Sync,
|
||||||
|
{
|
||||||
|
let merger = E::run_extraction(
|
||||||
|
grenad_parameters,
|
||||||
|
document_changes,
|
||||||
|
indexing_context,
|
||||||
|
extractor_allocs,
|
||||||
|
finished_steps,
|
||||||
|
total_steps,
|
||||||
|
step_name,
|
||||||
|
)?;
|
||||||
sender.send_searchable::<D>(merger).unwrap();
|
sender.send_searchable::<D>(merger).unwrap();
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -79,4 +79,8 @@ where
|
|||||||
let insertion = Insertion::create(docid, external_document_id, Versions::single(document));
|
let insertion = Insertion::create(docid, external_document_id, Versions::single(document));
|
||||||
Ok(Some(DocumentChange::Insertion(insertion)))
|
Ok(Some(DocumentChange::Insertion(insertion)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn len(&self) -> usize {
|
||||||
|
self.iter.len()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,7 +9,7 @@ use super::DocumentChanges;
|
|||||||
use crate::documents::Error::InvalidDocumentFormat;
|
use crate::documents::Error::InvalidDocumentFormat;
|
||||||
use crate::documents::PrimaryKey;
|
use crate::documents::PrimaryKey;
|
||||||
use crate::error::{FieldIdMapMissingEntry, InternalError};
|
use crate::error::{FieldIdMapMissingEntry, InternalError};
|
||||||
use crate::update::new::document::{DocumentFromVersions, Versions};
|
use crate::update::new::document::Versions;
|
||||||
use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update};
|
use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update};
|
||||||
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
|
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
|
||||||
|
|
||||||
@ -176,6 +176,10 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn len(&self) -> usize {
|
||||||
|
self.documents.len()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn obkv_to_rhaimap(obkv: &KvReaderFieldId, fields_ids_map: &FieldsIdsMap) -> Result<rhai::Map> {
|
fn obkv_to_rhaimap(obkv: &KvReaderFieldId, fields_ids_map: &FieldsIdsMap) -> Result<rhai::Map> {
|
||||||
|
Loading…
Reference in New Issue
Block a user