Mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-07-04 20:37:15 +02:00
Introduce indexer level bumpalo
parent 39b27e42be
commit 68a2502388
26 changed files with 1984 additions and 721 deletions
@@ -39,6 +39,7 @@ time = { version = "0.3.36", features = [
 tracing = "0.1.40"
 ureq = "2.10.0"
 uuid = { version = "1.10.0", features = ["serde", "v4"] }
+bumpalo = "3.16.0"

 [dev-dependencies]
 arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }

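For context, a minimal standalone sketch of what the new bumpalo dependency provides (illustrative only, not code from this commit): a bump arena hands out allocations by advancing a pointer and frees everything at once when dropped, which suits the many short-lived allocations of an indexing run.

use bumpalo::Bump;

fn main() {
    // One arena: each allocation is a pointer bump, and the whole region is
    // freed in one shot when `bump` goes out of scope (no per-value drops).
    let bump = Bump::new();

    // Values allocated in the arena borrow from it.
    let name: &str = bump.alloc_str("movies");
    let ids: &mut [u32; 3] = bump.alloc([1, 2, 3]);
    ids[0] = 42;

    println!("{name}: {ids:?} ({} bytes allocated)", bump.allocated_bytes());
} // arena memory released here
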
@@ -23,14 +23,15 @@ use std::fmt;
 use std::fs::{self, File};
 use std::io::BufWriter;

+use bumpalo::collections::CollectIn;
+use bumpalo::Bump;
 use dump::IndexMetadata;
 use meilisearch_types::error::Code;
 use meilisearch_types::heed::{RoTxn, RwTxn};
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
 use meilisearch_types::milli::heed::CompactionOption;
-use meilisearch_types::milli::update::new::indexer::{
-    self, retrieve_or_guess_primary_key, DocumentChanges,
-};
+use meilisearch_types::milli::update::new::indexer::document_changes::DocumentChanges;
+use meilisearch_types::milli::update::new::indexer::{self, retrieve_or_guess_primary_key};
 use meilisearch_types::milli::update::{
     IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
 };

@@ -1219,6 +1220,8 @@ impl IndexScheduler {
         index: &'i Index,
         operation: IndexOperation,
     ) -> Result<Vec<Task>> {
+        let indexer_alloc = Bump::new();
+
         match operation {
             IndexOperation::DocumentClear { mut tasks, .. } => {
                 let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?;

@@ -1252,6 +1255,9 @@ impl IndexScheduler {
                 let mut primary_key_has_been_set = false;
                 let must_stop_processing = self.must_stop_processing.clone();
                 let indexer_config = self.index_mapper.indexer_config();
+                // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches.
+                // this is made difficult by the fact we're doing private clones of the index scheduler and sending it
+                // to a fresh thread.

                 /// TODO manage errors correctly
                 let rtxn = index.read_txn()?;

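The TODO above mentions reusing the arena across successive batches. A sketch of that pattern under a single-threaded assumption (hypothetical helper, not part of this commit): `Bump::reset` discards every allocation in O(1) while keeping the arena's largest chunk, so the next batch starts with warm capacity. As the comment notes, the scheduler is privately cloned and sent to a fresh thread per batch, which is what makes holding one long-lived `Bump` awkward in practice.

use bumpalo::Bump;

// Hypothetical single-threaded reuse of one arena across batches.
fn process_batches(batches: &[Vec<String>]) {
    let mut arena = Bump::new();
    for batch in batches {
        for doc in batch {
            // Per-batch scratch allocations land in the arena.
            let _scratch: &str = arena.alloc_str(doc);
        }
        // Free this batch's allocations in O(1) and keep the capacity.
        arena.reset();
    }
}

fn main() {
    process_batches(&[vec!["a".into()], vec!["b".into(), "c".into()]]);
}
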
@@ -1274,7 +1280,9 @@ impl IndexScheduler {
                     }
                 }

-                let mut fields_ids_map = index.fields_ids_map(&rtxn)?;
+                let db_fields_ids_map = index.fields_ids_map(&rtxn)?;
+                let mut new_fields_ids_map = db_fields_ids_map.clone();

                 let first_document = match content_files.first() {
                     Some(mmap) => {
                         let mut iter = serde_json::Deserializer::from_slice(mmap).into_iter();

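This hunk replaces the single mutable `fields_ids_map` with a read-only snapshot (`db_fields_ids_map`) plus a mutable clone (`new_fields_ids_map`) that document processing may extend; comparing the two afterwards shows which fields the batch introduced. A toy illustration of that snapshot-and-clone pattern, using a plain `HashMap` as a stand-in for milli's `FieldsIdsMap`:

use std::collections::HashMap;

fn main() {
    // Snapshot of the mapping already persisted in the index.
    let db_fields_ids_map: HashMap<&str, u16> = HashMap::from([("id", 0), ("title", 1)]);
    // Mutable clone that this batch's documents are allowed to extend.
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    // A document introduces an unseen field: register it in the clone only.
    let next_id = new_fields_ids_map.len() as u16;
    new_fields_ids_map.entry("overview").or_insert(next_id);

    // The delta between the two maps is what must be written back.
    let added: Vec<_> = new_fields_ids_map
        .keys()
        .filter(|k| !db_fields_ids_map.contains_key(*k))
        .collect();
    println!("fields added by this batch: {added:?}");
}
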
@@ -1286,7 +1294,7 @@ impl IndexScheduler {
                 let primary_key = retrieve_or_guess_primary_key(
                     &rtxn,
                     index,
-                    &mut fields_ids_map,
+                    &mut new_fields_ids_map,
                     first_document.as_ref(),
                 )?
                 .unwrap();

@@ -1320,7 +1328,11 @@ impl IndexScheduler {
                         }
                         DocumentOperation::Delete(document_ids) => {
                             let count = document_ids.len();
-                            indexer.delete_documents(document_ids);
+                            let document_ids: bumpalo::collections::vec::Vec<_> = document_ids
+                                .iter()
+                                .map(|s| &*indexer_alloc.alloc_str(s))
+                                .collect_in(&indexer_alloc);
+                            indexer.delete_documents(document_ids.into_bump_slice());
                             // Uses Invariant: remove documents actually always returns Ok for the inner result
                             // let count = user_result.unwrap();
                             let provided_ids =

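A standalone sketch of the allocation pattern above (assuming bumpalo's `collections` feature is enabled): each owned id is copied into the arena with `alloc_str`, the resulting `&str`s are gathered into a bump-backed `Vec` via `CollectIn`, and `into_bump_slice` yields a plain `&[&str]` that lives as long as the arena, so the indexer never has to own the strings.

use bumpalo::collections::{CollectIn, Vec as BumpVec};
use bumpalo::Bump;

// Copy owned ids into the arena so callers can hold `&'b [&'b str]`
// without owning any of the underlying data.
fn ids_to_bump_slice<'b>(bump: &'b Bump, ids: &[String]) -> &'b [&'b str] {
    let ids: BumpVec<'b, &'b str> = ids
        .iter()
        .map(|s| &*bump.alloc_str(s)) // reborrow &mut str as &str
        .collect_in(bump);
    ids.into_bump_slice()
}

fn main() {
    let bump = Bump::new();
    let owned = vec!["11".to_string(), "42".to_string()];
    assert_eq!(ids_to_bump_slice(&bump, &owned), ["11", "42"]);
}
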
@@ -1347,10 +1359,22 @@ impl IndexScheduler {
                     // let pool = indexer_config.thread_pool.unwrap();
                     let pool = rayon::ThreadPoolBuilder::new().build().unwrap();

-                    let param = (index, &rtxn, &primary_key);
-                    let document_changes = indexer.document_changes(&mut fields_ids_map, param)?;
-                    /// TODO pass/write the FieldsIdsMap
-                    indexer::index(index_wtxn, index, fields_ids_map, &pool, document_changes)?;
+                    let document_changes = indexer.into_changes(
+                        &indexer_alloc,
+                        index,
+                        &rtxn,
+                        &primary_key,
+                        &mut new_fields_ids_map,
+                    )?;
+
+                    indexer::index(
+                        index_wtxn,
+                        index,
+                        &db_fields_ids_map,
+                        new_fields_ids_map,
+                        &pool,
+                        &document_changes,
+                    )?;

                     // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
                 }

@@ -1501,10 +1525,11 @@ impl IndexScheduler {
                 }

                 let rtxn = index.read_txn()?;
-                let mut fields_ids_map = index.fields_ids_map(&rtxn)?;
+                let db_fields_ids_map = index.fields_ids_map(&rtxn)?;
+                let mut new_fields_ids_map = db_fields_ids_map.clone();

                 let primary_key =
-                    retrieve_or_guess_primary_key(&rtxn, index, &mut fields_ids_map, None)?
+                    retrieve_or_guess_primary_key(&rtxn, index, &mut new_fields_ids_map, None)?
                         .unwrap();

                 if !tasks.iter().all(|res| res.error.is_some()) {

@@ -1512,19 +1537,17 @@ impl IndexScheduler {
                     // let pool = indexer_config.thread_pool.unwrap();
                     let pool = rayon::ThreadPoolBuilder::new().build().unwrap();

-                    let param = (index, &fields_ids_map, &primary_key);
                     let mut indexer = indexer::DocumentDeletion::new();
                     indexer.delete_documents_by_docids(to_delete);
-                    /// TODO remove this fields-ids-map, it's useless for the deletion pipeline (the &mut cloned one).
-                    let document_changes =
-                        indexer.document_changes(&mut fields_ids_map.clone(), param)?;
-                    /// TODO pass/write the FieldsIdsMap
+                    let document_changes = indexer.into_changes(&indexer_alloc, primary_key);

                     indexer::index(
                         index_wtxn,
                         index,
-                        fields_ids_map.clone(),
+                        &db_fields_ids_map,
+                        new_fields_ids_map,
                         &pool,
-                        document_changes,
+                        &document_changes,
                     )?;

                     // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");