mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-27 13:47:29 +01:00
Merge pull request #35 from meilisearch/better-update-progress
Better update progress
This commit is contained in:
commit
b4951c058b
@ -54,13 +54,22 @@ $(window).on('load', function () {
|
||||
const content = $(`#${id} .updateStatus.content`);
|
||||
|
||||
let html;
|
||||
let { type, processed_number_of_documents, total_number_of_documents } = status.meta;
|
||||
if (type === 'DocumentsAddition' && processed_number_of_documents && total_number_of_documents) {
|
||||
let progress = Math.round(processed_number_of_documents / total_number_of_documents * 100);
|
||||
|
||||
let { type, step, total_steps, current, total } = status.meta;
|
||||
|
||||
if (type === 'DocumentsAddition') {
|
||||
// If the total is null or undefined then the progress results is infinity.
|
||||
let progress = Math.round(current / total * 100);
|
||||
// We must divide the progress by the total number of indexing steps.
|
||||
progress = progress / total_steps;
|
||||
// And mark the previous steps as processed.
|
||||
progress = progress + (step * 100 / total_steps);
|
||||
// Generate the appropriate html bulma progress bar.
|
||||
html = `<progress class="progress" title="${progress}%" value="${progress}" max="100"></progress>`;
|
||||
} else {
|
||||
html = `<progress class="progress" max="100"></progress>`;
|
||||
}
|
||||
|
||||
content.html(html);
|
||||
}
|
||||
|
||||
|
@ -26,6 +26,7 @@ use warp::filters::ws::Message;
|
||||
use warp::{Filter, http::Response};
|
||||
|
||||
use milli::tokenizer::{simple_tokenizer, TokenType};
|
||||
use milli::update::UpdateIndexingStep::*;
|
||||
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
|
||||
use milli::{obkv_to_json, Index, UpdateStore, SearchResult};
|
||||
|
||||
@ -201,8 +202,10 @@ enum UpdateMeta {
|
||||
#[serde(tag = "type")]
|
||||
enum UpdateMetaProgress {
|
||||
DocumentsAddition {
|
||||
processed_number_of_documents: usize,
|
||||
total_number_of_documents: Option<usize>,
|
||||
step: usize,
|
||||
total_steps: usize,
|
||||
current: usize,
|
||||
total: Option<usize>,
|
||||
},
|
||||
}
|
||||
|
||||
@ -310,12 +313,20 @@ async fn main() -> anyhow::Result<()> {
|
||||
Box::new(content) as Box<dyn io::Read>
|
||||
};
|
||||
|
||||
let result = builder.execute(reader, |count, total| {
|
||||
let result = builder.execute(reader, |indexing_step| {
|
||||
let (current, total) = match indexing_step {
|
||||
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
|
||||
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
|
||||
IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
|
||||
MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
|
||||
};
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
|
||||
update_id,
|
||||
meta: UpdateMetaProgress::DocumentsAddition {
|
||||
processed_number_of_documents: count,
|
||||
total_number_of_documents: Some(total),
|
||||
step: indexing_step.step(),
|
||||
total_steps: indexing_step.number_of_steps(),
|
||||
current,
|
||||
total,
|
||||
}
|
||||
});
|
||||
});
|
||||
@ -356,12 +367,20 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
let result = builder.execute(|count, total| {
|
||||
let result = builder.execute(|indexing_step| {
|
||||
let (current, total) = match indexing_step {
|
||||
TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
|
||||
ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
|
||||
IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
|
||||
MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
|
||||
};
|
||||
let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
|
||||
update_id,
|
||||
meta: UpdateMetaProgress::DocumentsAddition {
|
||||
processed_number_of_documents: count,
|
||||
total_number_of_documents: Some(total),
|
||||
step: indexing_step.step(),
|
||||
total_steps: indexing_step.number_of_steps(),
|
||||
current,
|
||||
total,
|
||||
}
|
||||
});
|
||||
});
|
||||
|
@ -15,6 +15,7 @@ use rayon::prelude::*;
|
||||
use rayon::ThreadPool;
|
||||
|
||||
use crate::index::Index;
|
||||
use crate::update::UpdateIndexingStep;
|
||||
use self::store::Store;
|
||||
use self::merge_function::{
|
||||
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
||||
@ -249,13 +250,14 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
pub fn execute<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<()>
|
||||
where
|
||||
R: io::Read,
|
||||
F: Fn(usize, usize) + Sync,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let before_transform = Instant::now();
|
||||
|
||||
let transform = Transform {
|
||||
rtxn: &self.wtxn,
|
||||
index: self.index,
|
||||
log_every_n: self.log_every_n,
|
||||
chunk_compression_type: self.chunk_compression_type,
|
||||
chunk_compression_level: self.chunk_compression_level,
|
||||
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
|
||||
@ -266,9 +268,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
};
|
||||
|
||||
let output = match self.update_format {
|
||||
UpdateFormat::Csv => transform.from_csv(reader)?,
|
||||
UpdateFormat::Json => transform.from_json(reader)?,
|
||||
UpdateFormat::JsonStream => transform.from_json_stream(reader)?,
|
||||
UpdateFormat::Csv => transform.from_csv(reader, &progress_callback)?,
|
||||
UpdateFormat::Json => transform.from_json(reader, &progress_callback)?,
|
||||
UpdateFormat::JsonStream => transform.from_json_stream(reader, &progress_callback)?,
|
||||
};
|
||||
|
||||
info!("Update transformed in {:.02?}", before_transform.elapsed());
|
||||
@ -278,7 +280,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()>
|
||||
where
|
||||
F: Fn(usize, usize) + Sync
|
||||
F: Fn(UpdateIndexingStep) + Sync
|
||||
{
|
||||
let before_indexing = Instant::now();
|
||||
|
||||
@ -460,6 +462,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
documents_ids.union_with(&replaced_documents_ids);
|
||||
self.index.put_documents_ids(self.wtxn, &documents_ids)?;
|
||||
|
||||
let mut database_count = 0;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: 0,
|
||||
total_databases: 5,
|
||||
});
|
||||
|
||||
debug!("Writing the docid word positions into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
@ -469,6 +477,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
write_method
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases: 5,
|
||||
});
|
||||
|
||||
debug!("Writing the documents into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
@ -478,6 +492,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
write_method
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases: 5,
|
||||
});
|
||||
|
||||
debug!("Writing the words pairs proximities docids into LMDB on disk...");
|
||||
merge_into_lmdb_database(
|
||||
self.wtxn,
|
||||
@ -487,6 +507,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases: 5,
|
||||
});
|
||||
|
||||
for (db_type, result) in receiver {
|
||||
let content = result?;
|
||||
match db_type {
|
||||
@ -512,8 +538,16 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
)?;
|
||||
},
|
||||
}
|
||||
|
||||
database_count += 1;
|
||||
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen: database_count,
|
||||
total_databases: 5,
|
||||
});
|
||||
}
|
||||
|
||||
debug_assert_eq!(database_count, 5);
|
||||
|
||||
info!("Transform output indexed in {:.02?}", before_indexing.elapsed());
|
||||
|
||||
Ok(())
|
||||
@ -537,7 +571,7 @@ mod tests {
|
||||
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 3 documents now.
|
||||
@ -551,7 +585,7 @@ mod tests {
|
||||
let content = &b"id,name\n1,updated kevin\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is **always** 3 documents.
|
||||
@ -565,7 +599,7 @@ mod tests {
|
||||
let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is **always** 3 documents.
|
||||
@ -589,7 +623,7 @@ mod tests {
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is only 1 document now.
|
||||
@ -616,7 +650,7 @@ mod tests {
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is **always** 1 document.
|
||||
@ -652,7 +686,7 @@ mod tests {
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.disable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
assert!(builder.execute(content, |_, _| ()).is_err());
|
||||
assert!(builder.execute(content, |_| ()).is_err());
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is no document.
|
||||
@ -679,7 +713,7 @@ mod tests {
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.disable_autogenerate_docids();
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
assert!(builder.execute(content, |_, _| ()).is_err());
|
||||
assert!(builder.execute(content, |_| ()).is_err());
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is no document.
|
||||
@ -701,7 +735,7 @@ mod tests {
|
||||
let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 3 documents now.
|
||||
@ -719,7 +753,7 @@ mod tests {
|
||||
let content = format!("id,name\n{},updated kevin", kevin_uuid);
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
|
||||
builder.execute(content.as_bytes(), |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is **always** 3 documents.
|
||||
@ -754,7 +788,7 @@ mod tests {
|
||||
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 3 documents now.
|
||||
@ -768,7 +802,7 @@ mod tests {
|
||||
let content = &b"name\nnew kevin"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 4 documents now.
|
||||
@ -790,7 +824,7 @@ mod tests {
|
||||
let content = &b"id,name\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is no documents.
|
||||
@ -816,7 +850,7 @@ mod tests {
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 3 documents now.
|
||||
@ -838,7 +872,7 @@ mod tests {
|
||||
let content = &b"[]"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is no documents.
|
||||
@ -864,7 +898,7 @@ mod tests {
|
||||
"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::JsonStream);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 3 documents now.
|
||||
@ -887,7 +921,7 @@ mod tests {
|
||||
let content = &b"id,name\nbrume bleue,kevin\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
assert!(builder.execute(content, |_, _| ()).is_err());
|
||||
assert!(builder.execute(content, |_| ()).is_err());
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// First we send 1 document with a valid id.
|
||||
@ -896,7 +930,7 @@ mod tests {
|
||||
let content = &b"id,name\n32,kevin\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 1 document now.
|
||||
@ -922,7 +956,7 @@ mod tests {
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that there is 1 documents now.
|
||||
|
@ -16,6 +16,7 @@ use tempfile::tempfile;
|
||||
|
||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::tokenizer::{simple_tokenizer, only_token};
|
||||
use crate::update::UpdateIndexingStep;
|
||||
use crate::{json_to_string, SmallVec32, Position, DocumentId};
|
||||
|
||||
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
||||
@ -294,7 +295,7 @@ impl Store {
|
||||
log_every_n: Option<usize>,
|
||||
mut progress_callback: F,
|
||||
) -> anyhow::Result<Readers>
|
||||
where F: FnMut(usize, usize),
|
||||
where F: FnMut(UpdateIndexingStep),
|
||||
{
|
||||
debug!("{:?}: Indexing in a Store...", thread_index);
|
||||
|
||||
@ -311,7 +312,10 @@ impl Store {
|
||||
// This is a log routine that we do every `log_every_n` documents.
|
||||
if log_every_n.map_or(false, |len| count % len == 0) {
|
||||
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
|
||||
progress_callback(count, documents_count);
|
||||
progress_callback(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: count,
|
||||
total_documents: documents_count,
|
||||
});
|
||||
before = Instant::now();
|
||||
}
|
||||
|
||||
@ -343,7 +347,10 @@ impl Store {
|
||||
count = count + 1;
|
||||
}
|
||||
|
||||
progress_callback(count, documents_count);
|
||||
progress_callback(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: count,
|
||||
total_documents: documents_count,
|
||||
});
|
||||
|
||||
let readers = self.finish()?;
|
||||
debug!("{:?}: Store created!", thread_index);
|
||||
|
@ -11,7 +11,7 @@ use roaring::RoaringBitmap;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use crate::{BEU32, MergeFn, Index, FieldsIdsMap};
|
||||
use crate::update::AvailableDocumentsIds;
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use super::merge_function::merge_two_obkvs;
|
||||
use super::{create_writer, create_sorter, IndexDocumentsMethod};
|
||||
|
||||
@ -34,6 +34,7 @@ pub struct TransformOutput {
|
||||
pub struct Transform<'t, 'i> {
|
||||
pub rtxn: &'t heed::RoTxn<'i>,
|
||||
pub index: &'i Index,
|
||||
pub log_every_n: Option<usize>,
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub chunk_fusing_shrink_size: Option<u64>,
|
||||
@ -44,15 +45,32 @@ pub struct Transform<'t, 'i> {
|
||||
}
|
||||
|
||||
impl Transform<'_, '_> {
|
||||
pub fn from_json<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
|
||||
self.from_generic_json(reader, false)
|
||||
pub fn from_json<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
self.from_generic_json(reader, false, progress_callback)
|
||||
}
|
||||
|
||||
pub fn from_json_stream<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
|
||||
self.from_generic_json(reader, true)
|
||||
pub fn from_json_stream<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
self.from_generic_json(reader, true, progress_callback)
|
||||
}
|
||||
|
||||
fn from_generic_json<R: Read>(self, reader: R, is_stream: bool) -> anyhow::Result<TransformOutput> {
|
||||
fn from_generic_json<R, F>(
|
||||
self,
|
||||
reader: R,
|
||||
is_stream: bool,
|
||||
progress_callback: F,
|
||||
) -> anyhow::Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
|
||||
let primary_key = self.index.primary_key(self.rtxn)?;
|
||||
@ -131,10 +149,17 @@ impl Transform<'_, '_> {
|
||||
let mut json_buffer = Vec::new();
|
||||
let mut obkv_buffer = Vec::new();
|
||||
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
|
||||
let mut documents_count = 0;
|
||||
|
||||
for result in documents {
|
||||
let document = result?;
|
||||
|
||||
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
}
|
||||
|
||||
obkv_buffer.clear();
|
||||
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
||||
|
||||
@ -186,14 +211,30 @@ impl Transform<'_, '_> {
|
||||
|
||||
// We use the extracted/generated user id as the key for this document.
|
||||
sorter.insert(user_id.as_bytes(), &obkv_buffer)?;
|
||||
documents_count += 1;
|
||||
}
|
||||
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
|
||||
// Now that we have a valid sorter that contains the user id and the obkv we
|
||||
// give it to the last transforming function which returns the TransformOutput.
|
||||
self.from_sorter(sorter, primary_key, fields_ids_map, users_ids_documents_ids)
|
||||
self.from_sorter(
|
||||
sorter,
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
documents_count,
|
||||
users_ids_documents_ids,
|
||||
progress_callback,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn from_csv<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
|
||||
pub fn from_csv<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
|
||||
|
||||
@ -255,12 +296,19 @@ impl Transform<'_, '_> {
|
||||
let mut json_buffer = Vec::new();
|
||||
let mut obkv_buffer = Vec::new();
|
||||
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
|
||||
let mut documents_count = 0;
|
||||
|
||||
let mut record = csv::StringRecord::new();
|
||||
while csv.read_record(&mut record)? {
|
||||
obkv_buffer.clear();
|
||||
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
|
||||
|
||||
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
}
|
||||
|
||||
// We extract the user id if we know where it is or generate an UUID V4 otherwise.
|
||||
let user_id = match user_id_pos {
|
||||
Some(pos) => {
|
||||
@ -292,23 +340,39 @@ impl Transform<'_, '_> {
|
||||
|
||||
// We use the extracted/generated user id as the key for this document.
|
||||
sorter.insert(user_id, &obkv_buffer)?;
|
||||
documents_count += 1;
|
||||
}
|
||||
|
||||
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
|
||||
// Now that we have a valid sorter that contains the user id and the obkv we
|
||||
// give it to the last transforming function which returns the TransformOutput.
|
||||
self.from_sorter(sorter, primary_key_field_id, fields_ids_map, users_ids_documents_ids)
|
||||
self.from_sorter(
|
||||
sorter,
|
||||
primary_key_field_id,
|
||||
fields_ids_map,
|
||||
documents_count,
|
||||
users_ids_documents_ids,
|
||||
progress_callback,
|
||||
)
|
||||
}
|
||||
|
||||
/// Generate the `TransformOutput` based on the given sorter that can be generated from any
|
||||
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
|
||||
/// id for the user side and the value must be an obkv where keys are valid fields ids.
|
||||
fn from_sorter(
|
||||
fn from_sorter<F>(
|
||||
self,
|
||||
sorter: grenad::Sorter<MergeFn>,
|
||||
primary_key: u8,
|
||||
fields_ids_map: FieldsIdsMap,
|
||||
approximate_number_of_documents: usize,
|
||||
users_ids_documents_ids: fst::Map<Cow<'_, [u8]>>,
|
||||
progress_callback: F,
|
||||
) -> anyhow::Result<TransformOutput>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
||||
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
|
||||
@ -332,6 +396,13 @@ impl Transform<'_, '_> {
|
||||
let mut iter = sorter.into_iter()?;
|
||||
while let Some((user_id, update_obkv)) = iter.next()? {
|
||||
|
||||
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
|
||||
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
|
||||
documents_seen: documents_count,
|
||||
total_documents: approximate_number_of_documents,
|
||||
});
|
||||
}
|
||||
|
||||
let (docid, obkv) = match users_ids_documents_ids.get(user_id) {
|
||||
Some(docid) => {
|
||||
// If we find the user id in the current users ids documents ids map
|
||||
@ -369,6 +440,11 @@ impl Transform<'_, '_> {
|
||||
documents_count += 1;
|
||||
}
|
||||
|
||||
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
|
||||
documents_seen: documents_count,
|
||||
total_documents: documents_count,
|
||||
});
|
||||
|
||||
// We create a final writer to write the new documents in order from the sorter.
|
||||
let file = tempfile::tempfile()?;
|
||||
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
|
@ -4,6 +4,7 @@ mod delete_documents;
|
||||
mod index_documents;
|
||||
mod settings;
|
||||
mod update_builder;
|
||||
mod update_step;
|
||||
mod update_store;
|
||||
|
||||
pub use self::available_documents_ids::AvailableDocumentsIds;
|
||||
@ -12,4 +13,5 @@ pub use self::delete_documents::DeleteDocuments;
|
||||
pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
|
||||
pub use self::settings::Settings;
|
||||
pub use self::update_builder::UpdateBuilder;
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
pub use self::update_store::UpdateStore;
|
||||
|
@ -3,7 +3,7 @@ use grenad::CompressionType;
|
||||
use rayon::ThreadPool;
|
||||
|
||||
use crate::update::index_documents::{Transform, IndexDocumentsMethod};
|
||||
use crate::update::{ClearDocuments, IndexDocuments};
|
||||
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
|
||||
use crate::{Index, FieldsIdsMap};
|
||||
|
||||
pub struct Settings<'a, 't, 'u, 'i> {
|
||||
@ -60,7 +60,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
|
||||
pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()>
|
||||
where
|
||||
F: Fn(usize, usize) + Sync
|
||||
F: Fn(UpdateIndexingStep) + Sync
|
||||
{
|
||||
// Check that the searchable attributes have been specified.
|
||||
if let Some(value) = self.searchable_fields {
|
||||
@ -126,6 +126,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
let transform = Transform {
|
||||
rtxn: &self.wtxn,
|
||||
index: self.index,
|
||||
log_every_n: self.log_every_n,
|
||||
chunk_compression_type: self.chunk_compression_type,
|
||||
chunk_compression_level: self.chunk_compression_level,
|
||||
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
|
||||
@ -231,14 +232,14 @@ mod tests {
|
||||
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// We change the searchable fields to be the "name" field only.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index);
|
||||
builder.set_searchable_fields(vec!["name".into()]);
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that the searchable field is correctly set to "name" only.
|
||||
@ -260,7 +261,7 @@ mod tests {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index);
|
||||
builder.reset_searchable_fields();
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that the searchable field have been reset and documents are found now.
|
||||
@ -286,7 +287,7 @@ mod tests {
|
||||
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// In the same transaction we change the displayed fields to be only the "age".
|
||||
@ -295,7 +296,7 @@ mod tests {
|
||||
let mut builder = Settings::new(&mut wtxn, &index);
|
||||
builder.set_displayed_fields(vec!["age".into()]);
|
||||
builder.set_searchable_fields(vec!["name".into()]);
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that the displayed fields are correctly set to `None` (default value).
|
||||
@ -310,7 +311,7 @@ mod tests {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index);
|
||||
builder.reset_searchable_fields();
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that the displayed fields always contains only the "age" field.
|
||||
@ -334,7 +335,7 @@ mod tests {
|
||||
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that the displayed fields are correctly set to `None` (default value).
|
||||
@ -356,12 +357,12 @@ mod tests {
|
||||
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index);
|
||||
builder.update_format(UpdateFormat::Csv);
|
||||
builder.execute(content, |_, _| ()).unwrap();
|
||||
builder.execute(content, |_| ()).unwrap();
|
||||
|
||||
// In the same transaction we change the displayed fields to be only the age.
|
||||
let mut builder = Settings::new(&mut wtxn, &index);
|
||||
builder.set_displayed_fields(vec!["age".into()]);
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that the displayed fields are correctly set to only the "age" field.
|
||||
@ -376,7 +377,7 @@ mod tests {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index);
|
||||
builder.reset_displayed_fields();
|
||||
builder.execute(|_, _| ()).unwrap();
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// Check that the displayed fields are correctly set to `None` (default value).
|
||||
|
36
src/update/update_step.rs
Normal file
36
src/update/update_step.rs
Normal file
@ -0,0 +1,36 @@
|
||||
use UpdateIndexingStep::*;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum UpdateIndexingStep {
|
||||
/// Transform from the original user given format (CSV, JSON, JSON lines)
|
||||
/// into a generic format based on the obkv and grenad crates. This step also
|
||||
/// deduplicate potential documents in this batch update by merging or replacing them.
|
||||
TransformFromUserIntoGenericFormat { documents_seen: usize },
|
||||
|
||||
/// This step check the external document id, computes the internal ids and merge
|
||||
/// the documents that are already present in the database.
|
||||
ComputeIdsAndMergeDocuments { documents_seen: usize, total_documents: usize },
|
||||
|
||||
/// Extract the documents words using the tokenizer and compute the documents
|
||||
/// facets. Stores those words, facets and documents ids on disk.
|
||||
IndexDocuments { documents_seen: usize, total_documents: usize },
|
||||
|
||||
/// Merge the previously extracted data (words and facets) into the final LMDB database.
|
||||
/// These extracted data are split into multiple databases.
|
||||
MergeDataIntoFinalDatabase { databases_seen: usize, total_databases: usize },
|
||||
}
|
||||
|
||||
impl UpdateIndexingStep {
|
||||
pub const fn step(&self) -> usize {
|
||||
match self {
|
||||
TransformFromUserIntoGenericFormat { .. } => 0,
|
||||
ComputeIdsAndMergeDocuments { .. } => 1,
|
||||
IndexDocuments { .. } => 2,
|
||||
MergeDataIntoFinalDatabase { .. } => 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub const fn number_of_steps(&self) -> usize {
|
||||
4
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user