Make the indexing process send the new progress step events

This commit is contained in:
Clément Renault 2020-11-11 12:39:09 +01:00
parent e78b96a657
commit ea43080548
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
5 changed files with 167 additions and 49 deletions

View File

@ -15,6 +15,7 @@ use rayon::prelude::*;
use rayon::ThreadPool; use rayon::ThreadPool;
use crate::index::Index; use crate::index::Index;
use crate::update::UpdateIndexingStep;
use self::store::Store; use self::store::Store;
use self::merge_function::{ use self::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge, main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
@ -249,13 +250,14 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
pub fn execute<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<()> pub fn execute<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<()>
where where
R: io::Read, R: io::Read,
F: Fn(usize, usize) + Sync, F: Fn(UpdateIndexingStep) + Sync,
{ {
let before_transform = Instant::now(); let before_transform = Instant::now();
let transform = Transform { let transform = Transform {
rtxn: &self.wtxn, rtxn: &self.wtxn,
index: self.index, index: self.index,
log_every_n: self.log_every_n,
chunk_compression_type: self.chunk_compression_type, chunk_compression_type: self.chunk_compression_type,
chunk_compression_level: self.chunk_compression_level, chunk_compression_level: self.chunk_compression_level,
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
@ -266,9 +268,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
}; };
let output = match self.update_format { let output = match self.update_format {
UpdateFormat::Csv => transform.from_csv(reader)?, UpdateFormat::Csv => transform.from_csv(reader, &progress_callback)?,
UpdateFormat::Json => transform.from_json(reader)?, UpdateFormat::Json => transform.from_json(reader, &progress_callback)?,
UpdateFormat::JsonStream => transform.from_json_stream(reader)?, UpdateFormat::JsonStream => transform.from_json_stream(reader, &progress_callback)?,
}; };
info!("Update transformed in {:.02?}", before_transform.elapsed()); info!("Update transformed in {:.02?}", before_transform.elapsed());
@ -278,7 +280,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()> pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()>
where where
F: Fn(usize, usize) + Sync F: Fn(UpdateIndexingStep) + Sync
{ {
let before_indexing = Instant::now(); let before_indexing = Instant::now();
@ -460,6 +462,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
documents_ids.union_with(&replaced_documents_ids); documents_ids.union_with(&replaced_documents_ids);
self.index.put_documents_ids(self.wtxn, &documents_ids)?; self.index.put_documents_ids(self.wtxn, &documents_ids)?;
let mut database_count = 0;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: 0,
total_databases: 5,
});
debug!("Writing the docid word positions into LMDB on disk..."); debug!("Writing the docid word positions into LMDB on disk...");
merge_into_lmdb_database( merge_into_lmdb_database(
self.wtxn, self.wtxn,
@ -469,6 +477,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method write_method
)?; )?;
database_count += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: database_count,
total_databases: 5,
});
debug!("Writing the documents into LMDB on disk..."); debug!("Writing the documents into LMDB on disk...");
merge_into_lmdb_database( merge_into_lmdb_database(
self.wtxn, self.wtxn,
@ -478,6 +492,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method write_method
)?; )?;
database_count += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: database_count,
total_databases: 5,
});
debug!("Writing the words pairs proximities docids into LMDB on disk..."); debug!("Writing the words pairs proximities docids into LMDB on disk...");
merge_into_lmdb_database( merge_into_lmdb_database(
self.wtxn, self.wtxn,
@ -487,6 +507,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
write_method, write_method,
)?; )?;
database_count += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: database_count,
total_databases: 5,
});
for (db_type, result) in receiver { for (db_type, result) in receiver {
let content = result?; let content = result?;
match db_type { match db_type {
@ -512,8 +538,16 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
)?; )?;
}, },
} }
database_count += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: database_count,
total_databases: 5,
});
} }
debug_assert_eq!(database_count, 5);
info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); info!("Transform output indexed in {:.02?}", before_indexing.elapsed());
Ok(()) Ok(())
@ -537,7 +571,7 @@ mod tests {
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is 3 documents now. // Check that there is 3 documents now.
@ -551,7 +585,7 @@ mod tests {
let content = &b"id,name\n1,updated kevin\n"[..]; let content = &b"id,name\n1,updated kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is **always** 3 documents. // Check that there is **always** 3 documents.
@ -565,7 +599,7 @@ mod tests {
let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..]; let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is **always** 3 documents. // Check that there is **always** 3 documents.
@ -589,7 +623,7 @@ mod tests {
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is only 1 document now. // Check that there is only 1 document now.
@ -616,7 +650,7 @@ mod tests {
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is **always** 1 document. // Check that there is **always** 1 document.
@ -652,7 +686,7 @@ mod tests {
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.disable_autogenerate_docids(); builder.disable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
assert!(builder.execute(content, |_, _| ()).is_err()); assert!(builder.execute(content, |_| ()).is_err());
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is no document. // Check that there is no document.
@ -679,7 +713,7 @@ mod tests {
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.disable_autogenerate_docids(); builder.disable_autogenerate_docids();
builder.update_format(UpdateFormat::Json); builder.update_format(UpdateFormat::Json);
assert!(builder.execute(content, |_, _| ()).is_err()); assert!(builder.execute(content, |_| ()).is_err());
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is no document. // Check that there is no document.
@ -701,7 +735,7 @@ mod tests {
let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is 3 documents now. // Check that there is 3 documents now.
@ -719,7 +753,7 @@ mod tests {
let content = format!("id,name\n{},updated kevin", kevin_uuid); let content = format!("id,name\n{},updated kevin", kevin_uuid);
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content.as_bytes(), |_, _| ()).unwrap(); builder.execute(content.as_bytes(), |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is **always** 3 documents. // Check that there is **always** 3 documents.
@ -754,7 +788,7 @@ mod tests {
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is 3 documents now. // Check that there is 3 documents now.
@ -768,7 +802,7 @@ mod tests {
let content = &b"name\nnew kevin"[..]; let content = &b"name\nnew kevin"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is 4 documents now. // Check that there is 4 documents now.
@ -790,7 +824,7 @@ mod tests {
let content = &b"id,name\n"[..]; let content = &b"id,name\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is no documents. // Check that there is no documents.
@ -816,7 +850,7 @@ mod tests {
]"#[..]; ]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Json); builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is 3 documents now. // Check that there is 3 documents now.
@ -838,7 +872,7 @@ mod tests {
let content = &b"[]"[..]; let content = &b"[]"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Json); builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is no documents. // Check that there is no documents.
@ -864,7 +898,7 @@ mod tests {
"#[..]; "#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::JsonStream); builder.update_format(UpdateFormat::JsonStream);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is 3 documents now. // Check that there is 3 documents now.
@ -887,7 +921,7 @@ mod tests {
let content = &b"id,name\nbrume bleue,kevin\n"[..]; let content = &b"id,name\nbrume bleue,kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
assert!(builder.execute(content, |_, _| ()).is_err()); assert!(builder.execute(content, |_| ()).is_err());
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// First we send 1 document with a valid id. // First we send 1 document with a valid id.
@ -896,7 +930,7 @@ mod tests {
let content = &b"id,name\n32,kevin\n"[..]; let content = &b"id,name\n32,kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is 1 document now. // Check that there is 1 document now.
@ -922,7 +956,7 @@ mod tests {
]"#[..]; ]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Json); builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that there is 1 documents now. // Check that there is 1 documents now.

View File

@ -16,6 +16,7 @@ use tempfile::tempfile;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::tokenizer::{simple_tokenizer, only_token}; use crate::tokenizer::{simple_tokenizer, only_token};
use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec32, Position, DocumentId}; use crate::{json_to_string, SmallVec32, Position, DocumentId};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
@ -294,7 +295,7 @@ impl Store {
log_every_n: Option<usize>, log_every_n: Option<usize>,
mut progress_callback: F, mut progress_callback: F,
) -> anyhow::Result<Readers> ) -> anyhow::Result<Readers>
where F: FnMut(usize, usize), where F: FnMut(UpdateIndexingStep),
{ {
debug!("{:?}: Indexing in a Store...", thread_index); debug!("{:?}: Indexing in a Store...", thread_index);
@ -311,7 +312,10 @@ impl Store {
// This is a log routine that we do every `log_every_n` documents. // This is a log routine that we do every `log_every_n` documents.
if log_every_n.map_or(false, |len| count % len == 0) { if log_every_n.map_or(false, |len| count % len == 0) {
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
progress_callback(count, documents_count); progress_callback(UpdateIndexingStep::IndexDocuments {
documents_seen: count,
total_documents: documents_count,
});
before = Instant::now(); before = Instant::now();
} }
@ -343,7 +347,10 @@ impl Store {
count = count + 1; count = count + 1;
} }
progress_callback(count, documents_count); progress_callback(UpdateIndexingStep::IndexDocuments {
documents_seen: count,
total_documents: documents_count,
});
let readers = self.finish()?; let readers = self.finish()?;
debug!("{:?}: Store created!", thread_index); debug!("{:?}: Store created!", thread_index);

View File

@ -11,7 +11,7 @@ use roaring::RoaringBitmap;
use serde_json::{Map, Value}; use serde_json::{Map, Value};
use crate::{BEU32, MergeFn, Index, FieldsIdsMap}; use crate::{BEU32, MergeFn, Index, FieldsIdsMap};
use crate::update::AvailableDocumentsIds; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use super::merge_function::merge_two_obkvs; use super::merge_function::merge_two_obkvs;
use super::{create_writer, create_sorter, IndexDocumentsMethod}; use super::{create_writer, create_sorter, IndexDocumentsMethod};
@ -34,6 +34,7 @@ pub struct TransformOutput {
pub struct Transform<'t, 'i> { pub struct Transform<'t, 'i> {
pub rtxn: &'t heed::RoTxn<'i>, pub rtxn: &'t heed::RoTxn<'i>,
pub index: &'i Index, pub index: &'i Index,
pub log_every_n: Option<usize>,
pub chunk_compression_type: CompressionType, pub chunk_compression_type: CompressionType,
pub chunk_compression_level: Option<u32>, pub chunk_compression_level: Option<u32>,
pub chunk_fusing_shrink_size: Option<u64>, pub chunk_fusing_shrink_size: Option<u64>,
@ -44,15 +45,32 @@ pub struct Transform<'t, 'i> {
} }
impl Transform<'_, '_> { impl Transform<'_, '_> {
pub fn from_json<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> { pub fn from_json<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
self.from_generic_json(reader, false) where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
self.from_generic_json(reader, false, progress_callback)
} }
pub fn from_json_stream<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> { pub fn from_json_stream<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
self.from_generic_json(reader, true) where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
self.from_generic_json(reader, true, progress_callback)
} }
fn from_generic_json<R: Read>(self, reader: R, is_stream: bool) -> anyhow::Result<TransformOutput> { fn from_generic_json<R, F>(
self,
reader: R,
is_stream: bool,
progress_callback: F,
) -> anyhow::Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap(); let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
let primary_key = self.index.primary_key(self.rtxn)?; let primary_key = self.index.primary_key(self.rtxn)?;
@ -131,10 +149,17 @@ impl Transform<'_, '_> {
let mut json_buffer = Vec::new(); let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new(); let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
let mut documents_count = 0;
for result in documents { for result in documents {
let document = result?; let document = result?;
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
documents_seen: documents_count,
});
}
obkv_buffer.clear(); obkv_buffer.clear();
let mut writer = obkv::KvWriter::new(&mut obkv_buffer); let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
@ -186,14 +211,30 @@ impl Transform<'_, '_> {
// We use the extracted/generated user id as the key for this document. // We use the extracted/generated user id as the key for this document.
sorter.insert(user_id.as_bytes(), &obkv_buffer)?; sorter.insert(user_id.as_bytes(), &obkv_buffer)?;
documents_count += 1;
} }
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
documents_seen: documents_count,
});
// Now that we have a valid sorter that contains the user id and the obkv we // Now that we have a valid sorter that contains the user id and the obkv we
// give it to the last transforming function which returns the TransformOutput. // give it to the last transforming function which returns the TransformOutput.
self.from_sorter(sorter, primary_key, fields_ids_map, users_ids_documents_ids) self.from_sorter(
sorter,
primary_key,
fields_ids_map,
documents_count,
users_ids_documents_ids,
progress_callback,
)
} }
pub fn from_csv<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> { pub fn from_csv<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap(); let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
@ -255,12 +296,19 @@ impl Transform<'_, '_> {
let mut json_buffer = Vec::new(); let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new(); let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
let mut documents_count = 0;
let mut record = csv::StringRecord::new(); let mut record = csv::StringRecord::new();
while csv.read_record(&mut record)? { while csv.read_record(&mut record)? {
obkv_buffer.clear(); obkv_buffer.clear();
let mut writer = obkv::KvWriter::new(&mut obkv_buffer); let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
documents_seen: documents_count,
});
}
// We extract the user id if we know where it is or generate an UUID V4 otherwise. // We extract the user id if we know where it is or generate an UUID V4 otherwise.
let user_id = match user_id_pos { let user_id = match user_id_pos {
Some(pos) => { Some(pos) => {
@ -292,23 +340,39 @@ impl Transform<'_, '_> {
// We use the extracted/generated user id as the key for this document. // We use the extracted/generated user id as the key for this document.
sorter.insert(user_id, &obkv_buffer)?; sorter.insert(user_id, &obkv_buffer)?;
documents_count += 1;
} }
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
documents_seen: documents_count,
});
// Now that we have a valid sorter that contains the user id and the obkv we // Now that we have a valid sorter that contains the user id and the obkv we
// give it to the last transforming function which returns the TransformOutput. // give it to the last transforming function which returns the TransformOutput.
self.from_sorter(sorter, primary_key_field_id, fields_ids_map, users_ids_documents_ids) self.from_sorter(
sorter,
primary_key_field_id,
fields_ids_map,
documents_count,
users_ids_documents_ids,
progress_callback,
)
} }
/// Generate the `TransformOutput` based on the given sorter that can be generated from any /// Generate the `TransformOutput` based on the given sorter that can be generated from any
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
/// id for the user side and the value must be an obkv where keys are valid fields ids. /// id for the user side and the value must be an obkv where keys are valid fields ids.
fn from_sorter( fn from_sorter<F>(
self, self,
sorter: grenad::Sorter<MergeFn>, sorter: grenad::Sorter<MergeFn>,
primary_key: u8, primary_key: u8,
fields_ids_map: FieldsIdsMap, fields_ids_map: FieldsIdsMap,
approximate_number_of_documents: usize,
users_ids_documents_ids: fst::Map<Cow<'_, [u8]>>, users_ids_documents_ids: fst::Map<Cow<'_, [u8]>>,
progress_callback: F,
) -> anyhow::Result<TransformOutput> ) -> anyhow::Result<TransformOutput>
where
F: Fn(UpdateIndexingStep) + Sync,
{ {
let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?;
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
@ -332,6 +396,13 @@ impl Transform<'_, '_> {
let mut iter = sorter.into_iter()?; let mut iter = sorter.into_iter()?;
while let Some((user_id, update_obkv)) = iter.next()? { while let Some((user_id, update_obkv)) = iter.next()? {
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
documents_seen: documents_count,
total_documents: approximate_number_of_documents,
});
}
let (docid, obkv) = match users_ids_documents_ids.get(user_id) { let (docid, obkv) = match users_ids_documents_ids.get(user_id) {
Some(docid) => { Some(docid) => {
// If we find the user id in the current users ids documents ids map // If we find the user id in the current users ids documents ids map
@ -369,6 +440,11 @@ impl Transform<'_, '_> {
documents_count += 1; documents_count += 1;
} }
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
documents_seen: documents_count,
total_documents: documents_count,
});
// We create a final writer to write the new documents in order from the sorter. // We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?; let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;

View File

@ -3,7 +3,7 @@ use grenad::CompressionType;
use rayon::ThreadPool; use rayon::ThreadPool;
use crate::update::index_documents::{Transform, IndexDocumentsMethod}; use crate::update::index_documents::{Transform, IndexDocumentsMethod};
use crate::update::{ClearDocuments, IndexDocuments}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::{Index, FieldsIdsMap}; use crate::{Index, FieldsIdsMap};
pub struct Settings<'a, 't, 'u, 'i> { pub struct Settings<'a, 't, 'u, 'i> {
@ -60,7 +60,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()> pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()>
where where
F: Fn(usize, usize) + Sync F: Fn(UpdateIndexingStep) + Sync
{ {
// Check that the searchable attributes have been specified. // Check that the searchable attributes have been specified.
if let Some(value) = self.searchable_fields { if let Some(value) = self.searchable_fields {
@ -126,6 +126,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let transform = Transform { let transform = Transform {
rtxn: &self.wtxn, rtxn: &self.wtxn,
index: self.index, index: self.index,
log_every_n: self.log_every_n,
chunk_compression_type: self.chunk_compression_type, chunk_compression_type: self.chunk_compression_type,
chunk_compression_level: self.chunk_compression_level, chunk_compression_level: self.chunk_compression_level,
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
@ -231,14 +232,14 @@ mod tests {
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// We change the searchable fields to be the "name" field only. // We change the searchable fields to be the "name" field only.
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index); let mut builder = Settings::new(&mut wtxn, &index);
builder.set_searchable_fields(vec!["name".into()]); builder.set_searchable_fields(vec!["name".into()]);
builder.execute(|_, _| ()).unwrap(); builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that the searchable field is correctly set to "name" only. // Check that the searchable field is correctly set to "name" only.
@ -260,7 +261,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index); let mut builder = Settings::new(&mut wtxn, &index);
builder.reset_searchable_fields(); builder.reset_searchable_fields();
builder.execute(|_, _| ()).unwrap(); builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that the searchable field have been reset and documents are found now. // Check that the searchable field have been reset and documents are found now.
@ -286,7 +287,7 @@ mod tests {
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// In the same transaction we change the displayed fields to be only the "age". // In the same transaction we change the displayed fields to be only the "age".
@ -295,7 +296,7 @@ mod tests {
let mut builder = Settings::new(&mut wtxn, &index); let mut builder = Settings::new(&mut wtxn, &index);
builder.set_displayed_fields(vec!["age".into()]); builder.set_displayed_fields(vec!["age".into()]);
builder.set_searchable_fields(vec!["name".into()]); builder.set_searchable_fields(vec!["name".into()]);
builder.execute(|_, _| ()).unwrap(); builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that the displayed fields are correctly set to `None` (default value). // Check that the displayed fields are correctly set to `None` (default value).
@ -310,7 +311,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index); let mut builder = Settings::new(&mut wtxn, &index);
builder.reset_searchable_fields(); builder.reset_searchable_fields();
builder.execute(|_, _| ()).unwrap(); builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that the displayed fields always contains only the "age" field. // Check that the displayed fields always contains only the "age" field.
@ -334,7 +335,7 @@ mod tests {
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that the displayed fields are correctly set to `None` (default value). // Check that the displayed fields are correctly set to `None` (default value).
@ -356,12 +357,12 @@ mod tests {
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index); let mut builder = IndexDocuments::new(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv); builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap(); builder.execute(content, |_| ()).unwrap();
// In the same transaction we change the displayed fields to be only the age. // In the same transaction we change the displayed fields to be only the age.
let mut builder = Settings::new(&mut wtxn, &index); let mut builder = Settings::new(&mut wtxn, &index);
builder.set_displayed_fields(vec!["age".into()]); builder.set_displayed_fields(vec!["age".into()]);
builder.execute(|_, _| ()).unwrap(); builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that the displayed fields are correctly set to only the "age" field. // Check that the displayed fields are correctly set to only the "age" field.
@ -376,7 +377,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index); let mut builder = Settings::new(&mut wtxn, &index);
builder.reset_displayed_fields(); builder.reset_displayed_fields();
builder.execute(|_, _| ()).unwrap(); builder.execute(|_| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
// Check that the displayed fields are correctly set to `None` (default value). // Check that the displayed fields are correctly set to `None` (default value).

View File

@ -21,7 +21,7 @@ pub enum UpdateIndexingStep {
} }
impl UpdateIndexingStep { impl UpdateIndexingStep {
pub const fn step_index(&self) -> usize { pub const fn step(&self) -> usize {
match self { match self {
TransformFromUserIntoGenericFormat { .. } => 0, TransformFromUserIntoGenericFormat { .. } => 0,
ComputeIdsAndMergeDocuments { .. } => 1, ComputeIdsAndMergeDocuments { .. } => 1,