Fix test and use progress callback

This commit is contained in:
many 2021-08-17 10:56:06 +02:00
parent 1d314328f0
commit 823da19745
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
3 changed files with 79 additions and 16 deletions

View File

@ -11,6 +11,7 @@ use std::collections::HashSet;
use std::fs::File; use std::fs::File;
use crossbeam_channel::Sender; use crossbeam_channel::Sender;
use log::debug;
use rayon::prelude::*; use rayon::prelude::*;
use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_docid_word_positions::extract_docid_word_positions;
@ -192,6 +193,7 @@ fn spawn_extraction_task<FE, FS>(
.map(|chunk| extract_fn(chunk, indexer.clone()).unwrap()) .map(|chunk| extract_fn(chunk, indexer.clone()).unwrap())
.collect(); .collect();
rayon::spawn(move || { rayon::spawn(move || {
debug!("merge {} database", name);
let reader = merge_readers(chunks, merge_fn, indexer).unwrap(); let reader = merge_readers(chunks, merge_fn, indexer).unwrap();
lmdb_writer_sx.send(serialize_fn(reader)).unwrap(); lmdb_writer_sx.send(serialize_fn(reader)).unwrap();
}); });

View File

@ -31,6 +31,10 @@ use crate::update::{
}; };
use crate::{Index, Result}; use crate::{Index, Result};
static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 5;
static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DocumentAdditionResult { pub struct DocumentAdditionResult {
pub nb_documents: usize, pub nb_documents: usize,
@ -278,16 +282,35 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let index_is_empty = index_documents_ids.len() == 0; let index_is_empty = index_documents_ids.len() == 0;
let mut final_documents_ids = RoaringBitmap::new(); let mut final_documents_ids = RoaringBitmap::new();
let mut databases_seen = 0;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
for typed_chunk in lmdb_writer_rx { for typed_chunk in lmdb_writer_rx {
let docids = let (docids, is_merged_database) =
write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?; write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?;
if !docids.is_empty() {
final_documents_ids |= docids; final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
progress_callback(UpdateIndexingStep::IndexDocuments {
documents_seen: documents_seen_count as usize,
total_documents: documents_count,
});
debug!( debug!(
"We have seen {} documents on {} total document so far", "We have seen {} documents on {} total document so far",
final_documents_ids.len(), documents_seen_count, documents_count
documents_count
); );
} }
if is_merged_database {
databases_seen += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
}
}
// We write the field distribution into the main database // We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?; self.index.put_field_distribution(self.wtxn, &field_distribution)?;
@ -298,20 +321,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// We write the external documents ids into the main database. // We write the external documents ids into the main database.
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
let all_documents_ids = index_documents_ids | new_documents_ids; let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids;
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
self.execute_prefix_databases(progress_callback) self.execute_prefix_databases(progress_callback)
} }
pub fn execute_prefix_databases<F>( pub fn execute_prefix_databases<F>(self, progress_callback: F) -> Result<()>
self,
// output: TransformOutput,
progress_callback: F,
) -> Result<()>
where where
F: Fn(UpdateIndexingStep) + Sync, F: Fn(UpdateIndexingStep) + Sync,
{ {
// Merged databases are already been indexed, we start from this count;
let mut databases_seen = MERGED_DATABASE_COUNT;
// Run the facets update operation. // Run the facets update operation.
let mut builder = Facets::new(self.wtxn, self.index, self.update_id); let mut builder = Facets::new(self.wtxn, self.index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_type = self.chunk_compression_type;
@ -324,6 +346,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
} }
builder.execute()?; builder.execute()?;
databases_seen += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
// Run the words prefixes update operation. // Run the words prefixes update operation.
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id);
if let Some(value) = self.words_prefix_threshold { if let Some(value) = self.words_prefix_threshold {
@ -334,6 +362,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
} }
builder.execute()?; builder.execute()?;
databases_seen += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
// Run the word prefix docids update operation. // Run the word prefix docids update operation.
let mut builder = WordPrefixDocids::new(self.wtxn, self.index); let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_type = self.chunk_compression_type;
@ -342,6 +376,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
builder.max_memory = self.max_memory; builder.max_memory = self.max_memory;
builder.execute()?; builder.execute()?;
databases_seen += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
// Run the word prefix pair proximity docids update operation. // Run the word prefix pair proximity docids update operation.
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_type = self.chunk_compression_type;
@ -350,6 +390,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
builder.max_memory = self.max_memory; builder.max_memory = self.max_memory;
builder.execute()?; builder.execute()?;
databases_seen += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
// Run the words level positions update operation. // Run the words level positions update operation.
let mut builder = WordsLevelPositions::new(self.wtxn, self.index); let mut builder = WordsLevelPositions::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_type = self.chunk_compression_type;
@ -362,6 +408,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
} }
builder.execute()?; builder.execute()?;
databases_seen += 1;
progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen: databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
Ok(()) Ok(())
} }
} }

View File

@ -32,7 +32,8 @@ pub(crate) fn write_typed_chunk_into_index(
index: &Index, index: &Index,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
index_is_empty: bool, index_is_empty: bool,
) -> Result<RoaringBitmap> { ) -> Result<(RoaringBitmap, bool)> {
let mut is_merged_database = false;
match typed_chunk { match typed_chunk {
TypedChunk::DocidWordPositions(docid_word_positions_iter) => { TypedChunk::DocidWordPositions(docid_word_positions_iter) => {
write_entries_into_database( write_entries_into_database(
@ -71,8 +72,11 @@ pub(crate) fn write_typed_chunk_into_index(
|value, _buffer| Ok(value), |value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
)?; )?;
is_merged_database = true;
}
TypedChunk::NewDocumentsIds(documents_ids) => {
return Ok((documents_ids, is_merged_database))
} }
TypedChunk::NewDocumentsIds(documents_ids) => return Ok(documents_ids),
TypedChunk::WordDocids(word_docids_iter) => { TypedChunk::WordDocids(word_docids_iter) => {
let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?;
append_entries_into_database( append_entries_into_database(
@ -100,6 +104,7 @@ pub(crate) fn write_typed_chunk_into_index(
builder.extend_stream(union_stream)?; builder.extend_stream(union_stream)?;
let fst = builder.into_set(); let fst = builder.into_set();
index.put_words_fst(wtxn, &fst)?; index.put_words_fst(wtxn, &fst)?;
is_merged_database = true;
} }
TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => { TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => {
append_entries_into_database( append_entries_into_database(
@ -110,6 +115,7 @@ pub(crate) fn write_typed_chunk_into_index(
|value, _buffer| Ok(value), |value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
)?; )?;
is_merged_database = true;
} }
TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => {
append_entries_into_database( append_entries_into_database(
@ -120,6 +126,7 @@ pub(crate) fn write_typed_chunk_into_index(
|value, _buffer| Ok(value), |value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
)?; )?;
is_merged_database = true;
} }
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
append_entries_into_database( append_entries_into_database(
@ -130,6 +137,7 @@ pub(crate) fn write_typed_chunk_into_index(
|value, _buffer| Ok(value), |value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
)?; )?;
is_merged_database = true;
} }
TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => { TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => {
let index_fid_docid_facet_numbers = let index_fid_docid_facet_numbers =
@ -166,10 +174,11 @@ pub(crate) fn write_typed_chunk_into_index(
Ok(values.serialize_into(buffer)?) Ok(values.serialize_into(buffer)?)
}, },
)?; )?;
is_merged_database = true;
} }
} }
Ok(RoaringBitmap::new()) Ok((RoaringBitmap::new(), is_merged_database))
} }
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> { fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {