mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
First iteration on exposing puffin profiling
This commit is contained in:
parent
9daccdf7f0
commit
eef95de30e
39
Cargo.lock
generated
39
Cargo.lock
generated
@ -1973,6 +1973,7 @@ dependencies = [
|
|||||||
"meilisearch-types",
|
"meilisearch-types",
|
||||||
"nelson",
|
"nelson",
|
||||||
"page_size 0.5.0",
|
"page_size 0.5.0",
|
||||||
|
"puffin",
|
||||||
"roaring",
|
"roaring",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@ -2498,6 +2499,12 @@ dependencies = [
|
|||||||
"syn 1.0.109",
|
"syn 1.0.109",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lz4_flex"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "manifest-dir-macros"
|
name = "manifest-dir-macros"
|
||||||
version = "0.1.17"
|
version = "0.1.17"
|
||||||
@ -2587,6 +2594,8 @@ dependencies = [
|
|||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"platform-dirs",
|
"platform-dirs",
|
||||||
"prometheus",
|
"prometheus",
|
||||||
|
"puffin",
|
||||||
|
"puffin_http",
|
||||||
"rand",
|
"rand",
|
||||||
"rayon",
|
"rayon",
|
||||||
"regex",
|
"regex",
|
||||||
@ -2731,6 +2740,7 @@ dependencies = [
|
|||||||
"obkv",
|
"obkv",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"ordered-float",
|
"ordered-float",
|
||||||
|
"puffin",
|
||||||
"rand",
|
"rand",
|
||||||
"rand_pcg",
|
"rand_pcg",
|
||||||
"rayon",
|
"rayon",
|
||||||
@ -3256,6 +3266,35 @@ version = "2.28.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
|
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "puffin"
|
||||||
|
version = "0.16.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "76425abd4e1a0ad4bd6995dd974b52f414fca9974171df8e3708b3e660d05a21"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"bincode",
|
||||||
|
"byteorder",
|
||||||
|
"cfg-if",
|
||||||
|
"instant",
|
||||||
|
"lz4_flex",
|
||||||
|
"once_cell",
|
||||||
|
"parking_lot",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "puffin_http"
|
||||||
|
version = "0.13.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "13bffc600c35913d282ae1e96a6ffcdf36dc7a7cdb9310e0ba15914d258c8193"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"crossbeam-channel",
|
||||||
|
"log",
|
||||||
|
"puffin",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quote"
|
name = "quote"
|
||||||
version = "1.0.28"
|
version = "1.0.28"
|
||||||
|
@ -22,6 +22,7 @@ log = "0.4.17"
|
|||||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||||
meilisearch-types = { path = "../meilisearch-types" }
|
meilisearch-types = { path = "../meilisearch-types" }
|
||||||
page_size = "0.5.0"
|
page_size = "0.5.0"
|
||||||
|
puffin = "0.16.0"
|
||||||
roaring = { version = "0.10.1", features = ["serde"] }
|
roaring = { version = "0.10.1", features = ["serde"] }
|
||||||
serde = { version = "1.0.160", features = ["derive"] }
|
serde = { version = "1.0.160", features = ["derive"] }
|
||||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||||
|
@ -471,6 +471,8 @@ impl IndexScheduler {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
self.maybe_fail(crate::tests::FailureLocation::InsideCreateBatch)?;
|
self.maybe_fail(crate::tests::FailureLocation::InsideCreateBatch)?;
|
||||||
|
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let enqueued = &self.get_status(rtxn, Status::Enqueued)?;
|
let enqueued = &self.get_status(rtxn, Status::Enqueued)?;
|
||||||
let to_cancel = self.get_kind(rtxn, Kind::TaskCancelation)? & enqueued;
|
let to_cancel = self.get_kind(rtxn, Kind::TaskCancelation)? & enqueued;
|
||||||
|
|
||||||
@ -575,6 +577,9 @@ impl IndexScheduler {
|
|||||||
self.maybe_fail(crate::tests::FailureLocation::PanicInsideProcessBatch)?;
|
self.maybe_fail(crate::tests::FailureLocation::PanicInsideProcessBatch)?;
|
||||||
self.breakpoint(crate::Breakpoint::InsideProcessBatch);
|
self.breakpoint(crate::Breakpoint::InsideProcessBatch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
puffin::profile_function!(format!("{:?}", batch));
|
||||||
|
|
||||||
match batch {
|
match batch {
|
||||||
Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => {
|
Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => {
|
||||||
// 1. Retrieve the tasks that matched the query at enqueue-time.
|
// 1. Retrieve the tasks that matched the query at enqueue-time.
|
||||||
@ -1111,6 +1116,8 @@ impl IndexScheduler {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
operation: IndexOperation,
|
operation: IndexOperation,
|
||||||
) -> Result<Vec<Task>> {
|
) -> Result<Vec<Task>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
match operation {
|
match operation {
|
||||||
IndexOperation::DocumentClear { mut tasks, .. } => {
|
IndexOperation::DocumentClear { mut tasks, .. } => {
|
||||||
let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?;
|
let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?;
|
||||||
|
@ -1032,6 +1032,8 @@ impl IndexScheduler {
|
|||||||
self.breakpoint(Breakpoint::Start);
|
self.breakpoint(Breakpoint::Start);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
puffin::GlobalProfiler::lock().new_frame();
|
||||||
|
|
||||||
self.cleanup_task_queue()?;
|
self.cleanup_task_queue()?;
|
||||||
|
|
||||||
let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
|
let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
|
||||||
|
@ -67,6 +67,8 @@ permissive-json-pointer = { path = "../permissive-json-pointer" }
|
|||||||
pin-project-lite = "0.2.9"
|
pin-project-lite = "0.2.9"
|
||||||
platform-dirs = "0.3.0"
|
platform-dirs = "0.3.0"
|
||||||
prometheus = { version = "0.13.3", features = ["process"] }
|
prometheus = { version = "0.13.3", features = ["process"] }
|
||||||
|
puffin = "0.16.0"
|
||||||
|
puffin_http = "0.13.0"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
rayon = "1.7.0"
|
rayon = "1.7.0"
|
||||||
regex = "1.7.3"
|
regex = "1.7.3"
|
||||||
|
@ -29,6 +29,9 @@ fn setup(opt: &Opt) -> anyhow::Result<()> {
|
|||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
let (opt, config_read_from) = Opt::try_build()?;
|
let (opt, config_read_from) = Opt::try_build()?;
|
||||||
|
|
||||||
|
puffin::set_scopes_on(true);
|
||||||
|
let _server = puffin_http::Server::new(&format!("0.0.0.0:{}", puffin_http::DEFAULT_PORT))?;
|
||||||
|
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
!(cfg!(windows) && opt.experimental_reduce_indexing_memory_usage),
|
!(cfg!(windows) && opt.experimental_reduce_indexing_memory_usage),
|
||||||
"The `experimental-reduce-indexing-memory-usage` flag is not supported on Windows"
|
"The `experimental-reduce-indexing-memory-usage` flag is not supported on Windows"
|
||||||
|
@ -67,6 +67,9 @@ filter-parser = { path = "../filter-parser" }
|
|||||||
# documents words self-join
|
# documents words self-join
|
||||||
itertools = "0.10.5"
|
itertools = "0.10.5"
|
||||||
|
|
||||||
|
# profiling
|
||||||
|
puffin = "0.16.0"
|
||||||
|
|
||||||
# logging
|
# logging
|
||||||
log = "0.4.17"
|
log = "0.4.17"
|
||||||
logging_timer = "1.1.0"
|
logging_timer = "1.1.0"
|
||||||
|
@ -15,6 +15,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn execute(self) -> Result<u64> {
|
pub fn execute(self) -> Result<u64> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
||||||
let Index {
|
let Index {
|
||||||
env: _env,
|
env: _env,
|
||||||
|
@ -110,6 +110,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
Some(docid)
|
Some(docid)
|
||||||
}
|
}
|
||||||
pub fn execute(self) -> Result<DocumentDeletionResult> {
|
pub fn execute(self) -> Result<DocumentDeletionResult> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } =
|
let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } =
|
||||||
self.execute_inner()?;
|
self.execute_inner()?;
|
||||||
|
|
||||||
|
@ -31,6 +31,8 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
|||||||
autogenerate_docids: bool,
|
autogenerate_docids: bool,
|
||||||
reader: DocumentsBatchReader<R>,
|
reader: DocumentsBatchReader<R>,
|
||||||
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
|
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||||
|
|
||||||
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
|
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
|
||||||
|
@ -30,6 +30,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
stop_words: Option<&fst::Set<&[u8]>>,
|
stop_words: Option<&fst::Set<&[u8]>>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
|
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_positions_per_attributes = max_positions_per_attributes
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
@ -20,6 +20,8 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
|||||||
docid_fid_facet_number: grenad::Reader<R>,
|
docid_fid_facet_number: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut facet_number_docids_sorter = create_sorter(
|
let mut facet_number_docids_sorter = create_sorter(
|
||||||
|
@ -18,6 +18,8 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
|||||||
docid_fid_facet_string: grenad::Reader<R>,
|
docid_fid_facet_string: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut facet_string_docids_sorter = create_sorter(
|
let mut facet_string_docids_sorter = create_sorter(
|
||||||
|
@ -34,6 +34,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
faceted_fields: &HashSet<FieldId>,
|
faceted_fields: &HashSet<FieldId>,
|
||||||
) -> Result<ExtractedFacetValues> {
|
) -> Result<ExtractedFacetValues> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||||
|
@ -22,6 +22,8 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
|||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut fid_word_count_docids_sorter = create_sorter(
|
let mut fid_word_count_docids_sorter = create_sorter(
|
||||||
|
@ -19,6 +19,8 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
|||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
(lat_fid, lng_fid): (FieldId, FieldId),
|
(lat_fid, lng_fid): (FieldId, FieldId),
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
|
@ -19,6 +19,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
vectors_fid: FieldId,
|
vectors_fid: FieldId,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
|
@ -27,6 +27,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
exact_attributes: &HashSet<FieldId>,
|
exact_attributes: &HashSet<FieldId>,
|
||||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
|
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_docids_sorter = create_sorter(
|
let mut word_docids_sorter = create_sorter(
|
||||||
|
@ -15,6 +15,8 @@ pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
|||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_fid_docids_sorter = create_sorter(
|
let mut word_fid_docids_sorter = create_sorter(
|
||||||
|
@ -21,6 +21,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_pair_proximity_docids_sorter = create_sorter(
|
let mut word_pair_proximity_docids_sorter = create_sorter(
|
||||||
|
@ -18,6 +18,8 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
|||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_position_docids_sorter = create_sorter(
|
let mut word_position_docids_sorter = create_sorter(
|
||||||
|
@ -52,6 +52,8 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
exact_attributes: HashSet<FieldId>,
|
exact_attributes: HashSet<FieldId>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
original_obkv_chunks
|
original_obkv_chunks
|
||||||
.par_bridge()
|
.par_bridge()
|
||||||
.map(|original_documents_chunk| {
|
.map(|original_documents_chunk| {
|
||||||
@ -238,11 +240,13 @@ fn spawn_extraction_task<FE, FS, M>(
|
|||||||
M::Output: Send,
|
M::Output: Send,
|
||||||
{
|
{
|
||||||
rayon::spawn(move || {
|
rayon::spawn(move || {
|
||||||
|
puffin::profile_scope!("extract_multiple_chunks", name);
|
||||||
let chunks: Result<M> =
|
let chunks: Result<M> =
|
||||||
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect();
|
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect();
|
||||||
rayon::spawn(move || match chunks {
|
rayon::spawn(move || match chunks {
|
||||||
Ok(chunks) => {
|
Ok(chunks) => {
|
||||||
debug!("merge {} database", name);
|
debug!("merge {} database", name);
|
||||||
|
puffin::profile_scope!("merge_multiple_chunks", name);
|
||||||
let reader = chunks.merge(merge_fn, &indexer);
|
let reader = chunks.merge(merge_fn, &indexer);
|
||||||
let _ = lmdb_writer_sx.send(reader.map(serialize_fn));
|
let _ = lmdb_writer_sx.send(reader.map(serialize_fn));
|
||||||
}
|
}
|
||||||
|
@ -214,6 +214,7 @@ pub fn sorter_into_lmdb_database(
|
|||||||
sorter: Sorter<MergeFn>,
|
sorter: Sorter<MergeFn>,
|
||||||
merge: MergeFn,
|
merge: MergeFn,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
debug!("Writing MTBL sorter...");
|
debug!("Writing MTBL sorter...");
|
||||||
let before = Instant::now();
|
let before = Instant::now();
|
||||||
|
|
||||||
|
@ -137,6 +137,8 @@ where
|
|||||||
mut self,
|
mut self,
|
||||||
reader: DocumentsBatchReader<R>,
|
reader: DocumentsBatchReader<R>,
|
||||||
) -> Result<(Self, StdResult<u64, UserError>)> {
|
) -> Result<(Self, StdResult<u64, UserError>)> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
// Early return when there is no document to add
|
// Early return when there is no document to add
|
||||||
if reader.is_empty() {
|
if reader.is_empty() {
|
||||||
return Ok((self, Ok(0)));
|
return Ok((self, Ok(0)));
|
||||||
@ -175,6 +177,8 @@ where
|
|||||||
mut self,
|
mut self,
|
||||||
to_delete: Vec<String>,
|
to_delete: Vec<String>,
|
||||||
) -> Result<(Self, StdResult<u64, UserError>)> {
|
) -> Result<(Self, StdResult<u64, UserError>)> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
// Early return when there is no document to add
|
// Early return when there is no document to add
|
||||||
if to_delete.is_empty() {
|
if to_delete.is_empty() {
|
||||||
return Ok((self, Ok(0)));
|
return Ok((self, Ok(0)));
|
||||||
@ -194,6 +198,8 @@ where
|
|||||||
|
|
||||||
#[logging_timer::time("IndexDocuments::{}")]
|
#[logging_timer::time("IndexDocuments::{}")]
|
||||||
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
|
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
if self.added_documents == 0 {
|
if self.added_documents == 0 {
|
||||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||||
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
|
||||||
@ -232,6 +238,8 @@ where
|
|||||||
FP: Fn(UpdateIndexingStep) + Sync,
|
FP: Fn(UpdateIndexingStep) + Sync,
|
||||||
FA: Fn() -> bool + Sync,
|
FA: Fn() -> bool + Sync,
|
||||||
{
|
{
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let TransformOutput {
|
let TransformOutput {
|
||||||
primary_key,
|
primary_key,
|
||||||
fields_ids_map,
|
fields_ids_map,
|
||||||
@ -322,6 +330,7 @@ where
|
|||||||
|
|
||||||
// Run extraction pipeline in parallel.
|
// Run extraction pipeline in parallel.
|
||||||
pool.install(|| {
|
pool.install(|| {
|
||||||
|
puffin::profile_scope!("extract_and_send_grenad_chunks");
|
||||||
// split obkv file into several chunks
|
// split obkv file into several chunks
|
||||||
let original_chunk_iter =
|
let original_chunk_iter =
|
||||||
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
||||||
@ -477,6 +486,8 @@ where
|
|||||||
FP: Fn(UpdateIndexingStep) + Sync,
|
FP: Fn(UpdateIndexingStep) + Sync,
|
||||||
FA: Fn() -> bool + Sync,
|
FA: Fn() -> bool + Sync,
|
||||||
{
|
{
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
// Merged databases are already been indexed, we start from this count;
|
// Merged databases are already been indexed, we start from this count;
|
||||||
let mut databases_seen = MERGED_DATABASE_COUNT;
|
let mut databases_seen = MERGED_DATABASE_COUNT;
|
||||||
|
|
||||||
@ -511,26 +522,36 @@ where
|
|||||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||||
}
|
}
|
||||||
|
|
||||||
let current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
let current_prefix_fst;
|
||||||
|
let common_prefix_fst_words_tmp;
|
||||||
|
let common_prefix_fst_words: Vec<_>;
|
||||||
|
let new_prefix_fst_words;
|
||||||
|
let del_prefix_fst_words;
|
||||||
|
|
||||||
// We retrieve the common words between the previous and new prefix word fst.
|
{
|
||||||
let common_prefix_fst_words = fst_stream_into_vec(
|
puffin::profile_scope!("compute_prefix_diffs");
|
||||||
previous_words_prefixes_fst.op().add(¤t_prefix_fst).intersection(),
|
|
||||||
);
|
|
||||||
let common_prefix_fst_words: Vec<_> = common_prefix_fst_words
|
|
||||||
.as_slice()
|
|
||||||
.linear_group_by_key(|x| x.chars().next().unwrap())
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
// We retrieve the newly added words between the previous and new prefix word fst.
|
current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||||
let new_prefix_fst_words = fst_stream_into_vec(
|
|
||||||
current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(),
|
|
||||||
);
|
|
||||||
|
|
||||||
// We compute the set of prefixes that are no more part of the prefix fst.
|
// We retrieve the common words between the previous and new prefix word fst.
|
||||||
let del_prefix_fst_words = fst_stream_into_hashset(
|
common_prefix_fst_words_tmp = fst_stream_into_vec(
|
||||||
previous_words_prefixes_fst.op().add(¤t_prefix_fst).difference(),
|
previous_words_prefixes_fst.op().add(¤t_prefix_fst).intersection(),
|
||||||
);
|
);
|
||||||
|
common_prefix_fst_words = common_prefix_fst_words_tmp
|
||||||
|
.as_slice()
|
||||||
|
.linear_group_by_key(|x| x.chars().next().unwrap())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// We retrieve the newly added words between the previous and new prefix word fst.
|
||||||
|
new_prefix_fst_words = fst_stream_into_vec(
|
||||||
|
current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(),
|
||||||
|
);
|
||||||
|
|
||||||
|
// We compute the set of prefixes that are no more part of the prefix fst.
|
||||||
|
del_prefix_fst_words = fst_stream_into_hashset(
|
||||||
|
previous_words_prefixes_fst.op().add(¤t_prefix_fst).difference(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
databases_seen += 1;
|
databases_seen += 1;
|
||||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||||
@ -668,6 +689,8 @@ fn execute_word_prefix_docids(
|
|||||||
common_prefix_fst_words: &[&[String]],
|
common_prefix_fst_words: &[&[String]],
|
||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let cursor = reader.into_cursor()?;
|
let cursor = reader.into_cursor()?;
|
||||||
let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
|
let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
|
||||||
builder.chunk_compression_type = indexer_config.chunk_compression_type;
|
builder.chunk_compression_type = indexer_config.chunk_compression_type;
|
||||||
|
@ -558,6 +558,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
where
|
where
|
||||||
F: Fn(UpdateIndexingStep) + Sync,
|
F: Fn(UpdateIndexingStep) + Sync,
|
||||||
{
|
{
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let primary_key = self
|
let primary_key = self
|
||||||
.index
|
.index
|
||||||
.primary_key(wtxn)?
|
.primary_key(wtxn)?
|
||||||
|
@ -49,6 +49,66 @@ pub(crate) enum TypedChunk {
|
|||||||
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl TypedChunk {
|
||||||
|
pub fn to_debug_string(&self) -> String {
|
||||||
|
match self {
|
||||||
|
TypedChunk::FieldIdDocidFacetStrings(grenad) => {
|
||||||
|
format!("FieldIdDocidFacetStrings {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::FieldIdDocidFacetNumbers(grenad) => {
|
||||||
|
format!("FieldIdDocidFacetNumbers {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::Documents(grenad) => {
|
||||||
|
format!("Documents {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::FieldIdWordcountDocids(grenad) => {
|
||||||
|
format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::NewDocumentsIds(grenad) => {
|
||||||
|
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!(
|
||||||
|
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}",
|
||||||
|
word_docids_reader.len(),
|
||||||
|
exact_word_docids_reader.len()
|
||||||
|
),
|
||||||
|
TypedChunk::WordPositionDocids(grenad) => {
|
||||||
|
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::WordFidDocids(grenad) => {
|
||||||
|
format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::WordPairProximityDocids(grenad) => {
|
||||||
|
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::FieldIdFacetStringDocids(grenad) => {
|
||||||
|
format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::FieldIdFacetNumberDocids(grenad) => {
|
||||||
|
format!("FieldIdFacetNumberDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::FieldIdFacetExistsDocids(grenad) => {
|
||||||
|
format!("FieldIdFacetExistsDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::FieldIdFacetIsNullDocids(grenad) => {
|
||||||
|
format!("FieldIdFacetIsNullDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::FieldIdFacetIsEmptyDocids(grenad) => {
|
||||||
|
format!("FieldIdFacetIsEmptyDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::GeoPoints(grenad) => {
|
||||||
|
format!("GeoPoints {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::VectorPoints(grenad) => {
|
||||||
|
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
TypedChunk::ScriptLanguageDocids(grenad) => {
|
||||||
|
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Write typed chunk in the corresponding LMDB database of the provided index.
|
/// Write typed chunk in the corresponding LMDB database of the provided index.
|
||||||
/// Return new documents seen.
|
/// Return new documents seen.
|
||||||
pub(crate) fn write_typed_chunk_into_index(
|
pub(crate) fn write_typed_chunk_into_index(
|
||||||
@ -57,6 +117,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index_is_empty: bool,
|
index_is_empty: bool,
|
||||||
) -> Result<(RoaringBitmap, bool)> {
|
) -> Result<(RoaringBitmap, bool)> {
|
||||||
|
puffin::profile_function!(typed_chunk.to_debug_string());
|
||||||
|
|
||||||
let mut is_merged_database = false;
|
let mut is_merged_database = false;
|
||||||
match typed_chunk {
|
match typed_chunk {
|
||||||
TypedChunk::Documents(obkv_documents_iter) => {
|
TypedChunk::Documents(obkv_documents_iter) => {
|
||||||
@ -336,6 +398,8 @@ where
|
|||||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
||||||
{
|
{
|
||||||
|
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||||
|
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let database = database.remap_types::<ByteSlice, ByteSlice>();
|
let database = database.remap_types::<ByteSlice, ByteSlice>();
|
||||||
|
|
||||||
@ -378,6 +442,8 @@ where
|
|||||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
||||||
{
|
{
|
||||||
|
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||||
|
|
||||||
if !index_is_empty {
|
if !index_is_empty {
|
||||||
return write_entries_into_database(
|
return write_entries_into_database(
|
||||||
data,
|
data,
|
||||||
|
@ -50,6 +50,8 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
|||||||
common_prefix_fst_words: &[&'a [String]],
|
common_prefix_fst_words: &[&'a [String]],
|
||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
index_word_prefix_database(
|
index_word_prefix_database(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
self.index.word_pair_proximity_docids,
|
self.index.word_pair_proximity_docids,
|
||||||
|
@ -27,6 +27,8 @@ pub fn index_prefix_word_database(
|
|||||||
chunk_compression_type: CompressionType,
|
chunk_compression_type: CompressionType,
|
||||||
chunk_compression_level: Option<u32>,
|
chunk_compression_level: Option<u32>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_proximity = max_proximity - 1;
|
let max_proximity = max_proximity - 1;
|
||||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||||
|
|
||||||
|
@ -191,6 +191,7 @@ pub fn index_word_prefix_database(
|
|||||||
chunk_compression_type: CompressionType,
|
chunk_compression_type: CompressionType,
|
||||||
chunk_compression_level: Option<u32>,
|
chunk_compression_level: Option<u32>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||||
|
|
||||||
// Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
|
// Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
|
||||||
|
@ -303,6 +303,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
FP: Fn(UpdateIndexingStep) + Sync,
|
FP: Fn(UpdateIndexingStep) + Sync,
|
||||||
FA: Fn() -> bool + Sync,
|
FA: Fn() -> bool + Sync,
|
||||||
{
|
{
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||||
// if the settings are set before any document update, we don't need to do anything, and
|
// if the settings are set before any document update, we don't need to do anything, and
|
||||||
// will set the primary key during the first document addition.
|
// will set the primary key during the first document addition.
|
||||||
|
@ -45,6 +45,8 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
|||||||
common_prefix_fst_words: &[&[String]],
|
common_prefix_fst_words: &[&[String]],
|
||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
// It is forbidden to keep a mutable reference into the database
|
// It is forbidden to keep a mutable reference into the database
|
||||||
// and write into it at the same time, therefore we write into another file.
|
// and write into it at the same time, therefore we write into another file.
|
||||||
let mut prefix_docids_sorter = create_sorter(
|
let mut prefix_docids_sorter = create_sorter(
|
||||||
|
@ -50,6 +50,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
|
|||||||
common_prefix_fst_words: &[&[String]],
|
common_prefix_fst_words: &[&[String]],
|
||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
debug!("Computing and writing the word levels integers docids into LMDB on disk...");
|
debug!("Computing and writing the word levels integers docids into LMDB on disk...");
|
||||||
|
|
||||||
let mut prefix_integer_docids_sorter = create_sorter(
|
let mut prefix_integer_docids_sorter = create_sorter(
|
||||||
|
@ -42,6 +42,8 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
|
|||||||
|
|
||||||
#[logging_timer::time("WordsPrefixesFst::{}")]
|
#[logging_timer::time("WordsPrefixesFst::{}")]
|
||||||
pub fn execute(self) -> Result<()> {
|
pub fn execute(self) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
let words_fst = self.index.words_fst(self.wtxn)?;
|
let words_fst = self.index.words_fst(self.wtxn)?;
|
||||||
|
|
||||||
let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length];
|
let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length];
|
||||||
|
Loading…
Reference in New Issue
Block a user