436: Speed up the word prefix databases computation time r=Kerollmops a=Kerollmops

This PR depends on the fixes done in #431 and must be merged after it.

In this PR we bring the `WordPrefixPairProximityDocids`, `WordPrefixDocids`, and `WordPrefixPositionDocids` update structures into a new era, a better era, where computing the word prefix pair proximities costs far fewer CPU cycles, an era where these update structures can reuse the previously computed set of new word docids from the newly indexed batch of documents.

---

The `WordPrefixPairProximityDocids` is an update structure, which means it is an object that we feed with some parameters and that modifies the LMDB database of an index when asked to. This structure specifically computes the list of word prefix pair proximities, i.e. pairs made of a word and a prefix (e.g. `s`, `se`, `a`) associated with a proximity, the distance between the two words. Each word prefix pair proximity is associated with the list of ids of the documents that contain the word and the prefix at the given proximity.
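
To make the shape of this database concrete, here is a minimal in-memory sketch of the mapping it represents, with a `HashMap` standing in for the real LMDB database and its key codec (the layout shown here is illustrative, not the actual on-disk encoding):

```rust
use std::collections::HashMap;

use roaring::RoaringBitmap;

fn main() {
    // (word, prefix, proximity) -> ids of the documents containing the pair.
    let mut db: HashMap<(String, String, u8), RoaringBitmap> = HashMap::new();

    // Document 42 contains "nice house": "h" and "ho" are prefixes of "house",
    // and the two words are at proximity 1.
    let mut docids = RoaringBitmap::new();
    docids.insert(42);
    db.insert(("nice".into(), "h".into(), 1), docids.clone());
    db.insert(("nice".into(), "ho".into(), 1), docids);
}
```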

The origin of the performance issue this structure brings is that it starts its job from scratch: it clears the LMDB database before rewriting everything, using the other LMDB databases to do so. I hope you understand that this is absolutely not an optimized way of doing things.
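
For contrast, here is a rough sketch of the incremental strategy this PR moves towards, again with a `HashMap` standing in for LMDB; the function and parameter names are hypothetical and only illustrate the idea of merging the batch's new entries rather than rebuilding everything:

```rust
use std::collections::{HashMap, HashSet};

use roaring::RoaringBitmap;

// (word, prefix, proximity), as in the sketch above.
type Key = (String, String, u8);

/// Merge the entries produced while indexing one batch into the existing
/// database instead of clearing the database and recomputing everything.
fn incremental_update(
    db: &mut HashMap<Key, RoaringBitmap>,
    new_entries: impl Iterator<Item = (Key, RoaringBitmap)>,
    del_prefixes: &HashSet<String>,
) {
    // Drop only the entries whose prefix is no longer part of the prefix set.
    db.retain(|(_, prefix, _), _| !del_prefixes.contains(prefix));

    // Union the docids computed from the new batch into the existing entries.
    for (key, docids) in new_entries {
        *db.entry(key).or_default() |= docids;
    }
}
```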

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
Commit 25123af3b8, authored by bors[bot] on 2022-02-16 15:41:14 +00:00, committed by GitHub.
21 changed files with 572 additions and 259 deletions

@@ -12,15 +12,18 @@ use crossbeam_channel::{Receiver, Sender};
 use log::debug;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
+use slice_group_by::GroupBy;
 use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
 
 pub use self::helpers::{
-    create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
-    sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, MergeFn,
+    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
+    fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
 use crate::documents::DocumentBatchReader;
+pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
@@ -57,12 +60,6 @@ impl Default for IndexDocumentsMethod {
     }
 }
 
-#[derive(Debug, Copy, Clone)]
-pub enum WriteMethod {
-    Append,
-    GetMergePut,
-}
-
 pub struct IndexDocuments<'t, 'u, 'i, 'a, F> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
     index: &'i Index,
@@ -282,6 +279,9 @@ where
         let index_documents_ids = self.index.documents_ids(self.wtxn)?;
         let index_is_empty = index_documents_ids.len() == 0;
         let mut final_documents_ids = RoaringBitmap::new();
+        let mut word_pair_proximity_docids = Vec::new();
+        let mut word_position_docids = Vec::new();
+        let mut word_docids = Vec::new();
 
         let mut databases_seen = 0;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@@ -289,9 +289,28 @@
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });
 
-        for typed_chunk in lmdb_writer_rx {
+        for result in lmdb_writer_rx {
+            let typed_chunk = match result? {
+                TypedChunk::WordDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_docids.push(cloneable_chunk);
+                    TypedChunk::WordDocids(chunk)
+                }
+                TypedChunk::WordPairProximityDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_pair_proximity_docids.push(cloneable_chunk);
+                    TypedChunk::WordPairProximityDocids(chunk)
+                }
+                TypedChunk::WordPositionDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_position_docids.push(cloneable_chunk);
+                    TypedChunk::WordPositionDocids(chunk)
+                }
+                otherwise => otherwise,
+            };
+
             let (docids, is_merged_database) =
-                write_typed_chunk_into_index(typed_chunk?, &self.index, self.wtxn, index_is_empty)?;
+                write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?;
             if !docids.is_empty() {
                 final_documents_ids |= docids;
                 let documents_seen_count = final_documents_ids.len();
@@ -325,13 +344,25 @@ where
         let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids;
         self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
 
-        self.execute_prefix_databases()?;
+        self.execute_prefix_databases(
+            word_docids,
+            word_pair_proximity_docids,
+            word_position_docids,
+        )?;
 
         Ok(all_documents_ids.len())
     }
 
     #[logging_timer::time("IndexDocuments::{}")]
-    pub fn execute_prefix_databases(self) -> Result<()> {
+    pub fn execute_prefix_databases(
+        self,
+        word_docids: Vec<grenad::Reader<CursorClonableMmap>>,
+        word_pair_proximity_docids: Vec<grenad::Reader<CursorClonableMmap>>,
+        word_position_docids: Vec<grenad::Reader<CursorClonableMmap>>,
+    ) -> Result<()>
+    where
+        F: Fn(UpdateIndexingStep) + Sync,
+    {
         // Merged databases are already been indexed, we start from this count;
         let mut databases_seen = MERGED_DATABASE_COUNT;
@@ -353,6 +384,9 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
        });
 
+        let previous_words_prefixes_fst =
+            self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?;
+
         // Run the words prefixes update operation.
         let mut builder = WordsPrefixesFst::new(self.wtxn, self.index);
         if let Some(value) = self.config.words_prefix_threshold {
@@ -363,6 +397,27 @@
         }
         builder.execute()?;
 
+        let current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
+
+        // We retrieve the common words between the previous and new prefix word fst.
+        let common_prefix_fst_words = fst_stream_into_vec(
+            previous_words_prefixes_fst.op().add(&current_prefix_fst).intersection(),
+        );
+        let common_prefix_fst_words: Vec<_> = common_prefix_fst_words
+            .as_slice()
+            .linear_group_by_key(|x| x.chars().nth(0).unwrap())
+            .collect();
+
+        // We retrieve the newly added words between the previous and new prefix word fst.
+        let new_prefix_fst_words = fst_stream_into_vec(
+            current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(),
+        );
+
+        // We compute the set of prefixes that are no more part of the prefix fst.
+        let del_prefix_fst_words = fst_stream_into_hashset(
+            previous_words_prefixes_fst.op().add(&current_prefix_fst).difference(),
+        );
+
         databases_seen += 1;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
             databases_seen,
@@ -375,7 +430,12 @@ where
         builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
         builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
         builder.max_memory = self.indexer_config.max_memory;
-        builder.execute()?;
+        builder.execute(
+            word_docids,
+            &new_prefix_fst_words,
+            &common_prefix_fst_words,
+            &del_prefix_fst_words,
+        )?;
 
         databases_seen += 1;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@@ -389,7 +449,12 @@ where
         builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
         builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
         builder.max_memory = self.indexer_config.max_memory;
-        builder.execute()?;
+        builder.execute(
+            word_pair_proximity_docids,
+            &new_prefix_fst_words,
+            &common_prefix_fst_words,
+            &del_prefix_fst_words,
+        )?;
 
         databases_seen += 1;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@@ -409,7 +474,12 @@ where
         if let Some(value) = self.config.words_positions_min_level_size {
             builder.min_level_size(value);
         }
-        builder.execute()?;
+        builder.execute(
+            word_position_docids,
+            &new_prefix_fst_words,
+            &common_prefix_fst_words,
+            &del_prefix_fst_words,
+        )?;
 
         databases_seen += 1;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
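
For readers unfamiliar with `fst`, the prefix-delta computation in `execute_prefix_databases` above boils down to three set operations between the previous and current prefix fsts. Here is a minimal standalone sketch with made-up prefix sets; the three printed categories mirror `common_prefix_fst_words`, `new_prefix_fst_words`, and `del_prefix_fst_words`:

```rust
use fst::{Set, Streamer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical prefix fsts before and after the update (inputs must be sorted).
    let previous = Set::from_iter(["a", "ab", "se"])?;
    let current = Set::from_iter(["a", "se", "so"])?;

    // Prefixes present in both fsts: their postings only need the new entries merged in.
    let mut common = previous.op().add(&current).intersection();
    while let Some(prefix) = common.next() {
        println!("common: {}", std::str::from_utf8(prefix)?);
    }

    // Prefixes that only exist in the new fst: their postings are computed from scratch.
    let mut new = current.op().add(&previous).difference();
    while let Some(prefix) = new.next() {
        println!("new: {}", std::str::from_utf8(prefix)?);
    }

    // Prefixes that disappeared: their entries are deleted from the database.
    let mut deleted = previous.op().add(&current).difference();
    while let Some(prefix) = deleted.next() {
        println!("deleted: {}", std::str::from_utf8(prefix)?);
    }

    Ok(())
}
```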