Compute the words prefixes at the end of an update
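
The words prefixes are now recomputed automatically at the end of a document indexing update. The old WordsPrefixes update is split into WordsPrefixesFst, which only builds the words prefixes FST, and the new WordPrefixDocids and WordPrefixPairProximityDocids updates that derive the prefix databases from it. WordsLevelPositions additionally fills the new word-prefix-level-position-docids database, and the dedicated words-prefixes and words-level-positions HTTP routes are removed from the HTTP server.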

Kerollmops 2021-03-25 11:10:12 +01:00 committed by many
parent ab92c814c3
commit e65bad16cc
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
14 changed files with 409 additions and 323 deletions

View File

@ -228,8 +228,6 @@ enum UpdateMeta {
ClearDocuments,
Settings(Settings),
Facets(Facets),
WordsPrefixes(WordsPrefixes),
WordsLevelPositions(WordsLevelPositions),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -290,14 +288,6 @@ struct WordsLevelPositions {
min_level_size: Option<NonZeroU32>,
}
// Any value that is present is considered Some value, including null.
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
where T: Deserialize<'de>,
D: Deserializer<'de>
{
Deserialize::deserialize(deserializer).map(Some)
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let opt = Opt::from_args();
@ -496,36 +486,6 @@ async fn main() -> anyhow::Result<()> {
Err(e) => Err(e)
}
}
UpdateMeta::WordsPrefixes(settings) => {
// We must use the write transaction of the update here.
let mut wtxn = index_cloned.write_txn()?;
let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned);
if let Some(value) = settings.threshold {
builder.threshold(value);
}
if let Some(value) = settings.max_prefix_length {
builder.max_prefix_length(value);
}
match builder.execute() {
Ok(()) => wtxn.commit().map_err(Into::into),
Err(e) => Err(e)
}
},
UpdateMeta::WordsLevelPositions(levels) => {
// We must use the write transaction of the update here.
let mut wtxn = index_cloned.write_txn()?;
let mut builder = update_builder.words_level_positions(&mut wtxn, &index_cloned);
if let Some(value) = levels.level_group_size {
builder.level_group_size(value);
}
if let Some(value) = levels.min_level_size {
builder.min_level_size(value);
}
match builder.execute() {
Ok(()) => wtxn.commit().map_err(Into::into),
Err(e) => Err(e.into())
}
}
};
let meta = match result {
@ -942,32 +902,6 @@ async fn main() -> anyhow::Result<()> {
warp::reply()
});
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let change_words_prefixes_route = warp::filters::method::post()
.and(warp::path!("words-prefixes"))
.and(warp::body::json())
.map(move |settings: WordsPrefixes| {
let meta = UpdateMeta::WordsPrefixes(settings);
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
eprintln!("update {} registered", update_id);
warp::reply()
});
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let change_words_level_positions_route = warp::filters::method::post()
.and(warp::path!("words-level-positions"))
.and(warp::body::json())
.map(move |levels: WordsLevelPositions| {
let meta = UpdateMeta::WordsLevelPositions(levels);
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
eprintln!("update {} registered", update_id);
warp::reply()
});
let update_store_cloned = update_store.clone();
let update_status_sender_cloned = update_status_sender.clone();
let abort_update_id_route = warp::filters::method::delete()
@ -1042,8 +976,6 @@ async fn main() -> anyhow::Result<()> {
.or(clearing_route)
.or(change_settings_route)
.or(change_facet_levels_route)
.or(change_words_prefixes_route)
.or(change_words_level_positions_route)
.or(update_ws_route);
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;

View File

@ -338,6 +338,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
facet_field_id_value_docids,
field_id_docid_facet_values: _,
documents,
..
} = index;
let main_name = "main";

View File

@ -54,6 +54,8 @@ pub struct Index {
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
/// Maps the word, level and position range with the docids that correspond to it.
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the level positions of a word prefix with all the docids where this prefix appears.
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
/// Maps the facet field id and the globally ordered value with the docids that correspond to it.
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
/// Maps the document id, the facet field id and the globally ordered value.
@ -64,7 +66,7 @@ pub struct Index {
impl Index {
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
options.max_dbs(10);
options.max_dbs(11);
let env = options.open(path)?;
let main = env.create_poly_database(Some("main"))?;
@ -74,6 +76,7 @@ impl Index {
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
let documents = env.create_database(Some("documents"))?;
@ -98,6 +101,7 @@ impl Index {
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,
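
A minimal read-side sketch of the new database, assuming an open Index, a read transaction, and that the crate is consumed as milli; the key layout (prefix, level, left, right) follows the StrLevelPositionCodec declared above, and dump_prefix_level_positions is a hypothetical helper:

use milli::{Index, TreeLevel};

// Print, for every word prefix at level 0, its position range and the
// number of documents containing that prefix at those positions.
fn dump_prefix_level_positions(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
    for result in index.word_prefix_level_position_docids.iter(rtxn)? {
        let ((prefix, level, left, right), docids) = result?;
        if level == TreeLevel::min_value() {
            println!("{} [{}..={}] -> {} docids", prefix, left, right, docids.len());
        }
    }
    Ok(())
}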

View File

@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,
@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
word_pair_proximity_docids.clear(self.wtxn)?;
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
word_level_position_docids.clear(self.wtxn)?;
word_prefix_level_position_docids.clear(self.wtxn)?;
facet_field_id_value_docids.clear(self.wtxn)?;
field_id_docid_facet_values.clear(self.wtxn)?;
documents.clear(self.wtxn)?;

View File

@ -89,6 +89,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
facet_field_id_value_docids,
field_id_docid_facet_values,
documents,
@ -345,6 +346,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the document ids that are under the word prefix level position docids.
let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
docids.difference_with(&self.documents_ids);
if docids.is_empty() {
iter.del_current()?;
} else if docids.len() != previous_len {
iter.put_current(bytes, &docids)?;
}
}
drop(iter);
Ok(self.documents_ids.len())
}
}

View File

@ -52,6 +52,10 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -
cbo_roaring_bitmap_merge(values)
}
pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
cbo_roaring_bitmap_merge(values)
}
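
These merge functions all share the same union semantics; here is a rough sketch of what such a CBO roaring bitmap merge amounts to, assuming the CboRoaringBitmapCodec from this crate (the real cbo_roaring_bitmap_merge helper may differ in details):

use std::borrow::Cow;
use anyhow::Context;
use heed::{BytesDecode, BytesEncode};
use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec;

// Union all the serialized bitmaps that were produced for the same key.
fn union_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
    let mut union = RoaringBitmap::new();
    for value in values {
        let bitmap = CboRoaringBitmapCodec::bytes_decode(value)
            .context("invalid CBO roaring bitmap")?;
        union.union_with(&bitmap);
    }
    let bytes = CboRoaringBitmapCodec::bytes_encode(&union)
        .context("cannot encode CBO roaring bitmap")?;
    Ok(bytes.into_owned())
}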

View File

@ -3,6 +3,7 @@ use std::collections::HashSet;
use std::fs::File;
use std::io::{self, Seek, SeekFrom};
use std::num::{NonZeroU32, NonZeroUsize};
use std::str;
use std::sync::mpsc::sync_channel;
use std::time::Instant;
@ -13,18 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy
use heed::types::ByteSlice;
use log::{debug, info, error};
use memmap::Mmap;
use rayon::ThreadPool;
use rayon::prelude::*;
use rayon::ThreadPool;
use serde::{Serialize, Deserialize};
use crate::index::Index;
use crate::update::{Facets, WordsLevelPositions, WordsPrefixes, UpdateIndexingStep};
use crate::update::{
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
WordPrefixPairProximityDocids,
};
use self::store::{Store, Readers};
pub use self::merge_function::{
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
docid_word_positions_merge, documents_merge,
word_level_position_docids_merge, facet_field_value_docids_merge,
field_id_docid_facet_values_merge,
word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
};
pub use self::transform::{Transform, TransformOutput};
@ -719,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
builder.execute()?;
// Run the words prefixes update operation.
let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id);
if let Some(value) = self.words_prefix_threshold {
builder.threshold(value);
}
@ -731,8 +732,26 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
}
builder.execute()?;
// Run the word prefix docids update operation.
let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
builder.max_nb_chunks = self.max_nb_chunks;
builder.max_memory = self.max_memory;
builder.execute()?;
// Run the word prefix pair proximity docids update operation.
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
builder.max_nb_chunks = self.max_nb_chunks;
builder.max_memory = self.max_memory;
builder.execute()?;
// Run the words level positions update operation.
let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id);
let mut builder = WordsLevelPositions::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
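
With this change, every document indexing update ends by rebuilding the prefix datastructures in a fixed order: the words prefixes FST first, then the word prefix docids, the word prefix pair proximity docids, and finally the words level positions. The FST builder no longer takes chunk settings, while the database builders inherit the compression and fusing parameters of the indexing step.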

View File

@ -6,8 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc
pub use self::settings::{Setting, Settings};
pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep;
pub use self::word_prefix_docids::WordPrefixDocids;
pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids;
pub use self::words_level_positions::WordsLevelPositions;
pub use self::words_prefixes::WordsPrefixes;
pub use self::words_prefixes_fst::WordsPrefixesFst;
mod available_documents_ids;
mod clear_documents;
@ -17,6 +19,7 @@ mod index_documents;
mod settings;
mod update_builder;
mod update_step;
mod word_prefix_docids;
mod word_prefix_pair_proximity_docids;
mod words_level_positions;
mod words_prefixes;
mod words_prefixes_fst;

View File

@ -2,10 +2,7 @@ use grenad::CompressionType;
use rayon::ThreadPool;
use crate::Index;
use super::{
ClearDocuments, DeleteDocuments, IndexDocuments, Settings,
Facets, WordsPrefixes, WordsLevelPositions,
};
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
pub struct UpdateBuilder<'a> {
pub(crate) log_every_n: Option<usize>,
@ -138,34 +135,4 @@ impl<'a> UpdateBuilder<'a> {
builder
}
pub fn words_prefixes<'t, 'u, 'i>(
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordsPrefixes<'t, 'u, 'i>
{
let mut builder = WordsPrefixes::new(wtxn, index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
builder
}
pub fn words_level_positions<'t, 'u, 'i>(
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordsLevelPositions<'t, 'u, 'i>
{
let mut builder = WordsLevelPositions::new(wtxn, index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
builder
}
}

View File

@ -0,0 +1,75 @@
use std::str;
use crate::Index;
use fst::Streamer;
use grenad::CompressionType;
use heed::types::ByteSlice;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database};
pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>,
}
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> {
WordPrefixDocids {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
max_nb_chunks: None,
max_memory: None,
}
}
pub fn execute(self) -> anyhow::Result<()> {
// Clear the word prefix docids database.
self.index.word_prefix_docids.clear(self.wtxn)?;
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
// It is forbidden to keep a mutable reference into the database
// and write into it at the same time, so we write into an intermediate sorter instead.
let mut prefix_docids_sorter = create_sorter(
word_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
// We iterate over all the prefixes and retrieve the corresponding docids.
let mut prefix_stream = prefix_fst.stream();
while let Some(bytes) = prefix_stream.next() {
let prefix = str::from_utf8(bytes)?;
let db = self.index.word_docids.remap_data_type::<ByteSlice>();
for result in db.prefix_iter(self.wtxn, prefix)? {
let (_word, data) = result?;
prefix_docids_sorter.insert(prefix, data)?;
}
}
drop(prefix_fst);
// We finally write the word prefix docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_docids.as_polymorph(),
prefix_docids_sorter,
word_docids_merge,
WriteMethod::Append,
)?;
Ok(())
}
}
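
A minimal usage sketch, assuming the crate is consumed as milli and that a words prefixes FST has already been stored in the index; IndexDocuments now drives this builder automatically, and rebuild_word_prefix_docids is a hypothetical helper:

use milli::Index;
use milli::update::WordPrefixDocids;

// Recompute the word prefix docids database from the current prefix FST.
fn rebuild_word_prefix_docids(index: &Index) -> anyhow::Result<()> {
    let mut wtxn = index.write_txn()?;
    let builder = WordPrefixDocids::new(&mut wtxn, index);
    builder.execute()?;
    wtxn.commit()?;
    Ok(())
}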

View File

@ -0,0 +1,89 @@
use std::str;
use fst::automaton::{Automaton, Str};
use fst::{Streamer, IntoStreamer};
use grenad::CompressionType;
use heed::BytesEncode;
use heed::types::ByteSlice;
use log::debug;
use crate::Index;
use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::{
WriteMethod, create_sorter, sorter_into_lmdb_database,
words_pairs_proximities_docids_merge,
};
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>,
}
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordPrefixPairProximityDocids<'t, 'u, 'i>
{
WordPrefixPairProximityDocids {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
max_nb_chunks: None,
max_memory: None,
}
}
pub fn execute(self) -> anyhow::Result<()> {
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
// We create a sorter that will merge the docids of identical word-prefix pair keys.
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
words_pairs_proximities_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
// We insert all the word pairs corresponding to the word-prefix pairs
// where the prefix appears in the prefix FST previously constructed.
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
for result in db.iter(self.wtxn)? {
let ((word1, word2, prox), data) = result?;
let automaton = Str::new(word2).starts_with();
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
while let Some(prefix) = matching_prefixes.next() {
let prefix = str::from_utf8(prefix)?;
let pair = (word1, prefix, prox);
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
}
}
drop(prefix_fst);
// We finally write the word prefix pair proximity docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
word_prefix_pair_proximity_docids_sorter,
words_pairs_proximities_docids_merge,
WriteMethod::Append,
)?;
Ok(())
}
}
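
As a worked example: given an entry ("hello", "world", 1) in the word pair proximity database, and the prefixes "w", "wo", "wor" and "worl" present in the FST, this loop emits the keys ("hello", "w", 1) through ("hello", "worl", 1), and the sorter unions the docids of every pair that collapses onto the same word-prefix key.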

View File

@ -1,17 +1,22 @@
use std::cmp;
use std::{cmp, str};
use std::convert::TryFrom;
use std::fs::File;
use std::num::NonZeroU32;
use fst::automaton::{self, Automaton};
use fst::{Streamer, IntoStreamer};
use grenad::{CompressionType, Reader, Writer, FileFuse};
use heed::types::{DecodeIgnore, Str};
use heed::types::{ByteSlice, DecodeIgnore, Str};
use heed::{BytesEncode, Error};
use log::debug;
use roaring::RoaringBitmap;
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
use crate::update::index_documents::{
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
word_prefix_level_positions_docids_merge, sorter_into_lmdb_database
};
use crate::{Index, TreeLevel};
pub struct WordsLevelPositions<'t, 'u, 'i> {
@ -20,27 +25,24 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>,
level_group_size: NonZeroU32,
min_level_size: NonZeroU32,
_update_id: u64,
}
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> WordsLevelPositions<'t, 'u, 'i>
{
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> {
WordsLevelPositions {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
max_nb_chunks: None,
max_memory: None,
level_group_size: NonZeroU32::new(4).unwrap(),
min_level_size: NonZeroU32::new(5).unwrap(),
_update_id: update_id,
}
}
@ -76,7 +78,71 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
self.wtxn,
*self.index.word_level_position_docids.as_polymorph(),
entries,
|_, _| anyhow::bail!("invalid facet level merging"),
|_, _| anyhow::bail!("invalid word level position merging"),
WriteMethod::Append,
)?;
// We compute the word prefix level positions database.
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
let mut word_prefix_level_positions_docids_sorter = create_sorter(
word_prefix_level_positions_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
// We insert the word prefix level positions where the level is equal to 0
// and the prefix appears in the prefix FST previously constructed.
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>();
for result in db.iter(self.wtxn)? {
let ((word, level, left, right), data) = result?;
if level == TreeLevel::min_value() {
let automaton = automaton::Str::new(word).starts_with();
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
while let Some(prefix) = matching_prefixes.next() {
let prefix = str::from_utf8(prefix)?;
let key = (prefix, level, left, right);
let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap();
word_prefix_level_positions_docids_sorter.insert(bytes, data)?;
}
}
}
// We finally write all the word prefix level positions docids with
// a level equal to 0 into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_level_position_docids.as_polymorph(),
word_prefix_level_positions_docids_sorter,
word_prefix_level_positions_docids_merge,
WriteMethod::Append,
)?;
let entries = compute_positions_levels(
self.wtxn,
self.index.word_prefix_docids.remap_data_type::<DecodeIgnore>(),
self.index.word_prefix_level_position_docids,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.level_group_size,
self.min_level_size,
)?;
// The previously computed entries also define the level 0 entries,
// so we can clear the database and append all of these entries.
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
write_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_level_position_docids.as_polymorph(),
entries,
|_, _| anyhow::bail!("invalid word prefix level position merging"),
WriteMethod::Append,
)?;
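
Note that the level 0 prefix entries written through the sorter above are immediately read back by compute_positions_levels, which regenerates every level, level 0 included; this is why the database can be cleared a second time and the computed entries appended wholesale.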

View File

@ -1,196 +0,0 @@
use std::iter::FromIterator;
use std::str;
use chrono::Utc;
use fst::automaton::Str;
use fst::{Automaton, Streamer, IntoStreamer};
use grenad::CompressionType;
use heed::BytesEncode;
use heed::types::ByteSlice;
use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database};
use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge};
use crate::{Index, SmallString32};
pub struct WordsPrefixes<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) chunk_fusing_shrink_size: Option<u64>,
pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>,
threshold: f64,
max_prefix_length: usize,
_update_id: u64,
}
impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> WordsPrefixes<'t, 'u, 'i>
{
WordsPrefixes {
wtxn,
index,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
chunk_fusing_shrink_size: None,
max_nb_chunks: None,
max_memory: None,
threshold: 0.1 / 100.0, // 0.1%
max_prefix_length: 4,
_update_id: update_id,
}
}
/// Set the ratio of words a prefix must cover to be part of the words prefixes
/// database. If a word prefix matches more than this ratio of the words in the
/// dictionary, the prefix is added to the words prefixes data structures.
///
/// Default value is `0.001`, i.e. `0.1%`. This value must be between 0 and 1 and will be clamped
/// to these bounds otherwise.
pub fn threshold(&mut self, value: f64) -> &mut Self {
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
self
}
/// Set the maximum length of prefixes in bytes.
///
/// Default value is `4` bytes. This value must be between 1 and 25 and will be clamped
/// to these bounds otherwise.
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
self
}
pub fn execute(self) -> anyhow::Result<()> {
self.index.set_updated_at(self.wtxn, &Utc::now())?;
// Clear the words prefixes datastructures.
self.index.word_prefix_docids.clear(self.wtxn)?;
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
let words_fst = self.index.words_fst(&self.wtxn)?;
let number_of_words = words_fst.len();
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
// It is forbidden to keep a mutable reference into the database
// and write into it at the same time, so we write into an intermediate sorter instead.
let mut prefix_docids_sorter = create_sorter(
word_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
for n in 1..=self.max_prefix_length {
let mut current_prefix = SmallString32::new();
let mut current_prefix_count = 0;
let mut builder = fst::SetBuilder::memory();
let mut stream = words_fst.stream();
while let Some(bytes) = stream.next() {
// We try to get the first n bytes out of this string but we only want
// to split at valid character bounds. If we would split in the middle of
// a character, we ignore this word and go to the next one.
let word = str::from_utf8(bytes)?;
let prefix = match word.get(..n) {
Some(prefix) => prefix,
None => continue,
};
// This is the first iteration of the loop,
// or the current word doesn't start with the current prefix.
if current_prefix_count == 0 || prefix != current_prefix.as_str() {
current_prefix = SmallString32::from(prefix);
current_prefix_count = 0;
}
current_prefix_count += 1;
// There are enough words corresponding to this prefix to add it to the cache.
if current_prefix_count == min_number_of_words {
builder.insert(prefix)?;
}
}
// We construct the final set for prefixes of size n.
prefix_fsts.push(builder.into_set());
}
// We merge all of the previously computed prefixes into one final set.
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
let mut builder = fst::SetBuilder::memory();
builder.extend_stream(op.r#union())?;
let prefix_fst = builder.into_set();
// We iterate over all the prefixes and retrieve the corresponding docids.
let mut prefix_stream = prefix_fst.stream();
while let Some(bytes) = prefix_stream.next() {
let prefix = str::from_utf8(bytes)?;
let db = self.index.word_docids.remap_data_type::<ByteSlice>();
for result in db.prefix_iter(self.wtxn, prefix)? {
let (_word, data) = result?;
prefix_docids_sorter.insert(prefix, data)?;
}
}
// Set the words prefixes FST in the database.
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
// We finally write the word prefix docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_docids.as_polymorph(),
prefix_docids_sorter,
word_docids_merge,
WriteMethod::Append,
)?;
// We compute the word prefix pair proximity database.
// Here we create a sorter akin to the previous one.
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
words_pairs_proximities_docids_merge,
self.chunk_compression_type,
self.chunk_compression_level,
self.chunk_fusing_shrink_size,
self.max_nb_chunks,
self.max_memory,
);
// We insert all the word pairs corresponding to the word-prefix pairs
// where the prefix appears in the prefix FST previously constructed.
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
for result in db.iter(self.wtxn)? {
let ((word1, word2, prox), data) = result?;
let automaton = Str::new(word2).starts_with();
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
while let Some(prefix) = matching_prefixes.next() {
let prefix = str::from_utf8(prefix)?;
let pair = (word1, prefix, prox);
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
}
}
// We finally write the word prefix pair proximity docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
word_prefix_pair_proximity_docids_sorter,
words_pairs_proximities_docids_merge,
WriteMethod::Append,
)?;
Ok(())
}
}

View File

@ -0,0 +1,104 @@
use std::iter::FromIterator;
use std::str;
use fst::Streamer;
use crate::{Index, SmallString32};
pub struct WordsPrefixesFst<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
threshold: f64,
max_prefix_length: usize,
_update_id: u64,
}
impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> WordsPrefixesFst<'t, 'u, 'i>
{
WordsPrefixesFst {
wtxn,
index,
threshold: 0.1 / 100.0, // 0.1%
max_prefix_length: 4,
_update_id: update_id,
}
}
/// Set the ratio of words a prefix must cover to be part of the words prefixes
/// database. If a word prefix matches more than this ratio of the words in the
/// dictionary, the prefix is added to the words prefixes data structures.
///
/// Default value is `0.001`, i.e. `0.1%`. This value must be between 0 and 1 and will be clamped
/// to these bounds otherwise.
pub fn threshold(&mut self, value: f64) -> &mut Self {
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
self
}
/// Set the maximum length of prefixes in bytes.
///
/// Default value is `4` bytes. This value must be between 1 and 25 and will be clamped
/// to these bounds otherwise.
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
self
}
pub fn execute(self) -> anyhow::Result<()> {
let words_fst = self.index.words_fst(&self.wtxn)?;
let number_of_words = words_fst.len();
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
for n in 1..=self.max_prefix_length {
let mut current_prefix = SmallString32::new();
let mut current_prefix_count = 0;
let mut builder = fst::SetBuilder::memory();
let mut stream = words_fst.stream();
while let Some(bytes) = stream.next() {
// We try to get the first n bytes out of this string but we only want
// to split at valid character bounds. If we would split in the middle of
// a character, we ignore this word and go to the next one.
let word = str::from_utf8(bytes)?;
let prefix = match word.get(..n) {
Some(prefix) => prefix,
None => continue,
};
// This is the first iteration of the loop,
// or the current word doesn't start with the current prefix.
if current_prefix_count == 0 || prefix != current_prefix.as_str() {
current_prefix = SmallString32::from(prefix);
current_prefix_count = 0;
}
current_prefix_count += 1;
// There are enough words corresponding to this prefix to add it to the cache.
if current_prefix_count == min_number_of_words {
builder.insert(prefix)?;
}
}
// We construct the final set for prefixes of size n.
prefix_fsts.push(builder.into_set());
}
// We merge all of the previously computed prefixes into one final set.
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
let mut builder = fst::SetBuilder::memory();
builder.extend_stream(op.r#union())?;
let prefix_fst = builder.into_set();
// Set the words prefixes FST in the database.
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
Ok(())
}
}
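
A minimal usage sketch, assuming the crate is consumed as milli; rebuild_prefix_fst is a hypothetical helper. With the default threshold of 0.001, a dictionary of 1,000,000 words requires a prefix to cover at least 1,000 of them to enter the FST:

use milli::Index;
use milli::update::WordsPrefixesFst;

// Rebuild only the words prefixes FST, without touching the prefix databases.
fn rebuild_prefix_fst(index: &Index, update_id: u64) -> anyhow::Result<()> {
    let mut wtxn = index.write_txn()?;
    let mut builder = WordsPrefixesFst::new(&mut wtxn, index, update_id);
    builder.threshold(0.01);      // a prefix must cover 1% of the words
    builder.max_prefix_length(2); // only keep prefixes of at most 2 bytes
    builder.execute()?;
    wtxn.commit()?;
    Ok(())
}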