From b3a21d5a5003689fbb5e549695fe8df1b7fcb067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 3 Feb 2021 10:30:33 +0100 Subject: [PATCH 01/14] Introduce the getters and setters for the words prefixes FST --- infos/src/main.rs | 1 + milli/src/index.rs | 24 +++++++++++++++++++++++- milli/src/update/clear_documents.rs | 1 + milli/src/update/delete_documents.rs | 1 + 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index e874385e6..916b5ba50 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -311,6 +311,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho env: _env, main, word_docids, + word_prefix_docids, docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, diff --git a/milli/src/index.rs b/milli/src/index.rs index c0dd22986..5763f78ee 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -27,6 +27,7 @@ pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; +pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; #[derive(Clone)] pub struct Index { @@ -36,6 +37,8 @@ pub struct Index { pub main: PolyDatabase, /// A word and all the documents ids containing the word. pub word_docids: Database, + /// A prefix of word and all the documents ids containing this prefix. + pub word_prefix_docids: Database, /// Maps a word and a document id (u32) to all the positions where the given word appears. pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. 
@@ -50,11 +53,12 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(7); + options.max_dbs(8); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; let word_docids = env.create_database(Some("word-docids"))?; + let word_prefix_docids = env.create_database(Some("word-prefix-docids"))?; let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; @@ -65,6 +69,7 @@ impl Index { env, main, word_docids, + word_prefix_docids, docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, @@ -328,6 +333,23 @@ impl Index { } } + /* words prefixes fst */ + + /// Writes the FST which is the words prefixes dictionary of the engine. + pub fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) + } + + /// Returns the FST which is the words prefixes dictionary of the engine. + pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_PREFIXES_FST_KEY)? { + Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), + None => Ok(fst::Set::default().map_data(Cow::Owned)?), + } + } + + /* documents */ + /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
pub fn documents<'t>( &self, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index a84596901..6f0d457b7 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -22,6 +22,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + word_prefix_docids, docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 932589dd7..2efed359f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -79,6 +79,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + word_prefix_docids, docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, From 5e7b26791b7b8492c410fa9e938b019dd7b6585f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 3 Feb 2021 10:35:19 +0100 Subject: [PATCH 02/14] Take the words-prefixes into account while computing the biggest values --- infos/src/main.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 916b5ba50..305bfd0d5 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -321,6 +321,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let main_name = "main"; let word_docids_name = "word_docids"; + let word_prefix_docids_name = "word_prefix_docids"; let docid_word_positions_name = "docid_word_positions"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; @@ -329,8 +330,16 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let mut heap = BinaryHeap::with_capacity(limit + 1); if limit > 0 { + // Fetch the words FST let words_fst = index.words_fst(rtxn)?; - 
heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name))); + let length = words_fst.as_fst().as_bytes().len(); + heap.push(Reverse((length, format!("words-fst"), main_name))); + if heap.len() > limit { heap.pop(); } + + // Fetch the word prefix FST + let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; + let length = words_prefixes_fst.as_fst().as_bytes().len(); + heap.push(Reverse((length, format!("words-prefixes-fst"), main_name))); if heap.len() > limit { heap.pop(); } if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { @@ -344,6 +353,12 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_prefix_docids.remap_data_type::().iter(rtxn)? { + let (word, value) = result?; + heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); + if heap.len() > limit { heap.pop(); } + } + for result in docid_word_positions.remap_data_type::().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); From ee5a60e1c5f3b4f438485a27885932e8870fec7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 3 Feb 2021 10:36:07 +0100 Subject: [PATCH 03/14] Clear the words prefixes when clearing an index --- milli/src/update/clear_documents.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6f0d457b7..d20263d38 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -36,6 +36,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean some of the main engine datastructures. 
self.index.put_words_fst(self.wtxn, &fst::Set::default())?; + self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; @@ -46,6 +47,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // Clear the other databases. word_docids.clear(self.wtxn)?; + word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; From f365de636fe6dbeeee74ecf1ae4dad6b6149f5b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 11:12:38 +0100 Subject: [PATCH 04/14] Compute and write the word-prefix-docids database --- milli/src/lib.rs | 2 +- milli/src/update/delete_documents.rs | 3 + milli/src/update/facets.rs | 3 +- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/mod.rs | 4 +- milli/src/update/words_prefixes.rs | 161 ++++++++++++++++++++++++ 6 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 milli/src/update/words_prefixes.rs diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 7a9afde2d..66d134f4e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -33,8 +33,8 @@ pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; -pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; +pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 2efed359f..1e0064f22 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -158,6 +158,9 @@ impl<'t, 'u, 'i> 
DeleteDocuments<'t, 'u, 'i> { } } + // FIXME we must recompute the words prefixes docids. + todo!("recompute words prefixes docids"); + // We construct an FST set that contains the words to delete from the words FST. let words_to_delete = words.iter().filter_map(|(word, must_remove)| { if *must_remove { Some(word.as_ref()) } else { None } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 522a4d350..bac5f3c86 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -32,7 +32,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> Facets<'t, 'u, 'i> { + ) -> Facets<'t, 'u, 'i> + { Facets { wtxn, index, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e38c640a0..d53173b71 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -19,7 +19,7 @@ use serde::{Serialize, Deserialize}; use crate::index::Index; use crate::update::{Facets, UpdateIndexingStep}; use self::store::{Store, Readers}; -use self::merge_function::{ +pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, field_id_docid_facet_values_merge, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 2cd532c83..fcdcb33e9 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,12 +6,14 @@ mod index_documents; mod settings; mod update_builder; mod update_step; +mod words_prefixes; pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; -pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat, DocumentAdditionResult}; pub use self::facets::Facets; +pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat, 
DocumentAdditionResult}; pub use self::settings::Settings; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; +pub use self::words_prefixes::WordsPrefixes; diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs new file mode 100644 index 000000000..bb8d3a6f8 --- /dev/null +++ b/milli/src/update/words_prefixes.rs @@ -0,0 +1,161 @@ +use std::iter::FromIterator; +use std::str; + +use fst::Streamer; +use grenad::CompressionType; +use heed::types::ByteSlice; + +use crate::{Index, SmallString32}; +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_sorter, create_writer, writer_into_reader}; +use crate::update::index_documents::{word_docids_merge, write_into_lmdb_database}; + +pub struct WordsPrefixes<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, + threshold: f64, + max_prefix_length: usize, + _update_id: u64, +} + +impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> WordsPrefixes<'t, 'u, 'i> + { + WordsPrefixes { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + threshold: 0.01, // 1% + max_prefix_length: 4, + _update_id: update_id, + } + } + + /// Set the ratio of concerned words required to make a prefix be part of the words prefixes + /// database. If a word prefix is supposed to match more than this number of words in the + /// dictionnary, therefore this prefix is added to the words prefixes datastructures. + /// + /// Default value is `0.01` or `1%`. 
This value must be between 0 and 1 and will be clamped + /// to these bounds otherwise. + pub fn threshold(&mut self, value: f64) -> &mut Self { + self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] + self + } + + /// Set the maximum length of prefixes in bytes. + /// + /// Default value is `4` bytes. This value must be between 1 and 25 and will be clamped + /// to these bounds otherwise. + pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] + self + } + + pub fn execute(self) -> anyhow::Result<()> { + // Clear the words prefixes datastructures. + self.index.word_prefix_docids.clear(self.wtxn)?; + + let words_fst = self.index.words_fst(&self.wtxn)?; + let number_of_words = words_fst.len(); + let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; + + // It is forbidden to keep a mutable reference into the database + // and write into it at the same time, therefore we write into another file. + let mut docids_sorter = create_sorter( + word_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); + for n in 1..=self.max_prefix_length { + + let mut current_prefix = SmallString32::new(); + let mut current_prefix_count = 0; + let mut builder = fst::SetBuilder::memory(); + + let mut stream = words_fst.stream(); + while let Some(bytes) = stream.next() { + // We try to get the first n bytes out of this string but we only want + // to split at valid character bounds. If we try to split in the middle of + // a character we ignore this word and go to the next one. + let word = str::from_utf8(bytes)?; + let prefix = match word.get(..n) { + Some(prefix) => prefix, + None => continue, + }; + + // This is the first iteration of the loop, + // or the current word doesn't start with the current prefix.
+ if current_prefix_count == 0 || prefix != current_prefix.as_str() { + current_prefix = SmallString32::from(prefix); + current_prefix_count = 0; + } + + current_prefix_count += 1; + + // There are enough words corresponding to this prefix to add it to the cache. + if current_prefix_count == min_number_of_words { + builder.insert(prefix)?; + } + } + + // We construct the final set for prefixes of size n. + prefix_fsts.push(builder.into_set()); + } + + // We merge all of the previously computed prefixes into one final set. + let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(op.r#union())?; + let prefix_fst = builder.into_set(); + + // We iterate over all the prefixes and retrieve the corresponding docids. + let mut prefix_stream = prefix_fst.stream(); + while let Some(bytes) = prefix_stream.next() { + let prefix = str::from_utf8(bytes)?; + let db = self.index.word_docids.remap_data_type::(); + for result in db.prefix_iter(self.wtxn, prefix)? { + let (_word, data) = result?; + docids_sorter.insert(prefix, data)?; + } + } + + // Set the words prefixes FST in the dtabase. + self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; + + // We write the sorter into a reader to be able to read it back. + let mut docids_writer = tempfile::tempfile().and_then(|file| { + create_writer(self.chunk_compression_type, self.chunk_compression_level, file) + })?; + docids_sorter.write_into(&mut docids_writer)?; + let docids_reader = writer_into_reader(docids_writer, self.chunk_fusing_shrink_size)?; + + // We finally write the word prefix docids into the LMDB database.
+ write_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_docids.as_polymorph(), + docids_reader, + word_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} From 9b03b0a1b2e63c2ea434fec26b8741c9a800043c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 10:28:15 +0100 Subject: [PATCH 05/14] Introduce the word prefix pair proximity docids database --- infos/src/main.rs | 10 ++++++++++ milli/src/index.rs | 6 +++++- milli/src/update/clear_documents.rs | 2 ++ milli/src/update/delete_documents.rs | 2 ++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 305bfd0d5..fcfab8bc5 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -18,6 +18,7 @@ const MAIN_DB_NAME: &str = "main"; const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; +const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const DOCUMENTS_DB_NAME: &str = "documents"; const USERS_IDS_DOCUMENTS_IDS: &[u8] = b"users-ids-documents-ids"; @@ -314,6 +315,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + word_prefix_pair_proximity_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, @@ -323,6 +325,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_docids_name = "word_docids"; let word_prefix_docids_name = "word_prefix_docids"; let docid_word_positions_name = "docid_word_positions"; + let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -373,6 +376,13 @@ fn 
biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_prefix_pair_proximity_docids.remap_data_type::().iter(rtxn)? { + let ((word, prefix, prox), value) = result?; + let key = format!("{} {} {}", word, prefix, prox); + heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { diff --git a/milli/src/index.rs b/milli/src/index.rs index 5763f78ee..12ad86b22 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -43,6 +43,8 @@ pub struct Index { pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, + /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. + pub word_prefix_pair_proximity_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. 
@@ -53,7 +55,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(8); + options.max_dbs(9); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -61,6 +63,7 @@ impl Index { let word_prefix_docids = env.create_database(Some("word-prefix-docids"))?; let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; + let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -72,6 +75,7 @@ impl Index { word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + word_prefix_pair_proximity_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d20263d38..1523a95b2 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + word_prefix_pair_proximity_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -50,6 +51,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; + word_prefix_pair_proximity_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 
1e0064f22..27686960d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -82,6 +82,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + word_prefix_pair_proximity_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -160,6 +161,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // FIXME we must recompute the words prefixes docids. todo!("recompute words prefixes docids"); + todo!("recompute words prefixes pairs proximity docids"); // We construct an FST set that contains the words to delete from the words FST. let words_to_delete = words.iter().filter_map(|(word, must_remove)| { From b5b89990ebc5367748e5072ad640e984db55ee5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 11:04:27 +0100 Subject: [PATCH 06/14] Compute and write the word prefix pair proximities database --- milli/src/update/words_prefixes.rs | 74 ++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs index bb8d3a6f8..b020ed28b 100644 --- a/milli/src/update/words_prefixes.rs +++ b/milli/src/update/words_prefixes.rs @@ -1,14 +1,17 @@ use std::iter::FromIterator; use std::str; -use fst::Streamer; +use fst::automaton::Str; +use fst::{Automaton, Streamer, IntoStreamer}; use grenad::CompressionType; +use heed::BytesEncode; use heed::types::ByteSlice; -use crate::{Index, SmallString32}; +use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_sorter, create_writer, writer_into_reader}; -use crate::update::index_documents::{word_docids_merge, write_into_lmdb_database}; +use crate::update::index_documents::{create_sorter, create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::update::index_documents::{word_docids_merge, 
words_pairs_proximities_docids_merge}; +use crate::{Index, SmallString32}; pub struct WordsPrefixes<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -67,6 +70,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { // Clear the words prefixes datastructures. self.index.word_prefix_docids.clear(self.wtxn)?; + self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; let words_fst = self.index.words_fst(&self.wtxn)?; let number_of_words = words_fst.len(); @@ -74,7 +78,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. - let mut docids_sorter = create_sorter( + let mut prefix_docids_sorter = create_sorter( word_docids_merge, self.chunk_compression_type, self.chunk_compression_level, @@ -133,7 +137,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { let db = self.index.word_docids.remap_data_type::(); for result in db.prefix_iter(self.wtxn, prefix)? { let (_word, data) = result?; - docids_sorter.insert(prefix, data)?; + prefix_docids_sorter.insert(prefix, data)?; } } @@ -141,21 +145,71 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; // We write the sorter into a reader to be able to read it back. - let mut docids_writer = tempfile::tempfile().and_then(|file| { + let mut prefix_docids_writer = tempfile::tempfile().and_then(|file| { create_writer(self.chunk_compression_type, self.chunk_compression_level, file) })?; - docids_sorter.write_into(&mut docids_writer)?; - let docids_reader = writer_into_reader(docids_writer, self.chunk_fusing_shrink_size)?; + prefix_docids_sorter.write_into(&mut prefix_docids_writer)?; + let prefix_docids_reader = writer_into_reader( + prefix_docids_writer, + self.chunk_fusing_shrink_size, + )?; // We finally write the word prefix docids into the LMDB database. 
write_into_lmdb_database( self.wtxn, *self.index.word_prefix_docids.as_polymorph(), - docids_reader, + prefix_docids_reader, word_docids_merge, WriteMethod::Append, )?; + // We compute the word prefix pair proximity database. + + // Here we create a sorter akin to the previous one. + let mut word_prefix_pair_proximity_docids_sorter = create_sorter( + words_pairs_proximities_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert all the word pairs corresponding to the word-prefix pairs + // where the prefixes appears in the prefix FST previously constructed. + let db = self.index.word_pair_proximity_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? { + let ((word1, word2, prox), data) = result?; + let automaton = Str::new(word2).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let pair = (word1, prefix, prox); + let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); + word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; + } + } + + // FIXME we should create a sorter_into_lmdb_database function + // We write the sorter into a reader to be able to read it back. + let mut word_prefix_pair_prox_docids_writer = tempfile::tempfile().and_then(|file| { + create_writer(self.chunk_compression_type, self.chunk_compression_level, file) + })?; + word_prefix_pair_proximity_docids_sorter.write_into(&mut word_prefix_pair_prox_docids_writer)?; + let word_prefix_pair_docids_reader = writer_into_reader( + word_prefix_pair_prox_docids_writer, + self.chunk_fusing_shrink_size, + )?; + + // We finally write the word prefix pair proximity docids into the LMDB database. 
+ write_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + word_prefix_pair_docids_reader, + words_pairs_proximities_docids_merge, + WriteMethod::Append, + )?; + Ok(()) } } From 62eee9c69e4b5fdda6a8626119f489e0caf75933 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 11:09:42 +0100 Subject: [PATCH 07/14] Introduce the sorter_into_lmdb_database helper function --- milli/src/update/index_documents/mod.rs | 104 +++++++++++++++++------- milli/src/update/words_prefixes.rs | 31 ++----- 2 files changed, 78 insertions(+), 57 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d53173b71..b6fde7ef4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -8,7 +8,7 @@ use std::time::Instant; use anyhow::Context; use bstr::ByteSlice as _; -use grenad::{Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; +use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; use heed::types::ByteSlice; use log::{debug, info, error}; use memmap::Mmap; @@ -102,39 +102,19 @@ pub fn merge_into_lmdb_database( sources: Vec>, merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> { +) -> anyhow::Result<()> +{ debug!("Merging {} MTBL stores...", sources.len()); let before = Instant::now(); let merger = merge_readers(sources, merge); - let mut in_iter = merger.into_merge_iter()?; - - match method { - WriteMethod::Append => { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = in_iter.next()? { - out_iter.append(k, v).with_context(|| { - format!("writing {:?} into LMDB", k.as_bstr()) - })?; - } - }, - WriteMethod::GetMergePut => { - while let Some((k, v)) = in_iter.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? 
{ - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).expect("merge failed"); - iter.put_current(k, &val)?; - }, - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - }, - } - } - }, - } + merger_iter_into_lmdb_database( + wtxn, + database, + merger.into_merge_iter()?, + merge, + method, + )?; debug!("MTBL stores merged in {:.02?}!", before.elapsed()); Ok(()) @@ -146,7 +126,8 @@ pub fn write_into_lmdb_database( mut reader: Reader, merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> { +) -> anyhow::Result<()> +{ debug!("Writing MTBL stores..."); let before = Instant::now(); @@ -181,6 +162,67 @@ pub fn write_into_lmdb_database( Ok(()) } +pub fn sorter_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + sorter: Sorter, + merge: MergeFn, + method: WriteMethod, +) -> anyhow::Result<()> +{ + debug!("Writing MTBL sorter..."); + let before = Instant::now(); + + merger_iter_into_lmdb_database( + wtxn, + database, + sorter.into_iter()?, + merge, + method, + )?; + + debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); + Ok(()) +} + +fn merger_iter_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + mut sorter: MergerIter, + merge: MergeFn, + method: WriteMethod, +) -> anyhow::Result<()> +{ + match method { + WriteMethod::Append => { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + while let Some((k, v)) = sorter.next()? { + out_iter.append(k, v).with_context(|| { + format!("writing {:?} into LMDB", k.as_bstr()) + })?; + } + }, + WriteMethod::GetMergePut => { + while let Some((k, v)) = sorter.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? 
{ + Some((key, old_val)) if key == k => { + let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; + let val = merge(k, &vals).expect("merge failed"); + iter.put_current(k, &val)?; + }, + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + }, + } + } + }, + } + + Ok(()) +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[non_exhaustive] pub enum IndexDocumentsMethod { diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs index b020ed28b..f7c898c89 100644 --- a/milli/src/update/words_prefixes.rs +++ b/milli/src/update/words_prefixes.rs @@ -9,7 +9,7 @@ use heed::types::ByteSlice; use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_sorter, create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database}; use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge}; use crate::{Index, SmallString32}; @@ -144,21 +144,11 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { // Set the words prefixes FST in the dtabase. self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; - // We write the sorter into a reader to be able to read it back. - let mut prefix_docids_writer = tempfile::tempfile().and_then(|file| { - create_writer(self.chunk_compression_type, self.chunk_compression_level, file) - })?; - prefix_docids_sorter.write_into(&mut prefix_docids_writer)?; - let prefix_docids_reader = writer_into_reader( - prefix_docids_writer, - self.chunk_fusing_shrink_size, - )?; - // We finally write the word prefix docids into the LMDB database. 
- write_into_lmdb_database( + sorter_into_lmdb_database( self.wtxn, *self.index.word_prefix_docids.as_polymorph(), - prefix_docids_reader, + prefix_docids_sorter, word_docids_merge, WriteMethod::Append, )?; @@ -190,22 +180,11 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { } } - // FIXME we should create a sorter_into_lmdb_database function - // We write the sorter into a reader to be able to read it back. - let mut word_prefix_pair_prox_docids_writer = tempfile::tempfile().and_then(|file| { - create_writer(self.chunk_compression_type, self.chunk_compression_level, file) - })?; - word_prefix_pair_proximity_docids_sorter.write_into(&mut word_prefix_pair_prox_docids_writer)?; - let word_prefix_pair_docids_reader = writer_into_reader( - word_prefix_pair_prox_docids_writer, - self.chunk_fusing_shrink_size, - )?; - // We finally write the word prefix pair proximity docids into the LMDB database. - write_into_lmdb_database( + sorter_into_lmdb_database( self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - word_prefix_pair_docids_reader, + word_prefix_pair_proximity_docids_sorter, words_pairs_proximities_docids_merge, WriteMethod::Append, )?; From ea37fd821d646c9bec5378b2c3e486dcb21400b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 11:22:25 +0100 Subject: [PATCH 08/14] Clean up the words prefixes when deleting documents and words --- milli/src/update/delete_documents.rs | 45 +++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 27686960d..754f320a5 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -159,10 +159,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } - // FIXME we must recompute the words prefixes docids. 
- todo!("recompute words prefixes docids"); - todo!("recompute words prefixes pairs proximity docids"); - // We construct an FST set that contains the words to delete from the words FST. let words_to_delete = words.iter().filter_map(|(word, must_remove)| { if *must_remove { Some(word.as_ref()) } else { None } @@ -185,6 +181,47 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We write the new words FST into the main database. self.index.put_words_fst(self.wtxn, &new_words_fst)?; + // We iterate over the word prefix docids database and remove the deleted documents ids + // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. + let mut prefixes_to_delete = fst::SetBuilder::memory(); + let mut iter = word_prefix_docids.iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (prefix, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + prefixes_to_delete.insert(prefix)?; + } else if docids.len() != previous_len { + iter.put_current(prefix, &docids)?; + } + } + + drop(iter); + + // We compute the new prefix FST and write it only if there is a change. + let prefixes_to_delete = prefixes_to_delete.into_set(); + if !prefixes_to_delete.is_empty() { + let new_words_prefixes_fst = { + // We retrieve the current words prefixes FST from the database. + let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; + let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference(); + + // We stream the new external ids that does no more contains the to-delete external ids. + let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); + new_words_prefixes_fst_builder.extend_stream(difference.into_stream())?; + + // We create an words FST set from the above builder. + new_words_prefixes_fst_builder.into_set() + }; + + // We write the new words prefixes FST into the main database. 
+ self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; + } + + // FIXME we must recompute the words prefixes docids. + todo!("recompute words prefixes pairs proximity docids"); + // We delete the documents ids that are under the pairs of words, // it is faster and use no memory to iterate over all the words pairs than // to compute the cartesian product of every words of the deleted documents. From 616ed8f73c4d64f3d276c56d268b1ee3d9f47f30 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 10:35:25 +0100 Subject: [PATCH 09/14] Clean up the word prefix pair proximities when deleting documents --- milli/src/update/delete_documents.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 754f320a5..5430bb3af 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -219,8 +219,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; } - // FIXME we must recompute the words prefixes docids. - todo!("recompute words prefixes pairs proximity docids"); + // We delete the documents ids from the word prefix pair proximity database docids + // and remove the empty pairs too. 
+ let db = word_prefix_pair_proximity_docids.remap_key_type::(); + let mut iter = db.iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (key, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(key, &docids)?; + } + } + + drop(iter); // We delete the documents ids that are under the pairs of words, // it is faster and use no memory to iterate over all the words pairs than From 87884859247da16493888f7793e2291f341786e6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 11:20:00 +0100 Subject: [PATCH 10/14] Take the prefix databases into account in the infos subcommand --- infos/src/main.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index fcfab8bc5..e88188217 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -16,6 +16,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; const MAIN_DB_NAME: &str = "main"; const WORD_DOCIDS_DB_NAME: &str = "word-docids"; +const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; @@ -25,15 +26,19 @@ const USERS_IDS_DOCUMENTS_IDS: &[u8] = b"users-ids-documents-ids"; const ALL_DATABASE_NAMES: &[&str] = &[ MAIN_DB_NAME, WORD_DOCIDS_DB_NAME, + WORD_PREFIX_DOCIDS_DB_NAME, DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, DOCUMENTS_DB_NAME, ]; const POSTINGS_DATABASE_NAMES: &[&str] = &[ WORD_DOCIDS_DB_NAME, + WORD_PREFIX_DOCIDS_DB_NAME, DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, ]; 
#[derive(Debug, StructOpt)] @@ -653,9 +658,11 @@ fn size_of_database(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Re let database = match name { MAIN_DB_NAME => &index.main, + WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), DOCUMENTS_DB_NAME => index.documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; @@ -718,7 +725,7 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu let sum = values_length.iter().map(|l| *l as u64).sum::(); println!("The {} database stats on the lengths", name); - println!("\tnumber of proximity pairs: {}", count); + println!("\tnumber of entries: {}", count); println!("\t25th percentile (first quartile): {}", twenty_five_percentile); println!("\t50th percentile (median): {}", fifty_percentile); println!("\t75th percentile (third quartile): {}", seventy_five_percentile); @@ -740,6 +747,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu let db = index.word_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, + WORD_PREFIX_DOCIDS_DB_NAME => { + let db = index.word_prefix_docids.as_polymorph(); + compute_stats::(*db, rtxn, name) + }, DOCID_WORD_POSITIONS_DB_NAME => { let db = index.docid_word_positions.as_polymorph(); compute_stats::(*db, rtxn, name) @@ -748,6 +759,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu let db = index.word_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => { + let db = index.word_prefix_pair_proximity_docids.as_polymorph(); + compute_stats::(*db, rtxn, 
name) + }, unknown => anyhow::bail!("unknown database {:?}", unknown), } } From a4a48be923f454dfee45b2f921c026d1c5fadb0d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 11:53:13 +0100 Subject: [PATCH 11/14] Run the words prefixes update inside of the indexing documents update --- milli/src/update/index_documents/mod.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b6fde7ef4..d55f421dc 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -17,7 +17,7 @@ use rayon::prelude::*; use serde::{Serialize, Deserialize}; use crate::index::Index; -use crate::update::{Facets, UpdateIndexingStep}; +use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep}; use self::store::{Store, Readers}; pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, @@ -259,6 +259,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { pub(crate) thread_pool: Option<&'a ThreadPool>, facet_level_group_size: Option, facet_min_level_size: Option, + words_prefix_threshold: Option, + max_prefix_length: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, @@ -284,6 +286,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { thread_pool: None, facet_level_group_size: None, facet_min_level_size: None, + words_prefix_threshold: None, + max_prefix_length: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, autogenerate_docids: true, @@ -667,6 +671,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); } + // Run the facets update operation. 
let mut builder = Facets::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; @@ -679,6 +684,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the words prefixes update operation. + let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.words_prefix_threshold { + builder.threshold(value); + } + if let Some(value) = self.max_prefix_length { + builder.max_prefix_length(value); + } + builder.execute()?; + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); From 7a0f86a04fb62f23d2821f0e06118ad65b3ef5ce Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 12:18:56 +0100 Subject: [PATCH 12/14] Introduce an infos command to extract the words prefixes fst --- infos/src/main.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index e88188217..ef23bf4ff 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -153,6 +153,12 @@ enum Command { /// you can install it using `cargo install fst-bin`. ExportWordsFst, + /// Outputs the words prefix FST to standard output. + /// + /// One can use the FST binary helper to dissect and analyze it, + /// you can install it using `cargo install fst-bin`. + ExportWordsPrefixFst, + /// Outputs the documents as JSON lines to the standard output. /// /// All of the fields are extracted, not just the displayed ones. 
@@ -207,6 +213,7 @@ fn run(opt: Opt) -> anyhow::Result<()> { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) }, ExportWordsFst => export_words_fst(&index, &rtxn), + ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportDocuments => export_documents(&index, &rtxn), PatchToNewExternalIds => { drop(rtxn); @@ -548,6 +555,16 @@ fn export_words_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { Ok(()) } +fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { + use std::io::Write as _; + + let mut stdout = io::stdout(); + let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; + stdout.write_all(words_prefixes_fst.as_fst().as_bytes())?; + + Ok(()) +} + fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _}; use milli::obkv_to_json; From 49aee6d02cb1ffb4d9df79af5b0ae9ebe82af55a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 12:19:10 +0100 Subject: [PATCH 13/14] Fix the database-stats infos subcommand --- infos/src/main.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index ef23bf4ff..e33c2820f 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -725,17 +725,14 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu } values_length.sort_unstable(); + let len = values_length.len(); - let median = values_length.len() / 2; - let quartile = values_length.len() / 4; - let percentile = values_length.len() / 100; - - let twenty_five_percentile = values_length.get(quartile).unwrap_or(&0); - let fifty_percentile = values_length.get(median).unwrap_or(&0); - let seventy_five_percentile = values_length.get(quartile * 3).unwrap_or(&0); - let ninety_percentile = values_length.get(percentile * 90).unwrap_or(&0); - let ninety_five_percentile = values_length.get(percentile * 95).unwrap_or(&0); - let 
ninety_nine_percentile = values_length.get(percentile * 99).unwrap_or(&0);
+ let twenty_five_percentile = values_length.get(len / 4).unwrap_or(&0);
+ let fifty_percentile = values_length.get(len / 2).unwrap_or(&0);
+ let seventy_five_percentile = values_length.get(len * 3 / 4).unwrap_or(&0);
+ let ninety_percentile = values_length.get(len * 90 / 100).unwrap_or(&0);
+ let ninety_five_percentile = values_length.get(len * 95 / 100).unwrap_or(&0);
+ let ninety_nine_percentile = values_length.get(len * 99 / 100).unwrap_or(&0);
 let minimum = values_length.first().unwrap_or(&0);
 let maximum = values_length.last().unwrap_or(&0);
 let count = values_length.len();

From aa4d9882d298eafa24c462b8395fd3869250089c Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 10 Feb 2021 12:28:46 +0100
Subject: [PATCH 14/14] Introduce the new words-prefixes-docids infos subcommand

---
 infos/src/main.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/infos/src/main.rs b/infos/src/main.rs
index e33c2820f..3f41b7d42 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -91,6 +91,16 @@ enum Command {
+ /// Outputs a CSV with the documents ids where the given words prefixes appears.
+ WordsPrefixesDocids {
+ /// Display the whole documents ids in details.
+ #[structopt(long)]
+ full_display: bool,
+
+ /// The prefixes to display the documents ids of.
+ prefixes: Vec,
+ },
+
 /// Outputs a CSV with the documents ids along with the facet values where it appears.
 FacetValuesDocids {
 /// Display the whole documents ids in details. 
@@ -198,6 +208,9 @@ fn run(opt: Opt) -> anyhow::Result<()> { MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit), WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), + WordsPrefixesDocids { full_display, prefixes } => { + words_prefixes_docids(&index, &rtxn, !full_display, prefixes) + }, FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) }, @@ -464,6 +477,43 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["prefix", "documents_ids"])?; + + if prefixes.is_empty() { + for result in index.word_prefix_docids.iter(rtxn)? { + let (prefix, docids) = result?; + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[prefix, &docids])?; + } + } else { + for prefix in prefixes { + if let Some(docids) = index.word_prefix_docids.get(rtxn, &prefix)? { + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[prefix, docids])?; + } + } + } + + Ok(wtr.flush()?) +} + fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?;