From 7e9d56a9e75391724f2c24a6f892a17dd7c30c5b Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 16:25:15 +0100 Subject: [PATCH 01/28] disable typos on exact words --- milli/src/search/query_tree.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4eccae8ce..7d13f27a3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -623,7 +623,9 @@ mod test { } fn exact_words(&self) -> crate::Result>> { - Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) + let builder = fst::SetBuilder::new(Vec::new()).unwrap(); + let data = builder.into_inner().unwrap(); + Ok(fst::Set::new(Cow::Owned(data)).unwrap()) } } From c882d8daf0dd174c8bb8c51734493e6814780d24 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 22 Mar 2022 09:55:49 +0100 Subject: [PATCH 02/28] add test for exact words --- milli/src/search/query_tree.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 7d13f27a3..ff9d3f4e9 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -623,9 +623,7 @@ mod test { } fn exact_words(&self) -> crate::Result>> { - let builder = fst::SetBuilder::new(Vec::new()).unwrap(); - let data = builder.into_inner().unwrap(); - Ok(fst::Set::new(Cow::Owned(data)).unwrap()) + Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) } } @@ -1269,6 +1267,7 @@ mod test { QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } ); } + #[test] fn disable_typo_on_word() { let query = "goodbye"; From f82d4b36eb37212df5b3b9f42120fcef50419108 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 22 Mar 2022 19:07:59 +0100 Subject: [PATCH 03/28] introduce exact attribute setting --- milli/src/index.rs | 18 ++++++++++++++++++ milli/src/update/settings.rs | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index c0be985da..f4e17d93c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -53,6 +53,7 @@ pub mod main_key { pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; pub const EXACT_WORDS: &str = "exact-words"; + pub const EXACT_ATTRIBUTES: &str = "exact-attributes"; } pub mod db_name { @@ -949,6 +950,23 @@ impl Index { )?; Ok(()) } + + pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result> { + Ok(self + .main + .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? + .unwrap_or_default()) + } + + pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; + Ok(()) + } + + pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> { + self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 503fbd06e..3ed2a4152 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -93,6 +93,8 @@ pub struct Settings<'a, 't, 'u, 'i> { min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, exact_words: Setting>, + /// attributes on which typo tolerance is not enabled. 
+ exact_attributes: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -117,6 +119,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { exact_words: Setting::NotSet, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, + exact_attributes: Setting::Reset, indexer_config, } } @@ -226,6 +229,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.exact_words = Setting::Reset; } + pub fn set_exact_attributes(&mut self, attrs: HashSet) { + self.exact_attributes = Setting::Set(attrs); + } + + pub fn reset_exact_attributes(&mut self) { + self.exact_attributes = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -411,6 +422,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_exact_attributes(&mut self) -> Result { + match self.exact_attributes { + Setting::Set(ref attrs) => { + let attrs = attrs.iter().map(String::as_str).collect::>(); + self.index.put_exact_attributes(&mut self.wtxn, &attrs)?; + Ok(true) + } + Setting::Reset => { + self.index.delete_exact_attributes(&mut self.wtxn)?; + Ok(true) + } + Setting::NotSet => Ok(false), + } + } + fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { @@ -579,8 +605,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let stop_words_updated = self.update_stop_words()?; let synonyms_updated = self.update_synonyms()?; let searchable_updated = self.update_searchable()?; + let exact_attributes_updated = self.update_exact_attributes()?; - if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated { + if stop_words_updated + || faceted_updated + || synonyms_updated + || searchable_updated + || exact_attributes_updated + { self.reindex(&progress_callback, old_fields_ids_map)?; } From 5f9f82757dbebec7087cd56b2e624e372c3bbb4f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 23 Mar 2022 14:48:15 +0100 Subject: [PATCH 04/28] refactor spawn_extraction_task --- .../src/update/index_documents/extract/mod.rs | 26 +++---- .../index_documents/helpers/grenad_helpers.rs | 69 ++++++++++++++----- .../src/update/index_documents/helpers/mod.rs | 4 +- 3 files changed, 69 insertions(+), 30 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4c81b9334..100431237 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -26,7 +26,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, - merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, + merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -66,7 +66,7 @@ pub(crate) fn data_from_obkv_documents( (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), ) = result?; - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -76,7 +76,7 @@ pub(crate) fn data_from_obkv_documents( "word-pair-proximity-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), 
lmdb_writer_sx.clone(), @@ -86,7 +86,7 @@ pub(crate) fn data_from_obkv_documents( "field-id-wordcount-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -96,7 +96,7 @@ pub(crate) fn data_from_obkv_documents( "word-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -106,7 +106,7 @@ pub(crate) fn data_from_obkv_documents( "word-position-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -116,7 +116,7 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-string-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_numbers_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents( /// Generated grenad chunks are merged using the merge_fn. /// The result of merged chunks is serialized as TypedChunk using the serialize_fn /// and sent into lmdb_writer_sx. -fn spawn_extraction_task( +fn spawn_extraction_task( chunks: Vec>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, @@ -142,19 +142,21 @@ fn spawn_extraction_task( serialize_fn: FS, name: &'static str, ) where - FE: Fn(grenad::Reader, GrenadParameters) -> Result> + FE: Fn(grenad::Reader, GrenadParameters) -> Result + Sync + Send + 'static, - FS: Fn(grenad::Reader) -> TypedChunk + Sync + Send + 'static, + FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static, + M: MergeableReader + FromParallelIterator + Send + 'static, + M::Output: Send, { rayon::spawn(move || { - let chunks: Result> = + let chunks: Result = chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); rayon::spawn(move || match chunks { Ok(chunks) => { debug!("merge {} database", name); - let reader = merge_readers(chunks, merge_fn, indexer); + let reader = chunks.merge(merge_fn, &indexer); let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))); } Err(e) => { diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index e0ac3a175..fc28860b2 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -78,25 +78,62 @@ pub unsafe fn as_cloneable_grenad( Ok(reader) } -pub fn merge_readers( - readers: Vec>, - merge_fn: MergeFn, - indexer: GrenadParameters, -) -> Result> { - let mut merger_builder = grenad::MergerBuilder::new(merge_fn); - for reader in readers { - merger_builder.push(reader.into_cursor()?); +pub trait MergeableReader +where + Self: Sized, +{ + type Output; + + fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result; +} + +impl MergeableReader for Vec> { + type Output = grenad::Reader; + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut merger = MergerBuilder::new(merge_fn); + self.into_iter().try_for_each(|r| merger.push(r))?; + merger.finish(params) + } +} + +impl MergeableReader for Vec<(grenad::Reader, grenad::Reader)> { + type Output = (grenad::Reader, grenad::Reader); + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut m1 = MergerBuilder::new(merge_fn); + let mut m2 = MergerBuilder::new(merge_fn); + for (r1, r2) in self.into_iter() { + m1.push(r1)?; + m2.push(r2)?; + } + 
Ok((m1.finish(params)?, m2.finish(params)?)) + } +} + +struct MergerBuilder(grenad::MergerBuilder); + +impl MergerBuilder { + fn new(merge_fn: MergeFn) -> Self { + Self(grenad::MergerBuilder::new(merge_fn)) } - let merger = merger_builder.build(); - let mut writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); - merger.write_into_stream_writer(&mut writer)?; + fn push(&mut self, reader: grenad::Reader) -> Result<()> { + self.0.push(reader.into_cursor()?); + Ok(()) + } - Ok(writer_into_reader(writer)?) + fn finish(self, params: &GrenadParameters) -> Result> { + let merger = self.0.build(); + let mut writer = create_writer( + params.chunk_compression_type, + params.chunk_compression_level, + tempfile::tempfile()?, + ); + merger.write_into_stream_writer(&mut writer)?; + + Ok(writer_into_reader(writer)?) + } } #[derive(Debug, Clone, Copy)] diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 22c1cfd6c..f4940af1d 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, - GrenadParameters, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, From 0a77be4ec02f29df26242a6ffa7a94ddcb3b0724 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 15:22:57 +0100 Subject: [PATCH 05/28] introduce exact_word_docids db --- milli/src/index.rs | 9 ++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 71 +++++++++++++------ .../extract/extract_word_docids.rs | 12 +++- .../src/update/index_documents/extract/mod.rs | 7 +- .../index_documents/helpers/grenad_helpers.rs | 5 ++ .../src/update/index_documents/helpers/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 18 +++-- .../src/update/index_documents/typed_chunk.rs | 49 +++++++++---- milli/src/update/word_prefix_docids.rs | 5 +- 10 files changed, 133 insertions(+), 47 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f4e17d93c..8f9c9beb7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -59,6 +59,7 @@ pub mod main_key { pub mod db_name { pub const MAIN: &str = "main"; pub const WORD_DOCIDS: &str = "word-docids"; + pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; @@ -83,6 +84,10 @@ pub struct Index { /// A word and all the documents ids containing the word. pub word_docids: Database, + + /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. + pub exact_word_docids: Database, + /// A prefix of word and all the documents ids containing this prefix. 
pub word_prefix_docids: Database, @@ -119,12 +124,13 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(14); + options.max_dbs(15); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; let main = env.create_poly_database(Some(MAIN))?; let word_docids = env.create_database(Some(WORD_DOCIDS))?; + let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; @@ -146,6 +152,7 @@ impl Index { env, main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 644547b91..57c0969c7 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, @@ -55,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // Clear the other databases. word_docids.clear(self.wtxn)?; + exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 402cc61dd..46a4721c0 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; use std::collections::HashMap; use fst::IntoStreamer; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str}; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -16,7 +16,10 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; -use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; +use crate::{ + DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32, + BEU32, +}; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -108,6 +111,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, @@ -204,25 +208,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We iterate over the words and delete the documents ids // from the word docids database. for (word, must_remove) in &mut words { - // We create an iterator to be able to get the content and delete the word docids. - // It's faster to acquire a cursor to get and delete or put, as we avoid traversing - // the LMDB B-Tree two times but only once. - let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; - if let Some((key, mut docids)) = iter.next().transpose()? { - if key == word.as_str() { - let previous_len = docids.len(); - docids -= &self.documents_ids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? 
}; - *must_remove = true; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - } + remove_from_word_docids( + self.wtxn, + word_docids, + word.as_str(), + must_remove, + &self.documents_ids, + )?; + + remove_from_word_docids( + self.wtxn, + exact_word_docids, + word.as_str(), + must_remove, + &self.documents_ids, + )?; } // We construct an FST set that contains the words to delete from the words FST. @@ -457,6 +457,35 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } +fn remove_from_word_docids( + txn: &mut heed::RwTxn, + db: &heed::Database, + word: &str, + must_remove: &mut bool, + to_remove: &RoaringBitmap, +) -> Result<()> { + // We create an iterator to be able to get the content and delete the word docids. + // It's faster to acquire a cursor to get and delete or put, as we avoid traversing + // the LMDB B-Tree two times but only once. + let mut iter = db.prefix_iter_mut(txn, &word)?; + if let Some((key, mut docids)) = iter.next().transpose()? { + if key == word { + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + *must_remove = true; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } + } + } + Ok(()) +} + fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 80d68298a..03bfada21 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -10,17 +10,21 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::update::index_documents::MergeFn; use crate::Result; /// Extracts the word and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. 
+/// +/// The first returned reader in the one for normal word_docids, and the second one is for +/// exact_word_docids #[logging_timer::time] pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, -) -> Result> { +) -> Result<(grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut word_docids_sorter = create_sorter( @@ -43,5 +47,9 @@ pub fn extract_word_docids( word_docids_sorter.insert(word_bytes, &value_buffer)?; } - sorter_into_reader(word_docids_sorter, indexer) + let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn); + Ok(( + sorter_into_reader(word_docids_sorter, indexer)?, + sorter_into_reader(empty_sorter, indexer)?, + )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 100431237..4e7f211ce 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents( "field-id-wordcount-docids", ); - spawn_extraction_task::<_, _, Vec>>( + spawn_extraction_task::<_, _, Vec<(grenad::Reader, grenad::Reader)>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), extract_word_docids, merge_roaring_bitmaps, - TypedChunk::WordDocids, + |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + }, "word-docids", ); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index fc28860b2..fb5242910 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database( debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } + +/// Used when trying to merge readers, but you don't actually care about the values. 
+pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { + Ok(Cow::Owned(Vec::new())) +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index f4940af1d..4642bcf14 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, GrenadParameters, MergeableReader, }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2d3004444..633b72cc9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,7 +20,7 @@ pub use self::helpers::{ fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; -use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; +use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -282,6 +282,7 @@ where let mut word_pair_proximity_docids = None; let mut word_position_docids = None; let mut word_docids = None; + let mut _exact_word_docids = None; let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -291,10 +292,13 @@ where for result in lmdb_writer_rx { let typed_chunk = match result? { - TypedChunk::WordDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; word_docids = Some(cloneable_chunk); - TypedChunk::WordDocids(chunk) + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + _exact_word_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } } TypedChunk::WordPairProximityDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; @@ -425,6 +429,10 @@ where }); if let Some(word_docids) = word_docids { + let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); + word_docids_builder.push(word_docids.into_cursor()?); + // TODO: push exact_word_docids + let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. 
let mut builder = WordPrefixDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; @@ -432,7 +440,7 @@ where builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; builder.execute( - word_docids, + word_docids_iter, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 77ea31138..be440114f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -3,14 +3,16 @@ use std::convert::TryInto; use std::fs::File; use std::io; +use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, + self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, }; +use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ @@ -25,7 +27,10 @@ pub(crate) enum TypedChunk { Documents(grenad::Reader), FieldIdWordcountDocids(grenad::Reader), NewDocumentsIds(RoaringBitmap), - WordDocids(grenad::Reader), + WordDocids { + word_docids_reader: grenad::Reader, + exact_word_docids_reader: grenad::Reader, + }, WordPositionDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), @@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index( TypedChunk::NewDocumentsIds(documents_ids) => { return Ok((documents_ids, is_merged_database)) } - TypedChunk::WordDocids(word_docids_iter) => { - let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?; + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; append_entries_into_database( word_docids_iter.clone(), &index.word_docids, @@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index( merge_roaring_bitmaps, )?; + let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; + append_entries_into_database( + exact_word_docids_iter.clone(), + &index.exact_word_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_roaring_bitmaps, + )?; + // create fst from word docids - let mut builder = fst::SetBuilder::memory(); - let mut cursor = word_docids_iter.into_cursor()?; - while let Some((word, _value)) = cursor.move_on_next()? { - // This is a lexicographically ordered word position - // we use the key to construct the words fst. 
- builder.insert(word)?; - } - let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?; + let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?; let db_fst = index.words_fst(wtxn)?; // merge new fst with database fst @@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index( Ok((RoaringBitmap::new(), is_merged_database)) } +fn merge_word_docids_reader_into_fst( + word_docids_iter: grenad::Reader>, + exact_word_docids_iter: grenad::Reader>, +) -> Result>> { + let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn); + merger_builder.push(word_docids_iter.into_cursor()?); + merger_builder.push(exact_word_docids_iter.into_cursor()?); + let mut iter = merger_builder.build().into_stream_merger_iter()?; + let mut builder = fst::SetBuilder::memory(); + + while let Some((k, _)) = iter.next()? { + builder.insert(k)?; + } + + Ok(builder.into_set()) +} + fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { let new_value = RoaringBitmap::deserialize_from(new_value)?; let db_value = RoaringBitmap::deserialize_from(db_value)?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 076816f09..4114f8baf 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute( self, - new_word_docids: grenad::Reader, + mut new_word_docids_iter: grenad::MergerIter, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -51,10 +51,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { ); if !common_prefix_fst_words.is_empty() { - let mut new_word_docids_iter = new_word_docids.into_cursor()?; let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { + while let Some((word, data)) = new_word_docids_iter.next()? 
{ current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { From 5451c64d5d84ecbc154dc7708ad1c72c62336f6e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 15:55:29 +0100 Subject: [PATCH 06/28] increase criteria asc desc test map size --- milli/tests/search/query_criteria.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index ef080db9f..786fdbcae 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -373,7 +373,7 @@ fn criteria_mixup() { fn criteria_ascdesc() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB + options.map_size(12 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); From 8d46a5b0b5d86b85a4c865a72522b915d540ccce Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 17:00:29 +0100 Subject: [PATCH 07/28] extract exact word docids --- milli/src/index.rs | 5 +++ milli/src/lib.rs | 4 ++ .../extract/extract_word_docids.rs | 43 ++++++++++++++++--- .../src/update/index_documents/extract/mod.rs | 3 +- milli/src/update/index_documents/mod.rs | 2 + 5 files changed, 50 insertions(+), 7 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 8f9c9beb7..3d6d954f0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -964,6 +964,11 @@ impl Index { .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? .unwrap_or_default()) } + pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { + let attrs = self.exact_attributes(txn)?; + let fid_map = self.fields_ids_map(txn)?; + Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect()) + } pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ba2bd9b0f..b68c76048 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -74,6 +74,10 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi (field_id as u32) << 16 | (relative as u32) } +pub fn field_id_from_position(position: u32) -> FieldId { + (position >> 16 & 0xffff) as u16 +} + /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 03bfada21..5f231e5aa 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::fs::File; use std::io; use std::iter::FromIterator; @@ -10,8 +11,8 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::update::index_documents::MergeFn; -use crate::Result; +use crate::update::index_documents::helpers::read_u32_ne_bytes; +use crate::{field_id_from_position, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. 
/// @@ -24,6 +25,7 @@ use crate::Result; pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, + exact_attributes: &HashSet, ) -> Result<(grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); @@ -35,21 +37,50 @@ pub fn extract_word_docids( max_memory, ); + let mut exact_word_docids_sorter = create_sorter( + merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + let mut value_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, _value)) = cursor.move_on_next()? { + while let Some((key, positions)) = cursor.move_on_next()? { let (document_id_bytes, word_bytes) = try_split_array_at(key) .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let bitmap = RoaringBitmap::from_iter(Some(document_id)); serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; - word_docids_sorter.insert(word_bytes, &value_buffer)?; + + // If there are no exact attributes, we do not need to iterate over positions. + if exact_attributes.is_empty() { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + } else { + let mut added_to_exact = false; + let mut added_to_word_docids = false; + for position in read_u32_ne_bytes(positions) { + // as soon as we know that this word had been to both readers, we don't need to + // iterate over the positions. + if added_to_exact && added_to_word_docids { + break; + } + let fid = field_id_from_position(position); + if exact_attributes.contains(&fid) && !added_to_exact { + exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; + added_to_exact = true; + } else if !added_to_word_docids { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + added_to_word_docids = true; + } + } + } } - let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn); Ok(( sorter_into_reader(word_docids_sorter, indexer)?, - sorter_into_reader(empty_sorter, indexer)?, + sorter_into_reader(exact_word_docids_sorter, indexer)?, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4e7f211ce..8f6797a3b 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -43,6 +43,7 @@ pub(crate) fn data_from_obkv_documents( geo_field_id: Option, stop_words: Option>, max_positions_per_attributes: Option, + exact_attributes: HashSet, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -90,7 +91,7 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), - extract_word_docids, + move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), merge_roaring_bitmaps, |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { word_docids_reader, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 633b72cc9..c490e93da 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -226,6 +226,7 @@ where }; let stop_words = self.index.stop_words(self.wtxn)?; + let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; // Run extraction pipeline in parallel. 
pool.install(|| { @@ -255,6 +256,7 @@ where geo_field_id, stop_words, self.indexer_config.max_positions_per_attributes, + exact_attributes, ) }); From c4c6e3535290c88016e6a74f0f015563432e7fc9 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 19:25:11 +0100 Subject: [PATCH 08/28] query exact_word_docids in resolve_query_tree --- milli/src/search/criteria/mod.rs | 20 ++++++++++++++++++-- milli/src/update/index_documents/mod.rs | 11 ++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1dbfd2524..df9189239 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -68,6 +68,7 @@ impl Default for Candidates { pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; + fn exact_word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids( &self, @@ -118,6 +119,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.word_docids.get(self.rtxn, &word) } + fn exact_word_docids(&self, word: &str) -> heed::Result> { + self.index.exact_word_docids.get(self.rtxn, &word) + } + fn word_prefix_docids(&self, word: &str) -> heed::Result> { self.index.word_prefix_docids.get(self.rtxn, &word) } @@ -400,11 +405,14 @@ fn query_docids( let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - docids |= current_docids; + let exact_current_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); + docids |= current_docids | exact_current_docids; } Ok(docids) } else { - Ok(ctx.word_docids(&word)?.unwrap_or_default()) + let word_docids = ctx.word_docids(&word)?.unwrap_or_default(); + let exact_word_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); + Ok(word_docids | exact_word_docids) } } QueryKind::Tolerant { typo, word } => { @@ -512,6 +520,7 @@ pub mod test { pub struct TestContext<'t> { words_fst: fst::Set>, word_docids: HashMap, + exact_word_docids: HashMap, word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, @@ -527,6 +536,10 @@ pub mod test { Ok(self.word_docids.get(&word.to_string()).cloned()) } + fn exact_word_docids(&self, word: &str) -> heed::Result> { + Ok(self.exact_word_docids.get(&word.to_string()).cloned()) + } + fn word_prefix_docids(&self, word: &str) -> heed::Result> { Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) } @@ -643,6 +656,8 @@ pub mod test { s("morning") => random_postings(rng, 125), }; + let exact_word_docids = HashMap::new(); + let mut docid_words = HashMap::new(); for (word, docids) in word_docids.iter() { for docid in docids { @@ -712,6 +727,7 @@ pub mod test { TestContext { words_fst, word_docids, + exact_word_docids, word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c490e93da..54d30f8fb 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -284,7 +284,7 @@ where let mut word_pair_proximity_docids = None; let mut word_position_docids = None; let mut word_docids = None; - let mut _exact_word_docids = None; + let mut exact_word_docids = None; let mut databases_seen = 0; 
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -299,7 +299,7 @@ where word_docids = Some(cloneable_chunk); let cloneable_chunk = unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; - _exact_word_docids = Some(cloneable_chunk); + exact_word_docids = Some(cloneable_chunk); TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } } TypedChunk::WordPairProximityDocids(chunk) => { @@ -352,6 +352,7 @@ where self.execute_prefix_databases( word_docids, + exact_word_docids, word_pair_proximity_docids, word_position_docids, )?; @@ -363,6 +364,7 @@ where pub fn execute_prefix_databases( self, word_docids: Option>, + exact_word_docids: Option>, word_pair_proximity_docids: Option>, word_position_docids: Option>, ) -> Result<()> @@ -433,7 +435,10 @@ where if let Some(word_docids) = word_docids { let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); word_docids_builder.push(word_docids.into_cursor()?); - // TODO: push exact_word_docids + if let Some(exact_word_docids) = exact_word_docids { + word_docids_builder.push(exact_word_docids.into_cursor()?); + } + let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. let mut builder = WordPrefixDocids::new(self.wtxn, self.index); From ba0bb29cd8a1b748b325c2854ce9ea6daaf127a1 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 10:20:39 +0100 Subject: [PATCH 09/28] refactor WordPrefixDocids to take dbs instead of indexes --- milli/src/update/index_documents/mod.rs | 6 +++++- milli/src/update/word_prefix_docids.rs | 20 ++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 54d30f8fb..91d108c72 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -441,7 +441,11 @@ where let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. 
- let mut builder = WordPrefixDocids::new(self.wtxn, self.index); + let mut builder = WordPrefixDocids::new( + self.wtxn, + self.index.word_docids.clone(), + self.index.word_prefix_docids.clone(), + ); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 4114f8baf..b166812a5 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -1,16 +1,18 @@ use std::collections::{HashMap, HashSet}; use grenad::CompressionType; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str}; +use heed::Database; use crate::update::index_documents::{ create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn, }; -use crate::{Index, Result}; +use crate::{Result, RoaringBitmapCodec}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, + word_docids: Database, + word_prefix_docids: Database, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, @@ -20,11 +22,13 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, + word_docids: Database, + word_prefixes_docids: Database, ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, - index, + word_docids, + word_prefix_docids: word_prefixes_docids, chunk_compression_type: CompressionType::None, chunk_compression_level: None, max_nb_chunks: None, @@ -83,7 +87,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } // We fetch the docids associated to the newly added word prefix fst only. - let db = self.index.word_docids.remap_data_type::(); + let db = self.word_docids.remap_data_type::(); for prefix in new_prefix_fst_words { let prefix = std::str::from_utf8(prefix.as_bytes())?; for result in db.prefix_iter(self.wtxn, prefix)? { @@ -93,7 +97,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } // We remove all the entries that are no more required in this word prefix docids database. - let mut iter = self.index.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + let mut iter = self.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); while let Some((prefix, _)) = iter.next().transpose()? { if del_prefix_fst_words.contains(prefix.as_bytes()) { unsafe { iter.del_current()? }; @@ -105,7 +109,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // We finally write the word prefix docids into the LMDB database. 
sorter_into_lmdb_database( self.wtxn, - *self.index.word_prefix_docids.as_polymorph(), + *self.word_prefix_docids.as_polymorph(), prefix_docids_sorter, merge_roaring_bitmaps, )?; From 6dd2e4ffbd97bac64d0d3a7a5c39a51b0a5639a5 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 10:49:34 +0100 Subject: [PATCH 10/28] introduce exact_word_prefix database in index --- milli/src/index.rs | 8 +++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 67 ++++++++++++++++++---------- 3 files changed, 53 insertions(+), 24 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 3d6d954f0..80f62f684 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -61,6 +61,7 @@ pub mod db_name { pub const WORD_DOCIDS: &str = "word-docids"; pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; + pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; @@ -91,6 +92,9 @@ pub struct Index { /// A prefix of word and all the documents ids containing this prefix. pub word_prefix_docids: Database, + /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. + pub exact_word_prefix_docids: Database, + /// Maps a word and a document id (u32) to all the positions where the given word appears. pub docid_word_positions: Database, @@ -124,7 +128,7 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(15); + options.max_dbs(16); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -132,6 +136,7 @@ impl Index { let word_docids = env.create_database(Some(WORD_DOCIDS))?; let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; + let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = @@ -154,6 +159,7 @@ impl Index { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 57c0969c7..3665d2313 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -21,6 +21,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, @@ -58,6 +59,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; + exact_word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 46a4721c0..58c4d4f70 100644 --- 
a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{BytesDecode, BytesEncode}; +use heed::{BytesDecode, BytesEncode, Database}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -113,6 +113,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, field_id_word_count_docids, @@ -254,34 +255,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We write the new words FST into the main database. self.index.put_words_fst(self.wtxn, &new_words_fst)?; - // We iterate over the word prefix docids database and remove the deleted documents ids - // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. - let mut prefixes_to_delete = fst::SetBuilder::memory(); - let mut iter = word_prefix_docids.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (prefix, mut docids) = result?; - let prefix = prefix.to_owned(); - let previous_len = docids.len(); - docids -= &self.documents_ids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - prefixes_to_delete.insert(prefix)?; - } else if docids.len() != previous_len { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&prefix, &docids)? }; - } - } + let prefixes_to_delete = + remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?; - drop(iter); + let exact_prefix_to_delete = remove_from_word_prefix_docids( + self.wtxn, + exact_word_prefix_docids, + &self.documents_ids, + )?; + + let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); // We compute the new prefix FST and write it only if there is a change. - let prefixes_to_delete = prefixes_to_delete.into_set(); - if !prefixes_to_delete.is_empty() { + if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() { let new_words_prefixes_fst = { // We retrieve the current words prefixes FST from the database. let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; - let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference(); + let difference = + words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference(); // We stream the new external ids that does no more contains the to-delete external ids. let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); @@ -457,6 +448,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } +fn remove_from_word_prefix_docids( + txn: &mut heed::RwTxn, + db: &Database, + to_remove: &RoaringBitmap, +) -> Result>> { + let mut prefixes_to_delete = fst::SetBuilder::memory(); + + // We iterate over the word prefix docids database and remove the deleted documents ids + // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. + let mut iter = db.iter_mut(txn)?; + while let Some(result) = iter.next() { + let (prefix, mut docids) = result?; + let prefix = prefix.to_owned(); + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? 
}; + prefixes_to_delete.insert(prefix)?; + } else if docids.len() != previous_len { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&prefix, &docids)? }; + } + } + + drop(iter); + + Ok(prefixes_to_delete.into_set()) +} + fn remove_from_word_docids( txn: &mut heed::RwTxn, db: &heed::Database, From e8f06f6c0606b130b2e398246bb55ceeb51602b3 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:17:55 +0100 Subject: [PATCH 11/28] extract exact_word_prefix_docids --- milli/src/update/index_documents/mod.rs | 66 ++++++++++++++++++------- milli/src/update/word_prefix_docids.rs | 8 +-- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 91d108c72..0e6e59e10 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -4,11 +4,13 @@ mod transform; mod typed_chunk; use std::collections::HashSet; -use std::io::{Read, Seek}; +use std::io::{Cursor, Read, Seek}; use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; use crossbeam_channel::{Receiver, Sender}; +use heed::types::Str; +use heed::Database; use log::debug; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -20,7 +22,7 @@ pub use self::helpers::{ fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; -use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters}; +use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -28,7 +30,7 @@ use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; -use crate::{Index, Result}; +use crate::{Index, Result, RoaringBitmapCodec}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -433,25 +435,25 @@ where }); if let Some(word_docids) = word_docids { - let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); - word_docids_builder.push(word_docids.into_cursor()?); - if let Some(exact_word_docids) = exact_word_docids { - word_docids_builder.push(exact_word_docids.into_cursor()?); - } - - let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; - // Run the word prefix docids update operation. 
- let mut builder = WordPrefixDocids::new( + execute_word_prefix_docids( self.wtxn, + word_docids, self.index.word_docids.clone(), self.index.word_prefix_docids.clone(), - ); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - builder.execute( - word_docids_iter, + &self.indexer_config, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } + + if let Some(exact_word_docids) = exact_word_docids { + execute_word_prefix_docids( + self.wtxn, + exact_word_docids, + self.index.exact_word_docids.clone(), + self.index.exact_word_prefix_docids.clone(), + &self.indexer_config, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, @@ -516,6 +518,32 @@ where } } +/// Run the word prefix docids update operation. +fn execute_word_prefix_docids( + txn: &mut heed::RwTxn, + reader: grenad::Reader>, + word_docids_db: Database, + word_prefix_docids_db: Database, + indexer_config: &IndexerConfig, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + let cursor = reader.into_cursor()?; + let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db); + builder.chunk_compression_type = indexer_config.chunk_compression_type; + builder.chunk_compression_level = indexer_config.chunk_compression_level; + builder.max_nb_chunks = indexer_config.max_nb_chunks; + builder.max_memory = indexer_config.max_memory; + builder.execute( + cursor, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + Ok(()) +} + #[cfg(test)] mod tests { use std::io::Cursor; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index b166812a5..2887b5583 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -23,12 +23,12 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, word_docids: Database, - word_prefixes_docids: Database, + word_prefix_docids: Database, ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, word_docids, - word_prefix_docids: word_prefixes_docids, + word_prefix_docids, chunk_compression_type: CompressionType::None, chunk_compression_level: None, max_nb_chunks: None, @@ -39,7 +39,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute( self, - mut new_word_docids_iter: grenad::MergerIter, + mut new_word_docids_iter: grenad::ReaderCursor, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -57,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { if !common_prefix_fst_words.is_empty() { let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.next()? { + while let Some((word, data)) = new_word_docids_iter.move_on_next()? 
{ current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { From 21ae4143b177389dde584411107f6559a5fbe4aa Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:27:48 +0100 Subject: [PATCH 12/28] add exact_word_prefix to Context --- milli/src/search/criteria/mod.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index df9189239..3daa258bf 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -70,6 +70,7 @@ pub trait Context<'c> { fn word_docids(&self, word: &str) -> heed::Result>; fn exact_word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids( &self, left: &str, @@ -127,6 +128,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.word_prefix_docids.get(self.rtxn, &word) } + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { + self.index.exact_word_prefix_docids.get(self.rtxn, &word) + } + fn word_pair_proximity_docids( &self, left: &str, @@ -522,6 +527,7 @@ pub mod test { word_docids: HashMap, exact_word_docids: HashMap, word_prefix_docids: HashMap, + exact_word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, docid_words: HashMap>, @@ -544,6 +550,10 @@ pub mod test { Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) } + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { + Ok(self.exact_word_prefix_docids.get(&word.to_string()).cloned()) + } + fn word_pair_proximity_docids( &self, left: &str, @@ -672,6 +682,8 @@ pub mod test { s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], }; + let exact_word_prefix_docids = HashMap::new(); + let mut word_pair_proximity_docids = HashMap::new(); let mut word_prefix_pair_proximity_docids = HashMap::new(); for (lword, lcandidates) in &word_docids { @@ -729,6 +741,7 @@ pub mod test { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, docid_words, From 56b4f5dce2a32505e6e25b973880b7d682e4d4be Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:30:18 +0100 Subject: [PATCH 13/28] add exact prefix to query_docids --- milli/src/search/criteria/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3daa258bf..6ac076ea4 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -404,7 +404,9 @@ fn query_docids( match &query.kind { QueryKind::Exact { word, .. 
} => { if query.prefix && ctx.in_prefix_cache(&word) { - Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default()) + let doc_ids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); + let exact_docids = ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); + Ok(doc_ids | exact_docids) } else if query.prefix { let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); From 6b2c2509b2e5bfcd5f522a3129f2c8c42bed2c07 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 30 Mar 2022 16:07:59 +0200 Subject: [PATCH 14/28] fix bug in exact search --- milli/src/index.rs | 1 + milli/src/search/criteria/mod.rs | 35 ++++++++++++------- .../extract/extract_word_docids.rs | 2 ++ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 80f62f684..c7441c590 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -970,6 +970,7 @@ impl Index { .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? .unwrap_or_default()) } + pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { let attrs = self.exact_attributes(txn)?; let fid_map = self.fields_ids_map(txn)?; diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 6ac076ea4..05305d724 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -402,31 +402,42 @@ fn query_docids( wdcache: &mut WordDerivationsCache, ) -> Result { match &query.kind { - QueryKind::Exact { word, .. } => { + QueryKind::Exact { word, original_typo } => { if query.prefix && ctx.in_prefix_cache(&word) { - let doc_ids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); - let exact_docids = ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); - Ok(doc_ids | exact_docids) + let mut docids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); + } + Ok(docids) } else if query.prefix { let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { - let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - let exact_current_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); - docids |= current_docids | exact_current_docids; + docids |= ctx.word_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); + } } Ok(docids) } else { - let word_docids = ctx.word_docids(&word)?.unwrap_or_default(); - let exact_word_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); - Ok(word_docids | exact_word_docids) + let mut docids = ctx.word_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); + } + Ok(docids) } } QueryKind::Tolerant { typo, word } => { let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + for (word, typo) in words { + let mut current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + if *typo == 0 { + current_docids |= ctx.exact_word_docids(&word)?.unwrap_or_default() + } docids |= current_docids; } Ok(docids) 
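PATCH 14/28 above changes query_docids so that postings coming from the exact-word databases are merged into the result only when the matched word was not derived through typo tolerance (zero typos). As a reading aid, here is a minimal, self-contained Rust sketch of that rule; Ctx, DocidSet and docids_for are illustrative stand-ins, not milli's actual Context trait or its RoaringBitmap-backed LMDB databases.

use std::collections::{BTreeSet, HashMap};

type DocidSet = BTreeSet<u32>;

struct Ctx {
    // Stand-ins for the word_docids and exact_word_docids databases.
    word_docids: HashMap<String, DocidSet>,
    exact_word_docids: HashMap<String, DocidSet>,
}

impl Ctx {
    // `typos` is the number of typos that were needed to derive `word` from the query term.
    fn docids_for(&self, word: &str, typos: u8) -> DocidSet {
        let mut docids = self.word_docids.get(word).cloned().unwrap_or_default();
        // Exact-attribute postings only count for words that were not derived.
        if typos == 0 {
            if let Some(exact) = self.exact_word_docids.get(word) {
                docids.extend(exact.iter().copied());
            }
        }
        docids
    }
}

fn main() {
    let ctx = Ctx {
        word_docids: HashMap::from([("antebellum".to_string(), DocidSet::from([1]))]),
        exact_word_docids: HashMap::from([("antebellum".to_string(), DocidSet::from([2]))]),
    };
    // The word matched exactly: both databases contribute.
    assert_eq!(ctx.docids_for("antebellum", 0), DocidSet::from([1, 2]));
    // The word was reached through a typo: the exact-word database is ignored.
    assert_eq!(ctx.docids_for("antebellum", 1), DocidSet::from([1]));
}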
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5f231e5aa..fbc9f6919 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -69,9 +69,11 @@ pub fn extract_word_docids( } let fid = field_id_from_position(position); if exact_attributes.contains(&fid) && !added_to_exact { + println!("is exact: {}", std::str::from_utf8(&word_bytes).unwrap()); exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; } else if !added_to_word_docids { + println!("isnt exact: {}", std::str::from_utf8(&word_bytes).unwrap()); word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_word_docids = true; } From bfd81ce050c6f0723f7322300958a0834529bcf6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 30 Mar 2022 16:08:20 +0200 Subject: [PATCH 15/28] add exact attributes to cli settings --- cli/src/main.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 503b02887..6523cef2e 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashSet}; use std::fs::File; use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; @@ -99,8 +99,10 @@ impl Settings { }) .collect(); + let exact_attributes = index.exact_attributes(&txn)?; + println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\n", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"), @@ -109,6 +111,7 @@ impl Settings { stop_words.join("\n\t"), distinct_field.unwrap_or_default(), synonyms.into_iter().map(|(k, v)| format!("\n\t{}:\n{:?}", k, v)).collect::(), + exact_attributes.join("\n\t"), ); Ok(()) } @@ -463,6 +466,8 @@ struct SettingsUpdate { filterable_attributes: Option>, #[structopt(long)] criteria: Option>, + #[structopt(long)] + exact_attributes: Option>, } impl Performer for SettingsUpdate { @@ -489,6 +494,14 @@ impl Performer for SettingsUpdate { } } + if let Some(exact_attributes) = self.exact_attributes { + if !exact_attributes.is_empty() { + update.set_exact_attributes(exact_attributes.into_iter().collect()); + } else { + update.reset_exact_attributes(); + } + } + let mut bars = Vec::new(); let progesses = MultiProgress::new(); for _ in 0..4 { From c8d3a09af83d7f1cdfab65d45ac6173dfa1b31d3 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 14:47:07 +0200 Subject: [PATCH 16/28] add integration test for disabling typos on attributes --- milli/tests/search/typo_tolerance.rs | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index df15fb768..92d57c9b9 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -170,3 +170,40 @@ fn test_typo_disabled_on_word() { let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1); } + +#[test] +fn test_disable_typo_on_attribute() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("antebelum"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); + builder.execute(|_| ()).unwrap(); + + // typo is now supported for 4 letters words + let mut search = Search::new(&txn, &index); + search.query("antebelum"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); +} From 9963f11172a06fa79ed06c3baf8cb4ae727c743b Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 20:38:45 +0200 Subject: [PATCH 17/28] fix infos crate compilation issue --- cli/src/main.rs | 2 +- infos/src/main.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 6523cef2e..cf1e85984 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashSet}; +use std::collections::BTreeMap; use std::fs::File; use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; diff --git a/infos/src/main.rs b/infos/src/main.rs index dc98d410d..6a270833b 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -387,6 +387,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, documents, + .. } = index; let main_name = "main"; @@ -968,6 +969,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, + .. } = index; let names = if names.is_empty() { From 6cabd47c32bcf2ba53a3ebe94f254a7fe63de520 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 20:59:20 +0200 Subject: [PATCH 18/28] fix typo in comment --- milli/src/update/index_documents/extract/extract_word_docids.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index fbc9f6919..b577ef567 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -19,7 +19,7 @@ use crate::{field_id_from_position, FieldId, Result}; /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. 
/// -/// The first returned reader in the one for normal word_docids, and the second one is for +/// The first returned reader is the one for normal word_docids, and the second one is for /// exact_word_docids #[logging_timer::time] pub fn extract_word_docids( From b7694c34f53da8f3253236aff1a5b4a24503bf3c Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:00:07 +0200 Subject: [PATCH 19/28] remove println --- milli/src/update/index_documents/extract/extract_word_docids.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index b577ef567..5083bbd90 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -69,11 +69,9 @@ pub fn extract_word_docids( } let fid = field_id_from_position(position); if exact_attributes.contains(&fid) && !added_to_exact { - println!("is exact: {}", std::str::from_utf8(&word_bytes).unwrap()); exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; } else if !added_to_word_docids { - println!("isnt exact: {}", std::str::from_utf8(&word_bytes).unwrap()); word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_word_docids = true; } From 1810927dbd5f23b85c7e6d9c01d4e68907e84a3f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:04:49 +0200 Subject: [PATCH 20/28] rephrase exact_attributes doc --- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 3ed2a4152..7a26361d4 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -93,7 +93,7 @@ pub struct Settings<'a, 't, 'u, 'i> { min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, exact_words: Setting>, - /// attributes on which typo tolerance is not enabled. + /// Attributes on which typo tolerance is disabled. 
exact_attributes: Setting>, } From 59e41d98e303205fbb38b467d947c853d15f9ca8 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:17:06 +0200 Subject: [PATCH 21/28] add comments to integration test --- milli/tests/search/typo_tolerance.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 92d57c9b9..35cc4b4c2 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -181,6 +181,7 @@ fn test_disable_typo_on_attribute() { let txn = index.read_txn().unwrap(); let mut search = Search::new(&txn, &index); + // typo in `antebel(l)um` search.query("antebelum"); search.limit(10); search.authorize_typos(true); @@ -194,10 +195,10 @@ fn test_disable_typo_on_attribute() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); + // disable typos on `description` builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); builder.execute(|_| ()).unwrap(); - // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); search.query("antebelum"); search.limit(10); From ab185a59b5a969f76013670cb61c6892e435f32d Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:46:56 +0200 Subject: [PATCH 22/28] fix infos --- infos/src/main.rs | 26 ++++++++++++++++++++++++-- milli/src/update/delete_documents.rs | 2 -- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 6a270833b..05c168233 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -29,6 +29,8 @@ const ALL_DATABASE_NAMES: &[&str] = &[ FACET_ID_STRING_DOCIDS, FIELD_ID_DOCID_FACET_F64S, FIELD_ID_DOCID_FACET_STRINGS, + EXACT_WORD_DOCIDS, + EXACT_WORD_PREFIX_DOCIDS, DOCUMENTS, ]; @@ -384,10 +386,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + exact_word_docids, + exact_word_prefix_docids, field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, documents, - .. } = index; let main_name = "main"; @@ -437,6 +440,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } } + for result in exact_word_docids.remap_data_type::().iter(rtxn)? { + let (word, value) = result?; + heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } + for result in word_prefix_docids.remap_data_type::().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); @@ -445,6 +456,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } } + for result in exact_word_prefix_docids.remap_data_type::().iter(rtxn)? { + let (word, value) = result?; + heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } + for result in docid_word_positions.remap_data_type::().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); @@ -968,8 +987,9 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, + exact_word_prefix_docids, + exact_word_docids, documents, - .. 
} = index; let names = if names.is_empty() { @@ -993,6 +1013,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), + EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(), + EXACT_WORD_PREFIX_DOCIDS => exact_word_prefix_docids.as_polymorph(), DOCUMENTS => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 58c4d4f70..b347aae38 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -473,8 +473,6 @@ fn remove_from_word_prefix_docids( } } - drop(iter); - Ok(prefixes_to_delete.into_set()) } From dac81b2d44e479a838f20cc9bc14e37efa430d7f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:48:56 +0200 Subject: [PATCH 23/28] add missing \n in cli settings --- cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index cf1e85984..202c67707 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -102,7 +102,7 @@ impl Settings { let exact_attributes = index.exact_attributes(&txn)?; println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}\n", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"), From b85cd4983ea01b062ce5e3a2c79a8a3a06f7b0ed Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:50:34 +0200 Subject: [PATCH 24/28] remove field_id_from_position --- milli/src/lib.rs | 4 ---- .../src/update/index_documents/extract/extract_word_docids.rs | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index b68c76048..ba2bd9b0f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -74,10 +74,6 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi (field_id as u32) << 16 | (relative as u32) } -pub fn field_id_from_position(position: u32) -> FieldId { - (position >> 16 & 0xffff) as u16 -} - /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5083bbd90..0f8b4c039 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -12,7 +12,7 @@ use super::helpers::{ use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::index_documents::helpers::read_u32_ne_bytes; -use crate::{field_id_from_position, FieldId, Result}; +use crate::{relative_from_absolute_position, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. 
/// @@ -67,7 +67,7 @@ pub fn extract_word_docids( if added_to_exact && added_to_word_docids { break; } - let fid = field_id_from_position(position); + let (fid, _) = relative_from_absolute_position(position); if exact_attributes.contains(&fid) && !added_to_exact { exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; From 5cfd3d8407bd2bc11f6771385436681726ea8e12 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 14:10:22 +0200 Subject: [PATCH 25/28] add exact attributes documentation --- milli/src/index.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index c7441c590..42170bc80 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -964,6 +964,7 @@ impl Index { Ok(()) } + /// Returns the exact attributes: attributes for which typo is disallowed. pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result> { Ok(self .main @@ -971,17 +972,20 @@ impl Index { .unwrap_or_default()) } + /// Returns the list of exact attributes field ids. pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { let attrs = self.exact_attributes(txn)?; let fid_map = self.fields_ids_map(txn)?; Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect()) } + /// Writes the exact attributes to the database. pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; Ok(()) } + /// Clears the exact attributes from the store. pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> { self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?; Ok(()) From 201fea0fdaae3a334936a8ad52e2c5de8f178a84 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 14:14:15 +0200 Subject: [PATCH 26/28] limit extract_word_docids memory usage --- milli/src/update/delete_documents.rs | 1 + .../src/update/index_documents/extract/extract_word_docids.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b347aae38..77c32f0fb 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -502,6 +502,7 @@ fn remove_from_word_docids( } } } + Ok(()) } diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 0f8b4c039..f3a44162b 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -34,7 +34,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|x| x / 2), ); let mut exact_word_docids_sorter = create_sorter( @@ -42,7 +42,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|x| x / 2), ); let mut value_buffer = Vec::new(); From b799f3326b982e382f8f1b7a809f1abe1521c008 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 18:44:35 +0200 Subject: [PATCH 27/28] rename merge_nothing to merge_ignore_values --- milli/src/update/index_documents/helpers/grenad_helpers.rs | 2 +- milli/src/update/index_documents/helpers/mod.rs | 6 +++--- milli/src/update/index_documents/typed_chunk.rs | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index fb5242910..9d5a67d78 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -279,6 +279,6 @@ pub fn sorter_into_lmdb_database( } /// Used when trying to merge readers, but you don't actually care about the values. -pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { +pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { Ok(Cow::Owned(Vec::new())) } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 4642bcf14..79d0d0466 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing, - sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, - GrenadParameters, MergeableReader, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, + writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index be440114f..26b97c3a0 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -9,8 +9,8 @@ use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, - CursorClonableMmap, + self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, + valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; @@ -226,7 +226,7 @@ fn merge_word_docids_reader_into_fst( word_docids_iter: grenad::Reader>, exact_word_docids_iter: grenad::Reader>, ) -> Result>> { - let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn); + let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn); merger_builder.push(word_docids_iter.into_cursor()?); merger_builder.push(exact_word_docids_iter.into_cursor()?); let mut iter = merger_builder.build().into_stream_merger_iter()?; From 86249e2ae43e5a2e9bbdc747435fc6938ce2abc5 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 21:35:06 +0200 Subject: [PATCH 28/28] add missing \t in cli update display MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 202c67707..542b9d472 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -102,7 +102,7 @@ impl Settings { let exact_attributes = index.exact_attributes(&txn)?; println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable 
attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}\n", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n\t{}\n", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"),
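For completeness, here is a hypothetical end-to-end usage sketch of the setting introduced by this series, modeled on the integration test added in PATCH 16/28. The helper name disable_typos_on_description and the exact import paths are assumptions made for illustration; the sketch presumes an already-built and populated milli::Index whose documents contain a `description` field.

use std::collections::HashSet;

use milli::update::{IndexerConfig, Settings};
use milli::{Index, Result, Search};

// Hypothetical helper, not part of the patch series: disable typo tolerance on
// every word indexed under `description`, then run a typo-containing query.
fn disable_typos_on_description(index: &Index) -> Result<()> {
    let mut wtxn = index.write_txn()?;
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, index, &config);
    builder.set_exact_attributes(HashSet::from(["description".to_string()]));
    builder.execute(|_| ())?;
    wtxn.commit()?;

    // As in the integration test, "antebelum" (a typo for "antebellum") should no
    // longer match documents whose only occurrence of the word is in `description`.
    let rtxn = index.read_txn()?;
    let mut search = Search::new(&rtxn, index);
    search.query("antebelum");
    search.authorize_typos(true);
    let _result = search.execute()?;
    Ok(())
}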