From 9b2653427ded198a8d744e112dba68a93470dd51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Thu, 23 Mar 2023 09:22:01 +0100
Subject: [PATCH] Split position DB into fid and relative position DB

---
 milli/src/heed_codec/mod.rs                   |   2 +-
 milli/src/heed_codec/str_beu32_codec.rs       |  34 ++++
 milli/src/index.rs                            |  21 ++-
 milli/src/lib.rs                              |  17 ++
 milli/src/search/criteria/attribute.rs        |  17 +-
 milli/src/search/criteria/mod.rs              |  12 +-
 milli/src/update/clear_documents.rs           |   4 +
 milli/src/update/delete_documents.rs          | 162 ++++++------------
 .../extract/extract_word_position_docids.rs   |  15 +-
 .../src/update/index_documents/extract/mod.rs |   4 +-
 .../update/words_prefix_position_docids.rs    |   9 +-
 11 files changed, 162 insertions(+), 135 deletions(-)
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index a4df63e22..b7a8c3c88 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{
     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
 };
 pub use self::script_language_codec::ScriptLanguageCodec;
-pub use self::str_beu32_codec::StrBEU32Codec;
+pub use self::str_beu32_codec::{StrBEU32Codec, StrBEU16Codec};
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs
index d1f379bdc..17f3c996f 100644
--- a/milli/src/heed_codec/str_beu32_codec.rs
+++ b/milli/src/heed_codec/str_beu32_codec.rs
@@ -36,3 +36,37 @@ impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
         Some(Cow::Owned(bytes))
     }
 }
+
+/// Codec for keys made of a UTF-8 string followed by a big-endian `u16`.
+pub struct StrBEU16Codec;
+
+impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
+    type DItem = (&'a str, u16);
+
+    /// Splits off the last two bytes as a big-endian `u16`; the rest must be valid UTF-8.
+    /// Returns `None` if there are fewer than two bytes or the string part is not UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let word_len = bytes.len().checked_sub(size_of::<u16>())?;
+        let (word_bytes, footer) = bytes.split_at(word_len);
+
+        let word = str::from_utf8(word_bytes).ok()?;
+        let footer: [u8; 2] = footer.try_into().ok()?;
+
+        Some((word, u16::from_be_bytes(footer)))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
+    type EItem = (&'a str, u16);
+
+    /// Concatenates the bytes of the word with the big-endian bytes of the `u16`.
+    fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
+        let footer = pos.to_be_bytes();
+        let word_bytes = word.as_bytes();
+        let mut key = Vec::with_capacity(word_bytes.len() + footer.len());
+        key.extend_from_slice(word_bytes);
+        key.extend_from_slice(&footer);
+
+        Some(Cow::Owned(key))
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index a4048dfb0..7848ddf5a 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -19,12 +19,12 @@ use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
+use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
     FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
-    Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32,
+    Search, U8StrStrCodec, BEU16, BEU32,
 };
 
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -76,7 +76,9 @@ pub mod db_name {
     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
     pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
+    pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
+    pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = "word-prefix-field-id-docids";
     pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
     pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
     pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
@@ -118,11 +120,16 @@ pub struct Index {
     pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
 
     /// Maps the word and the position with the docids that corresponds to it.
-    pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
+    pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    /// Maps the word and the field id with the docids that correspond to it.
+    pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+
     /// Maps the field id and the word count with the docids that corresponds to it.
     pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
     /// Maps the position of a word prefix with all the docids where this prefix appears.
-    pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
+    pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    /// Maps the word prefix and the field id with the docids that correspond to it.
+    pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
 
     /// Maps the script and language with all the docids that corresponds to it.
     pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
@@ -153,7 +160,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(19);
+        options.max_dbs(21);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
@@ -170,8 +177,10 @@ impl Index {
         let prefix_word_pair_proximity_docids =
             env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
         let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
+        let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?;
         let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
         let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
+        let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
         let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
         let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
         let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
@@ -196,7 +205,9 @@ impl Index {
             word_prefix_pair_proximity_docids,
             prefix_word_pair_proximity_docids,
             word_position_docids,
+            word_fid_docids,
             word_prefix_position_docids,
+            word_prefix_fid_docids,
             field_id_word_count_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index b256192bd..a62c344f9 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -152,6 +152,23 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative
 pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
     (field_id as u32) << 16 | (relative as u32)
 }
+// TODO: this is wrong, but will do for now
+/// Compute the "bucketed" relative position from a relative position in a field.
+///
+/// Positions below 16 are kept intact, positions from 16 to 23 become 24, and larger positions
+/// are rounded up to a power of two (saturating at `u16::MAX` when that would exceed 32768).
+pub fn bucketed_position(relative: u16) -> u16 {
+    // The first few relative positions are kept intact.
+    if relative < 16 {
+        relative
+    } else if relative < 24 {
+        // Relative positions from 16 to 23 all become equal to 24.
+        24
+    } else {
+        // Integer math: same buckets as `log2().ceil().exp2()`, without float rounding.
+        relative.checked_next_power_of_two().unwrap_or(u16::MAX)
+    }
+}
 
 /// Transform a raw obkv store into a JSON Object.
 pub fn obkv_to_json(
diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs
index 5b33fdf54..322f6e051 100644
--- a/milli/src/search/criteria/attribute.rs
+++ b/milli/src/search/criteria/attribute.rs
@@ -199,7 +199,7 @@ impl<'t> Criterion for Attribute<'t> {
 struct QueryPositionIterator<'t> {
     #[allow(clippy::type_complexity)]
     inner:
-        Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u32), RoaringBitmap)>> + 't>>>,
+        Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u16), RoaringBitmap)>> + 't>>>,
 }
 
 impl<'t> QueryPositionIterator<'t> {
@@ -241,7 +241,7 @@ impl<'t> QueryPositionIterator<'t> {
 }
 
 impl<'t> Iterator for QueryPositionIterator<'t> {
-    type Item = heed::Result<(u32, RoaringBitmap)>;
+    type Item = heed::Result<(u16, RoaringBitmap)>;
 
     fn next(&mut self) -> Option<Self::Item> {
         // sort inner words from the closest next position to the farthest next position.
@@ -281,9 +281,9 @@ impl<'t> Iterator for QueryPositionIterator<'t> {
 /// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
 /// This branch allows us to iterate over meta-interval of positions.
 struct Branch<'t> {
-    query_level_iterator: Vec<(u32, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
-    last_result: (u32, RoaringBitmap),
-    branch_size: u32,
+    query_level_iterator: Vec<(u16, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
+    last_result: (u16, RoaringBitmap),
+    branch_size: u16,
 }
 
 impl<'t> Branch<'t> {
@@ -303,7 +303,7 @@ impl<'t> Branch<'t> {
         let mut branch = Self {
             query_level_iterator,
             last_result: (0, RoaringBitmap::new()),
-            branch_size: flatten_branch.len() as u32,
+            branch_size: flatten_branch.len() as u16,
         };
 
         branch.update_last_result();
@@ -342,7 +342,7 @@ impl<'t> Branch<'t> {
                         Some(result) => {
                             result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0)
                         }
-                        None => u32::MAX,
+                        None => u16::MAX,
                     }
                 }
             })
@@ -378,7 +378,8 @@ impl<'t> Branch<'t> {
     fn compute_rank(&self) -> u32 {
         // we compute a rank from the position.
         let (pos, _) = self.last_result;
-        pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size
+        pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS
+            / self.branch_size as u32
     }
 
     fn cmp(&self, other: &Self) -> Ordering {
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs
index 0c1c8add1..5e491672f 100644
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@@ -171,7 +171,7 @@ pub trait Context<'c> {
         &self,
         word: &str,
         in_prefix_cache: bool,
-    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>;
+    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>;
     fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
     fn field_id_word_count_docids(
@@ -322,11 +322,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
         &self,
         word: &str,
         in_prefix_cache: bool,
-    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>
+    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>
     {
         let range = {
-            let left = u32::min_value();
-            let right = u32::max_value();
+            let left = u16::min_value(); // TODO: this is wrong
+            let right = u16::max_value(); // TODO: this is wrong
             let left = (word, left);
             let right = (word, right);
             left..=right
@@ -360,7 +360,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
     }
 
     fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>> {
-        let key = (word, pos);
+        let key = (word, pos as u16); // TODO: this is wrong
         self.index.word_position_docids.get(self.rtxn, &key)
     }
 }
@@ -899,7 +899,7 @@ pub mod test {
             _word: &str,
             _in_prefix_cache: bool,
         ) -> heed::Result<
-            Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>,
+            Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>,
         > {
             todo!()
         }
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 0296bc192..c9de4d9ab 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -28,8 +28,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_prefix_pair_proximity_docids,
             prefix_word_pair_proximity_docids,
             word_position_docids,
+            word_fid_docids,
             field_id_word_count_docids,
             word_prefix_position_docids,
+            word_prefix_fid_docids,
             script_language_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
@@ -81,8 +83,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_prefix_pair_proximity_docids.clear(self.wtxn)?;
         prefix_word_pair_proximity_docids.clear(self.wtxn)?;
         word_position_docids.clear(self.wtxn)?;
+        word_fid_docids.clear(self.wtxn)?;
         field_id_word_count_docids.clear(self.wtxn)?;
         word_prefix_position_docids.clear(self.wtxn)?;
+        word_prefix_fid_docids.clear(self.wtxn)?;
         script_language_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
         facet_id_exists_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index eeb67b829..47a7bde4c 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -2,8 +2,8 @@ use std::collections::btree_map::Entry;
 use std::collections::{HashMap, HashSet};
 
 use fst::IntoStreamer;
-use heed::types::{ByteSlice, DecodeIgnore, Str};
-use heed::Database;
+use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
+use heed::{BytesDecode, BytesEncode, Database, RwIter};
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use time::OffsetDateTime;
@@ -239,6 +239,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             prefix_word_pair_proximity_docids,
             word_position_docids,
             word_prefix_position_docids,
+            word_fid_docids,
+            word_prefix_fid_docids,
             facet_id_f64_docids: _,
             facet_id_string_docids: _,
             field_id_docid_facet_f64s: _,
@@ -361,97 +363,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
             // We delete the documents ids from the word prefix pair proximity database docids
             // and remove the empty pairs too.
-            let db = db.remap_key_type::<ByteSlice>();
-            let mut iter = db.iter_mut(self.wtxn)?;
-            while let Some(result) = iter.next() {
-                let (key, mut docids) = result?;
-                let previous_len = docids.len();
-                docids -= &self.to_delete_docids;
-                if docids.is_empty() {
-                    // safety: we don't keep references from inside the LMDB database.
-                    unsafe { iter.del_current()? };
-                } else if docids.len() != previous_len {
-                    let key = key.to_owned();
-                    // safety: we don't keep references from inside the LMDB database.
-                    unsafe { iter.put_current(&key, &docids)? };
-                }
-            }
+            Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?;
         }
-
-        // We delete the documents ids that are under the pairs of words,
-        // it is faster and use no memory to iterate over all the words pairs than
-        // to compute the cartesian product of every words of the deleted documents.
-        let mut iter =
-            word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
-        while let Some(result) = iter.next() {
-            let (bytes, mut docids) = result?;
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let bytes = bytes.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&bytes, &docids)? };
-            }
-        }
-
-        drop(iter);
-
-        // We delete the documents ids that are under the word level position docids.
-        let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
-        while let Some(result) = iter.next() {
-            let (bytes, mut docids) = result?;
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let bytes = bytes.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&bytes, &docids)? };
-            }
-        }
-
-        drop(iter);
-
-        // We delete the documents ids that are under the word prefix level position docids.
-        let mut iter =
-            word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
-        while let Some(result) = iter.next() {
-            let (bytes, mut docids) = result?;
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let bytes = bytes.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&bytes, &docids)? };
-            }
-        }
-
-        drop(iter);
+        Self::delete_from_db(
+            word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
+        Self::delete_from_db(
+            word_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
+        Self::delete_from_db(
+            word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
+        Self::delete_from_db(
+            word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
+        Self::delete_from_db(
+            word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
 
         // Remove the documents ids from the field id word count database.
-        let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
-        while let Some((key, mut docids)) = iter.next().transpose()? {
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let key = key.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&key, &docids)? };
-            }
-        }
-
-        drop(iter);
+        Self::delete_from_db(
+            field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
 
         if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? {
             let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?;
@@ -501,21 +440,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         }
 
         // Remove the documents ids from the script language database.
-        let mut iter = script_language_docids.iter_mut(self.wtxn)?;
-        while let Some((key, mut docids)) = iter.next().transpose()? {
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let key = key.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&key, &docids)? };
-            }
-        }
-
-        drop(iter);
+        Self::delete_from_db(
+            script_language_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
         // We delete the documents ids that are under the facet field id values.
         remove_docids_from_facet_id_exists_docids(
             self.wtxn,
@@ -531,6 +459,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             soft_deletion_used: false,
         })
     }
+
+    /// Subtracts `to_delete_docids` from every bitmap yielded by `iter`, deleting emptied entries.
+    fn delete_from_db<C>(
+        mut iter: RwIter<UnalignedSlice<u8>, C>,
+        to_delete_docids: &RoaringBitmap,
+    ) -> Result<()>
+    where
+        C: for<'a> BytesDecode<'a, DItem = RoaringBitmap>
+            + for<'a> BytesEncode<'a, EItem = RoaringBitmap>,
+    {
+        while let Some((key, mut docids)) = iter.next().transpose()? {
+            let len_before = docids.len();
+            docids -= to_delete_docids;
+            if docids.is_empty() {
+                // SAFETY: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+            } else if docids.len() != len_before {
+                let key = key.to_owned();
+                // SAFETY: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&key, &docids)? };
+            }
+        }
+        Ok(())
+    }
 }
 
 fn remove_from_word_prefix_docids(
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index d95db4157..cd3ec691b 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -7,14 +7,17 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{DocumentId, Result};
+use crate::{
+    absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
+    DocumentId, Result,
+};
 
 /// Extracts the word positions and the documents ids where this word appear.
 ///
 /// Returns a grenad reader with the list of extracted words at positions and
 /// documents ids from the given chunk of docid word positions.
 #[logging_timer::time]
-pub fn extract_word_position_docids<R: io::Read + io::Seek>(
+pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
 ) -> Result<grenad::Reader<File>> {
@@ -39,11 +42,15 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
         for position in read_u32_ne_bytes(value) {
             key_buffer.clear();
             key_buffer.extend_from_slice(word_bytes);
+            let (fid, position) = relative_from_absolute_position(position);
+            let position = bucketed_position(position);
+            let position = absolute_from_relative_position(fid, position);
             key_buffer.extend_from_slice(&position.to_be_bytes());
-
             word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
         }
     }
 
-    sorter_into_reader(word_position_docids_sorter, indexer)
+    let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
+
+    Ok(word_position_docids_reader)
 }
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index c0f07cf79..844efed36 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_word_docids::extract_word_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
-use self::extract_word_position_docids::extract_word_position_docids;
+use self::extract_word_position_docids::extract_word_fid_and_position_docids;
 use super::helpers::{
     as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
     GrenadParameters, MergeFn, MergeableReader,
@@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents(
         docid_word_positions_chunks,
         indexer,
         lmdb_writer_sx.clone(),
-        extract_word_position_docids,
+        extract_word_fid_and_position_docids,
         merge_cbo_roaring_bitmaps,
         TypedChunk::WordPositionDocids,
         "word-position-docids",
diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs
index 6f12dde38..0822d0d26 100644
--- a/milli/src/update/words_prefix_position_docids.rs
+++ b/milli/src/update/words_prefix_position_docids.rs
@@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode};
 use log::debug;
 
 use crate::error::SerializationError;
-use crate::heed_codec::StrBEU32Codec;
+use crate::heed_codec::{StrBEU16Codec, StrBEU32Codec};
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
 use crate::update::index_documents::{
     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
     CursorClonableMmap, MergeFn,
 };
-use crate::{Index, Result};
+use crate::{bucketed_position, relative_from_absolute_position, Index, Result};
 
 pub struct WordPrefixPositionDocids<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -82,6 +82,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
             let mut prefixes_cache = HashMap::new();
             while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? {
                 let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
+                let (_fid, pos) = relative_from_absolute_position(pos);
 
                 current_prefixes = match current_prefixes.take() {
                     Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
@@ -127,12 +128,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
             let iter = db
                 .remap_key_type::<ByteSlice>()
                 .prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
-                .remap_key_type::<StrBEU32Codec>();
+                .remap_key_type::<StrBEU16Codec>();
             for result in iter {
                 let ((word, pos), data) = result?;
                 if word.starts_with(prefix) {
                     let key = (prefix, pos);
-                    let bytes = StrBEU32Codec::bytes_encode(&key).unwrap();
+                    let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
                     prefix_position_docids_sorter.insert(bytes, data)?;
                 }
             }