Document and refine facet indexing algorithms

2025-04-18 07:57:59 +02:00 · 2022-09-07 16:44:08 +02:00 · 2022-09-07 16:44:08 +02:00 · 27454e9828
commit 27454e9828
parent bee3c23b45
5 changed files with 387 additions and 291 deletions
--- a/milli/src/heed_codec/facet/mod.rs
+++ b/milli/src/heed_codec/facet/mod.rs
@ -29,31 +29,14 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
    }
 }

+/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
+/// databases.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub struct FacetGroupKey<T> {
    pub field_id: u16,
    pub level: u8,
    pub left_bound: T,
 }
-impl<'a> FacetGroupKey<&'a [u8]> {
-    pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> {
-        FacetGroupKey {
-            field_id: self.field_id,
-            level: self.level,
-            left_bound: self.left_bound.to_vec(),
-        }
-    }
-}
-
-impl<'a> FacetGroupKey<Vec<u8>> {
-    pub fn as_ref(&self) -> FacetGroupKey<&[u8]> {
-        FacetGroupKey {
-            field_id: self.field_id,
-            level: self.level,
-            left_bound: self.left_bound.as_slice(),
-        }
-    }
-}

 #[derive(Debug)]
 pub struct FacetGroupValue {
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@ -1,24 +1,30 @@
-use std::borrow::Cow;
-use std::cmp;
-use std::fs::File;
-
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-use heed::{BytesEncode, Error, RoTxn, RwTxn};
-use log::debug;
-use roaring::RoaringBitmap;
-use time::OffsetDateTime;
-
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{
    ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 use crate::update::index_documents::{create_writer, writer_into_reader};
 use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
+use grenad::CompressionType;
+use heed::types::ByteSlice;
+use heed::{BytesEncode, Error, RoTxn, RwTxn};
+use log::debug;
+use roaring::RoaringBitmap;
+use std::borrow::Cow;
+use std::fs::File;
+use time::OffsetDateTime;

+use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
+
+/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
+/// by rebuilding the database "from scratch".
+///
+/// First, the new elements are inserted into the level 0 of the database. Then, the
+/// higher levels are cleared and recomputed from the content of level 0.
+///
+/// Finally, the `faceted_documents_ids` value in the main database of `Index`
+/// is updated to contain the new set of faceted documents.
 pub struct FacetsUpdateBulk<'i> {
    index: &'i Index,
-    database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
    group_size: u8,
    min_level_size: u8,
    facet_type: FacetType,
@ -31,22 +37,10 @@ impl<'i> FacetsUpdateBulk<'i> {
        index: &'i Index,
        facet_type: FacetType,
        new_data: grenad::Reader<File>,
+        group_size: u8,
+        min_level_size: u8,
    ) -> FacetsUpdateBulk<'i> {
-        FacetsUpdateBulk {
-            index,
-            database: match facet_type {
-                FacetType::String => index
-                    .facet_id_string_docids
-                    .remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
-                FacetType::Number => {
-                    index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
-                }
-            },
-            group_size: 4,
-            min_level_size: 5,
-            facet_type,
-            new_data: Some(new_data),
-        }
+        FacetsUpdateBulk { index, group_size, min_level_size, facet_type, new_data: Some(new_data) }
    }

    pub fn new_not_updating_level_0(
@ -55,44 +49,31 @@ impl<'i> FacetsUpdateBulk<'i> {
    ) -> FacetsUpdateBulk<'i> {
        FacetsUpdateBulk {
            index,
-            database: match facet_type {
-                FacetType::String => index
-                    .facet_id_string_docids
-                    .remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
-                FacetType::Number => {
-                    index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
-                }
-            },
-            group_size: 4,
-            min_level_size: 5,
+            group_size: FACET_GROUP_SIZE,
+            min_level_size: FACET_MIN_LEVEL_SIZE,
            facet_type,
            new_data: None,
        }
    }

-    /// The number of elements from the level below that are represented by a single element in the level above
-    ///
-    /// This setting is always greater than or equal to 2.
-    pub fn level_group_size(mut self, value: u8) -> Self {
-        self.group_size = cmp::max(value, 2);
-        self
-    }
-
-    /// The minimum number of elements that a level is allowed to have.
-    pub fn min_level_size(mut self, value: u8) -> Self {
-        self.min_level_size = cmp::max(value, 2);
-        self
-    }
-
    #[logging_timer::time("FacetsUpdateBulk::{}")]
    pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
        debug!("Computing and writing the facet values levels docids into LMDB on disk...");

-        let Self { index, database, group_size, min_level_size, facet_type, new_data } = self;
+        let Self { index, group_size, min_level_size, facet_type, new_data } = self;
+
+        let db = match facet_type {
+            FacetType::String => {
+                index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
+            }
+            FacetType::Number => {
+                index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
+            }
+        };

        index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;

-        let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size };
+        let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };

        let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Box<[_]>>();

@ -105,6 +86,7 @@ impl<'i> FacetsUpdateBulk<'i> {
    }
 }

+/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
 pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
    pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
    pub new_data: Option<grenad::Reader<R>>,
--- a/milli/src/update/facet/incremental.rs
+++ b/milli/src/update/facet/incremental.rs
@ -1,16 +1,14 @@
-use std::collections::HashMap;
-use std::fs::File;
-
-use heed::types::ByteSlice;
-use heed::{BytesDecode, Error, RoTxn, RwTxn};
-use roaring::RoaringBitmap;
-
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{
    ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 use crate::search::facet::get_highest_level;
 use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
+use heed::types::{ByteSlice, DecodeIgnore};
+use heed::{BytesDecode, Error, RoTxn, RwTxn};
+use roaring::RoaringBitmap;
+use std::collections::HashMap;
+use std::fs::File;

 enum InsertionResult {
    InPlace,
@ -18,10 +16,15 @@ enum InsertionResult {
 }
 enum DeletionResult {
    InPlace,
-    Reduce { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
-    Remove { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
+    Reduce { next: Option<Vec<u8>> },
+    Remove { next: Option<Vec<u8>> },
 }

+/// Algorithm to incrementally insert and delete elements into the
+/// `facet_id_(string/f64)_docids` databases.
+///
+/// The `faceted_documents_ids` value in the main database of `Index`
+/// is also updated to contain the new set of faceted documents.
 pub struct FacetsUpdateIncremental<'i> {
    index: &'i Index,
    inner: FacetsUpdateIncrementalInner,
@ -30,7 +33,14 @@ pub struct FacetsUpdateIncremental<'i> {
 }

 impl<'i> FacetsUpdateIncremental<'i> {
-    pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
+    pub fn new(
+        index: &'i Index,
+        facet_type: FacetType,
+        new_data: grenad::Reader<File>,
+        group_size: u8,
+        min_level_size: u8,
+        max_group_size: u8,
+    ) -> Self {
        FacetsUpdateIncremental {
            index,
            inner: FacetsUpdateIncrementalInner {
@ -42,26 +52,15 @@ impl<'i> FacetsUpdateIncremental<'i> {
                        .facet_id_f64_docids
                        .remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
                },
-                group_size: 4,
-                max_group_size: 8,
-                min_level_size: 5,
+                group_size,
+                max_group_size,
+                min_level_size,
            },
            facet_type,
            new_data,
        }
    }
-    pub fn group_size(mut self, size: u8) -> Self {
-        self.inner.group_size = size;
-        self
-    }
-    pub fn min_level_size(mut self, size: u8) -> Self {
-        self.inner.min_level_size = size;
-        self
-    }
-    pub fn max_group_size(mut self, size: u8) -> Self {
-        self.inner.max_group_size = size;
-        self
-    }
+
    pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
        let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();

@ -83,6 +82,7 @@ impl<'i> FacetsUpdateIncremental<'i> {
    }
 }

+/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type
 pub struct FacetsUpdateIncrementalInner {
    pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
    pub group_size: u8,
@ -90,22 +90,36 @@ pub struct FacetsUpdateIncrementalInner {
    pub max_group_size: u8,
 }
 impl FacetsUpdateIncrementalInner {
+    /// Find the `FacetGroupKey`/`FacetGroupValue` in the database that
+    /// should be used to insert the new `facet_value` for the given `field_id` and `level`
+    /// where `level` must be strictly greater than 0.
+    ///
+    /// For example, when inserting the facet value `4`, there are two possibilities:
+    ///
+    /// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore,
+    ///    we know that the implicit range of the first key is 3..6, which contains 4.
+    ///    So the new facet value belongs in that first key/value pair.
+    ///
+    /// 2. The first key of the level has a lower bound of `5`. We return this key/value pair
+    ///    but will need to change the lower bound of this key to `4` in order to insert this facet value.
    fn find_insertion_key_value(
        &self,
        field_id: u16,
        level: u8,
-        search_key: &[u8],
+        facet_value: &[u8],
        txn: &RoTxn,
    ) -> Result<(FacetGroupKey<Vec<u8>>, FacetGroupValue)> {
+        assert!(level > 0);
+
        let mut prefix = vec![];
        prefix.extend_from_slice(&field_id.to_be_bytes());
        prefix.push(level);
-        prefix.extend_from_slice(search_key);
+        prefix.extend_from_slice(facet_value);

        let mut prefix_iter = self
            .db
            .as_polymorph()
-            .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(txn, &prefix.as_slice())?;
+            .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?;
        if let Some(e) = prefix_iter.next() {
            let (key_bytes, value) = e?;
            Ok((
@ -115,10 +129,10 @@ impl FacetsUpdateIncrementalInner {
                value,
            ))
        } else {
-            let key = FacetGroupKey { field_id, level, left_bound: search_key };
+            let key = FacetGroupKey { field_id, level, left_bound: facet_value };
            match self.db.get_lower_than(txn, &key)? {
                Some((key, value)) => {
-                    if key.level != level || key.field_id != field_id {
+                    if key.level != level {
                        let mut prefix = vec![];
                        prefix.extend_from_slice(&field_id.to_be_bytes());
                        prefix.push(level);
@ -126,7 +140,7 @@ impl FacetsUpdateIncrementalInner {
                        let mut iter = self
                            .db
                            .as_polymorph()
-                            .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(
+                            .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(
                                txn,
                                &prefix.as_slice(),
                            )?;
@ -146,15 +160,19 @@ impl FacetsUpdateIncrementalInner {
        }
    }

+    /// Insert the given facet value and corresponding document ids in the level 0 of the database
+    ///
+    /// ## Return
+    /// See documentation of `insert_in_level`
    fn insert_in_level_0<'t>(
        &self,
        txn: &'t mut RwTxn,
        field_id: u16,
-        new_key: &[u8],
-        new_values: &RoaringBitmap,
+        facet_value: &[u8],
+        docids: &RoaringBitmap,
    ) -> Result<InsertionResult> {
-        let key = FacetGroupKey { field_id, level: 0, left_bound: new_key };
-        let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 };
+        let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
+        let value = FacetGroupValue { bitmap: docids.clone(), size: 1 };

        let mut level0_prefix = vec![];
        level0_prefix.extend_from_slice(&field_id.to_be_bytes());
@ -163,7 +181,7 @@ impl FacetsUpdateIncrementalInner {
        let mut iter = self
            .db
            .as_polymorph()
-            .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level0_prefix)?;
+            .prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, &level0_prefix)?;

        if iter.next().is_none() {
            drop(iter);
@ -186,143 +204,158 @@ impl FacetsUpdateIncrementalInner {
            }
        }
    }
+
+    /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`.
+    /// This function works recursively.
+    ///
+    /// ## Return
+    /// Returns the effect of adding the facet value to the database on the given `level`.
+    ///
+    /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have
+    /// an effect on the number of keys in that level. Therefore, it did not increase the number of children
+    /// of the parent node.
+    ///
+    /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted
+    /// in the addition of a new key in that level, and that therefore the number of children
+    /// of the parent node should be incremented.
    fn insert_in_level<'t>(
        &self,
        txn: &'t mut RwTxn,
        field_id: u16,
        level: u8,
-        new_key: &[u8],
-        new_values: &RoaringBitmap,
+        facet_value: &[u8],
+        docids: &RoaringBitmap,
    ) -> Result<InsertionResult> {
        if level == 0 {
-            return self.insert_in_level_0(txn, field_id, new_key, new_values);
+            return self.insert_in_level_0(txn, field_id, facet_value, docids);
        }

        let max_group_size = self.max_group_size;

-        let (insertion_key, insertion_value) =
-            self.find_insertion_key_value(field_id, level, new_key, txn)?;
-
-        let result = self.insert_in_level(txn, field_id, level - 1, new_key.clone(), new_values)?;
+        let result = self.insert_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?;
        // level below inserted an element

-        let insertion_key = {
-            let mut new_insertion_key = insertion_key.clone();
-            let mut modified = false;
-
-            if new_key < insertion_key.left_bound.as_slice() {
-                new_insertion_key.left_bound = new_key.to_vec();
-                modified = true;
-            }
-            if modified {
-                let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
-                assert!(is_deleted);
-                self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
-            }
-            new_insertion_key
-        };
+        let (insertion_key, insertion_value) =
+            self.find_insertion_key_value(field_id, level, facet_value, txn)?;

        match result {
-            // TODO: this could go above the block recomputing insertion key
-            // because we know that if we inserted in place, the key is not a new one
-            // thus it doesn't extend a group
+            // because we know that we inserted in place, the facet_value is not a new one
+            // thus it doesn't extend a group, and thus the insertion key computed above is
+            // still correct
            InsertionResult::InPlace => {
-                let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
-                updated_value.bitmap |= new_values;
+                let mut updated_value = insertion_value;
+                updated_value.bitmap |= docids;
                self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;

                return Ok(InsertionResult::InPlace);
            }
            InsertionResult::Insert => {}
        }
-        let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
+
+        // Here we know that inserting the facet value in the level below resulted in the creation
+        // of a new key. Therefore, it may be the case that we need to modify the left bound of the
+        // insertion key (see documentation of `find_insertion_key_value` for an example of when that
+        // could happen).
+        let insertion_key = {
+            let mut new_insertion_key = insertion_key.clone();
+            let mut key_should_be_modified = false;
+
+            if facet_value < insertion_key.left_bound.as_slice() {
+                new_insertion_key.left_bound = facet_value.to_vec();
+                key_should_be_modified = true;
+            }
+            if key_should_be_modified {
+                let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
+                assert!(is_deleted);
+                self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
+            }
+            new_insertion_key
+        };
+        // Now we know that the insertion key contains the `facet_value`.
+
+        // We still need to update the insertion value by:
+        // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`)
+        // 2. Merge the previous docids with the new one
+        let mut updated_value = insertion_value;

        updated_value.size += 1;
-        if updated_value.size == max_group_size {
-            let size_left = max_group_size / 2;
-            let size_right = max_group_size - size_left;

-            let level_below = level - 1;
+        if updated_value.size < max_group_size {
+            updated_value.bitmap |= docids;
+            self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;

-            let (start_key, _) = self
-                .db
-                .get_greater_than_or_equal_to(
-                    &txn,
-                    &FacetGroupKey {
-                        field_id,
-                        level: level_below,
-                        left_bound: insertion_key.left_bound.as_slice(),
-                    },
-                )?
-                .unwrap();
-
-            let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize);
-
-            let group_left = {
-                let mut values_left = RoaringBitmap::new();
-
-                let mut i = 0;
-                while let Some(next) = iter.next() {
-                    let (_key, value) = next?;
-                    i += 1;
-                    values_left |= &value.bitmap;
-                    if i == size_left {
-                        break;
-                    }
-                }
-
-                let key =
-                    FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() };
-                let value = FacetGroupValue { size: size_left as u8, bitmap: values_left };
-                (key, value)
-            };
-
-            let group_right = {
-                let mut values_right = RoaringBitmap::new();
-                let mut right_start_key = None;
-
-                while let Some(next) = iter.next() {
-                    let (key, value) = next?;
-                    if right_start_key.is_none() {
-                        right_start_key = Some(key.left_bound);
-                    }
-                    values_right |= &value.bitmap;
-                }
-
-                let key = FacetGroupKey {
-                    field_id,
-                    level,
-                    left_bound: right_start_key.unwrap().to_vec(),
-                };
-                let value = FacetGroupValue { size: size_right as u8, bitmap: values_right };
-                (key, value)
-            };
-            drop(iter);
-
-            let _ = self.db.delete(txn, &insertion_key.as_ref())?;
-
-            self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?;
-            self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;
-
-            Ok(InsertionResult::Insert)
-        } else {
-            let mut value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
-            value.bitmap |= new_values;
-            value.size += 1;
-            self.db.put(txn, &insertion_key.as_ref(), &value).unwrap();
-
-            Ok(InsertionResult::InPlace)
+            return Ok(InsertionResult::InPlace);
        }
+
+        // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size`
+        // Therefore it must be split into two nodes.
+
+        let size_left = max_group_size / 2;
+        let size_right = max_group_size - size_left;
+
+        let level_below = level - 1;
+
+        let start_key = FacetGroupKey {
+            field_id,
+            level: level_below,
+            left_bound: insertion_key.left_bound.as_slice(),
+        };
+
+        let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize);
+
+        let group_left = {
+            let mut values_left = RoaringBitmap::new();
+
+            let mut i = 0;
+            while let Some(next) = iter.next() {
+                let (_key, value) = next?;
+                i += 1;
+                values_left |= &value.bitmap;
+                if i == size_left {
+                    break;
+                }
+            }
+
+            let key =
+                FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() };
+            let value = FacetGroupValue { size: size_left as u8, bitmap: values_left };
+            (key, value)
+        };
+
+        let group_right = {
+            let (
+                FacetGroupKey { left_bound: right_left_bound, .. },
+                FacetGroupValue { bitmap: mut values_right, .. },
+            ) = iter.next().unwrap()?;
+
+            while let Some(next) = iter.next() {
+                let (_, value) = next?;
+                values_right |= &value.bitmap;
+            }
+
+            let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() };
+            let value = FacetGroupValue { size: size_right as u8, bitmap: values_right };
+            (key, value)
+        };
+        drop(iter);
+
+        let _ = self.db.delete(txn, &insertion_key.as_ref())?;
+
+        self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?;
+        self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;
+
+        Ok(InsertionResult::Insert)
    }

+    /// Insert the given facet value and corresponding document ids in the database.
    pub fn insert<'a, 't>(
        &self,
        txn: &'t mut RwTxn,
        field_id: u16,
-        new_key: &[u8],
-        new_values: &RoaringBitmap,
+        facet_value: &[u8],
+        docids: &RoaringBitmap,
    ) -> Result<()> {
-        if new_values.is_empty() {
+        if docids.is_empty() {
            return Ok(());
        }
        let group_size = self.group_size;
@ -330,12 +363,15 @@ impl FacetsUpdateIncrementalInner {
        let highest_level = get_highest_level(&txn, self.db, field_id)?;

        let result =
-            self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?;
+            self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?;
        match result {
            InsertionResult::InPlace => return Ok(()),
            InsertionResult::Insert => {}
        }

+        // Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`.
+        // If it has, we must build an additional level above it.
+
        let mut highest_level_prefix = vec![];
        highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
        highest_level_prefix.push(highest_level);
@ -384,36 +420,61 @@ impl FacetsUpdateIncrementalInner {
        Ok(())
    }

+    /// Delete the given document id from the given facet value in the database, from level 0 to the
+    /// given level.
+    ///
+    /// ## Return
+    /// Returns the effect of removing the document id from the database on the given `level`.
+    ///
+    /// - `DeletionResult::InPlace` means that deleting the document id did not have
+    /// an effect on the keys in that level.
+    ///
+    /// - `DeletionResult::Remove` means that deleting the document id resulted in a change in the
+    /// number of keys in the level. For example, removing a document id from the facet value `3` could
+    /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted
+    /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must
+    /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well.
+    ///
+    /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the
+    /// bounds of the keys of the level. For example, removing a document id from the facet value
+    /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore,
+    /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
+    /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust
+    /// its left bound as well.
    fn delete_in_level<'t>(
        &self,
        txn: &'t mut RwTxn,
        field_id: u16,
        level: u8,
-        key: &[u8],
-        value: u32,
+        facet_value: &[u8],
+        docid: u32,
    ) -> Result<DeletionResult> {
        if level == 0 {
-            return self.delete_in_level_0(txn, field_id, key, value);
+            return self.delete_in_level_0(txn, field_id, facet_value, docid);
        }
        let (deletion_key, mut bitmap) =
-            self.find_insertion_key_value(field_id, level, key, txn)?;
+            self.find_insertion_key_value(field_id, level, facet_value, txn)?;

-        let result = self.delete_in_level(txn, field_id, level - 1, key.clone(), value)?;
+        let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docid)?;

        let mut decrease_size = false;
-        let (prev_key, next_key) = match result {
+        let next_key = match result {
            DeletionResult::InPlace => {
-                bitmap.bitmap.remove(value);
+                bitmap.bitmap.remove(docid);
                self.db.put(txn, &deletion_key.as_ref(), &bitmap)?;
                return Ok(DeletionResult::InPlace);
            }
-            DeletionResult::Reduce { prev, next } => (prev, next),
-            DeletionResult::Remove { prev, next } => {
+            DeletionResult::Reduce { next } => next,
+            DeletionResult::Remove { next } => {
                decrease_size = true;
-                (prev, next)
+                next
            }
        };
+        // If either DeletionResult::Reduce or DeletionResult::Remove was returned,
+        // then we may need to adjust the left_bound of the deletion key.

+        // If DeletionResult::Remove was returned, then we need to decrease the group
+        // size of the deletion key.
        let mut updated_value = bitmap;
        if decrease_size {
            updated_value.size -= 1;
@ -421,17 +482,21 @@ impl FacetsUpdateIncrementalInner {

        if updated_value.size == 0 {
            self.db.delete(txn, &deletion_key.as_ref())?;
-            Ok(DeletionResult::Remove { prev: prev_key, next: next_key })
+            Ok(DeletionResult::Remove { next: next_key })
        } else {
            let mut updated_deletion_key = deletion_key.clone();
-            if key == deletion_key.left_bound {
+            let reduced_range = facet_value == deletion_key.left_bound;
+            if reduced_range {
                updated_deletion_key.left_bound = next_key.clone().unwrap();
            }
-            updated_value.bitmap.remove(value);
+            updated_value.bitmap.remove(docid);
            let _ = self.db.delete(txn, &deletion_key.as_ref())?;
            self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?;
-
-            Ok(DeletionResult::Reduce { prev: prev_key, next: next_key })
+            if reduced_range {
+                Ok(DeletionResult::Reduce { next: next_key })
+            } else {
+                Ok(DeletionResult::InPlace)
+            }
        }
    }

@ -439,27 +504,24 @@ impl FacetsUpdateIncrementalInner {
        &self,
        txn: &'t mut RwTxn,
        field_id: u16,
-        key: &[u8],
-        value: u32,
+        facet_value: &[u8],
+        docid: u32,
    ) -> Result<DeletionResult> {
-        let key = FacetGroupKey { field_id, level: 0, left_bound: key };
+        let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
        let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap;
-        bitmap.remove(value);
+        bitmap.remove(docid);

        if bitmap.is_empty() {
-            let mut prev_key = None;
            let mut next_key = None;
-
-            if let Some(prev) = self.db.get_lower_than(&txn, &key)? {
-                prev_key = Some(prev.0.left_bound.to_vec());
-            }
-            if let Some(next) = self.db.get_greater_than(&txn, &key)? {
-                if next.0.level == 0 {
-                    next_key = Some(next.0.left_bound.to_vec());
+            if let Some((next, _)) =
+                self.db.remap_data_type::<DecodeIgnore>().get_greater_than(&txn, &key)?
+            {
+                if next.field_id == field_id && next.level == 0 {
+                    next_key = Some(next.left_bound.to_vec());
                }
            }
            self.db.delete(txn, &key)?;
-            Ok(DeletionResult::Remove { prev: prev_key, next: next_key })
+            Ok(DeletionResult::Remove { next: next_key })
        } else {
            self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?;
            Ok(DeletionResult::InPlace)
@ -470,22 +532,30 @@ impl FacetsUpdateIncrementalInner {
        &self,
        txn: &'t mut RwTxn,
        field_id: u16,
-        key: &[u8],
-        value: u32,
+        facet_value: &[u8],
+        docid: u32,
    ) -> Result<()> {
-        if self.db.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: key })?.is_none() {
+        if self
+            .db
+            .remap_data_type::<DecodeIgnore>()
+            .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })?
+            .is_none()
+        {
            return Ok(());
        }
        let highest_level = get_highest_level(&txn, self.db, field_id)?;

-        // let key_bytes = BoundCodec::bytes_encode(&key).unwrap();
-
-        let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?;
+        let result =
+            self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docid)?;
        match result {
            DeletionResult::InPlace => return Ok(()),
-            DeletionResult::Reduce { .. } => {}
+            DeletionResult::Reduce { .. } => return Ok(()),
            DeletionResult::Remove { .. } => {}
        }
+
+        // if we removed a key from the highest level, its size may have fallen
+        // below `min_level_size`, in which case we need to remove the entire level
+
        let mut highest_level_prefix = vec![];
        highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
        highest_level_prefix.push(highest_level);
@ -521,6 +591,26 @@ impl FacetsUpdateIncrementalInner {
    }
 }

+impl<'a> FacetGroupKey<&'a [u8]> {
+    pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> {
+        FacetGroupKey {
+            field_id: self.field_id,
+            level: self.level,
+            left_bound: self.left_bound.to_vec(),
+        }
+    }
+}
+
+impl<'a> FacetGroupKey<Vec<u8>> {
+    pub fn as_ref(&self) -> FacetGroupKey<&[u8]> {
+        FacetGroupKey {
+            field_id: self.field_id,
+            level: self.level,
+            left_bound: self.left_bound.as_slice(),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec};
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@ -1,3 +1,79 @@
+/*!
+This module implements two different algorithms for updating the `facet_id_string_docids`
+and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that
+it recreates the database from scratch when new elements are added to it. The second algorithm
+is incremental: it modifies the database as little as possible.
+
+The databases must be able to return results for queries such as:
+1. Filter       : find all the document ids that have a facet value greater than X and/or smaller than Y
+2. Min/Max      : find the minimum/maximum facet value among these document ids
+3. Sort         : sort these document ids by increasing/decreasing facet values
+4. Distribution : given some document ids, make a list of each facet value
+   found in these documents along with the number of documents that contain it
+
+The algorithms that implement these queries are found in the `src/search/facet` folder.
+
+To make these queries fast to compute, the database adopts a tree structure:
+```ignore
+            ┌───────────────────────────────┬───────────────────────────────┬───────────────┐
+┌───────┐   │           "ab" (2)            │           "gaf" (2)           │   "woz" (1)   │
+│Level 2│   │                               │                               │               │
+└───────┘   │        [a, b, d, f, z]        │        [c, d, e, f, g]        │    [u, y]     │
+            ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤
+┌───────┐   │   "ab" (2)    │   "ba" (2)    │   "gaf" (2)   │  "form" (2)   │   "woz" (2)   │
+│Level 1│   │               │               │               │               │               │
+└───────┘   │ [a, b, d, z]  │   [a, b, f]   │   [c, d, g]   │    [e, f]     │    [u, y]     │
+            ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤
+┌───────┐   │  "ab" │  "ac" │  "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │  "zz" │
+│Level 0│   │       │       │       │       │       │       │       │       │       │       │
+└───────┘   │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│  [g]  │  [e]  │ [e, f]│  [y]  │  [u]  │
+            └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘
+```
+In the diagram above, each cell corresponds to a node in the tree. The first line of the cell
+contains the left bound of the range of facet values as well as the number of children of the node.
+The second line contains the document ids which have a facet value within the range of the node.
+The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range.
+
+In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because
+`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`.
+These documents all contain a facet value that is contained within `ab .. gaf`.
+
+In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
+[`FacetGroupValue`], which have the following format:
+
+```ignore
+FacetGroupKey:
+- field id  : u16
+- level     : u8
+- left bound: [u8]    // the facet value encoded using either OrderedF64Codec or StrRefCodec
+
+FacetGroupValue:
+- #children : u8
+- docids    : RoaringBitmap
+```
+
+When the database is first created using the "bulk" method, each node has a fixed number of children
+(except possibly the last one) given by the `group_size` parameter (defaults to `FACET_GROUP_SIZE`).
+The tree is also built such that the highest level has more than `min_level_size`
+(defaults to `FACET_MIN_LEVEL_SIZE`) elements in it.
+
+When the database is incrementally updated, the number of children of a node can vary between
+1 and `max_group_size`. This is done so that most incremental operations do not need to change
+the structure of the tree. When the number of children of a node reaches `max_group_size`,
+we split the node in two and update the number of children of its parent.
+
+When adding documents to the databases, it is important to determine which method to use to
+minimise indexing time. The incremental method is faster when adding few new facet values, but the
+bulk method is faster when a large part of the database is modified. Empirically, it seems that
+it takes 50x more time to incrementally add N facet values to an existing database than it does to
+construct a database of N facet values. This ratio is the heuristic used to choose between the
+two methods: the bulk method is used as soon as the number of new entries reaches 1/50th of the
+number of entries already in the database.
+*/
+
+pub const FACET_MAX_GROUP_SIZE: u8 = 8;
+pub const FACET_GROUP_SIZE: u8 = 4;
+pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
+
 use self::incremental::FacetsUpdateIncremental;
 use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
@ -13,8 +89,8 @@ pub struct FacetsUpdate<'i> {
    database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
    facet_type: FacetType,
    new_data: grenad::Reader<File>,
-    level_group_size: u8,
-    max_level_group_size: u8,
+    group_size: u8,
+    max_group_size: u8,
    min_level_size: u8,
 }
 impl<'i> FacetsUpdate<'i> {
@ -30,57 +106,24 @@ impl<'i> FacetsUpdate<'i> {
        Self {
            index,
            database,
-            level_group_size: 4,
-            max_level_group_size: 8,
-            min_level_size: 5,
+            group_size: FACET_GROUP_SIZE,
+            max_group_size: FACET_MAX_GROUP_SIZE,
+            min_level_size: FACET_MIN_LEVEL_SIZE,
            facet_type,
            new_data,
        }
    }

-    // TODO: use the options below?
-    // but I don't actually see why they should be configurable
-    // /// The minimum number of elements that a level is allowed to have.
-    // pub fn level_max_group_size(mut self, value: u8) -> Self {
-    //     self.max_level_group_size = std::cmp::max(value, 4);
-    //     self
-    // }
-
-    // /// The number of elements from the level below that are represented by a single element in the level above
-    // ///
-    // /// This setting is always greater than or equal to 2.
-    // pub fn level_group_size(mut self, value: u8) -> Self {
-    //     self.level_group_size = std::cmp::max(value, 2);
-    //     self
-    // }
-
-    // /// The minimum number of elements that a level is allowed to have.
-    // pub fn min_level_size(mut self, value: u8) -> Self {
-    //     self.min_level_size = std::cmp::max(value, 2);
-    //     self
-    // }
-
    pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
        if self.new_data.is_empty() {
            return Ok(());
        }
-        // here, come up with a better condition!
-        // ideally we'd choose which method to use for each field id individually
-        // but I dont' think it's worth the effort yet
-        // As a first requirement, we ask that the length of the new data is less
-        // than a 1/50th of the length of the database in order to use the incremental
-        // method.
        if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
-            let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data)
-                .level_group_size(self.level_group_size)
-                .min_level_size(self.min_level_size);
+            let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size);
            bulk_update.execute(wtxn)?;
        } else {
            let incremental_update =
-                FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data)
-                    .group_size(self.level_group_size)
-                    .max_group_size(self.max_level_group_size)
-                    .min_level_size(self.min_level_size);
+                FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size, self.max_group_size);
            incremental_update.execute(wtxn)?;
        }
        Ok(())
@ -346,7 +389,7 @@ mod comparison_bench {
    // of the incremental vs. bulk indexer.
    // It appears that the incremental indexer is about 50 times slower than the
    // bulk indexer.
-    #[test]
+    // #[test]
    fn benchmark_facet_indexing() {
        // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it

--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -7,7 +7,7 @@ mod typed_chunk;
 use std::collections::HashSet;
 use std::io::{Cursor, Read, Seek};
 use std::iter::FromIterator;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroU32;
 use std::result::Result as StdResult;

 use crossbeam_channel::{Receiver, Sender};
@ -82,8 +82,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, F> {

 #[derive(Default, Debug, Clone)]
 pub struct IndexDocumentsConfig {
-    pub facet_level_group_size: Option<NonZeroUsize>,
-    pub facet_min_level_size: Option<NonZeroUsize>,
    pub words_prefix_threshold: Option<u32>,
    pub max_prefix_length: Option<usize>,
    pub words_positions_level_group_size: Option<NonZeroU32>,