From 865b415b3f706e12c76aef675877e84c0f3a2075 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 15 Feb 2024 16:00:48 +0100 Subject: [PATCH 1/4] Add test rerpoducing bug --- .../tests/documents/update_documents.rs | 92 ++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/meilisearch/tests/documents/update_documents.rs b/meilisearch/tests/documents/update_documents.rs index b4f61bf99..a5d466513 100644 --- a/meilisearch/tests/documents/update_documents.rs +++ b/meilisearch/tests/documents/update_documents.rs @@ -1,4 +1,4 @@ -use meili_snap::snapshot; +use meili_snap::{json_string, snapshot}; use crate::common::encoder::Encoder; use crate::common::{GetAllDocumentsOptions, Server}; @@ -209,3 +209,93 @@ async fn error_update_documents_missing_document_id() { "https://docs.meilisearch.com/errors#missing_document_id" ); } + +#[actix_rt::test] +async fn update_faceted_document() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "rankingRules": ["facet:asc"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(0).await; + + let documents: Vec<_> = (0..1000) + .map(|id| { + json!({ + "doc_id": id, + "facet": (id/3), + }) + }) + .collect(); + + let (_response, code) = index.add_documents(documents.into(), None).await; + assert_eq!(code, 202); + + index.wait_task(1).await; + + let documents = json!([ + { + "doc_id": 9, + "facet": 1.5, + } + ]); + + let (response, code) = index.update_documents(documents, None).await; + assert_eq!(code, 202, "response: {}", response); + + index.wait_task(2).await; + + index + .search(json!({"limit": 10}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "doc_id": 0, + "facet": 0 + }, + { + "doc_id": 1, + "facet": 0 + }, + { + "doc_id": 2, + "facet": 0 + }, + { + "doc_id": 3, + "facet": 1 + }, + { + "doc_id": 4, + "facet": 1 + }, + { + "doc_id": 5, + "facet": 1 + }, + { + "doc_id": 9, + "facet": 1.5 + }, + { + "doc_id": 6, + "facet": 2 + }, + { + "doc_id": 7, + "facet": 2 + }, + { + "doc_id": 8, + "facet": 2 + } + ] + "###); + }) + .await; +} From 9d1f489a37927c730060f195140ca368a8b4aa5d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 21 Feb 2024 18:42:16 +0100 Subject: [PATCH 2/4] Fix facet incremental indexing --- milli/src/update/facet/incremental.rs | 671 ++++++++++++++------------ milli/src/update/facet/mod.rs | 6 +- 2 files changed, 361 insertions(+), 316 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index a28aa5a47..d68540967 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -18,15 +18,13 @@ use crate::update::index_documents::valid_lmdb_key; use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, Index, Result}; -enum InsertionResult { +enum ModificationResult { InPlace, Expand, Insert, -} -enum DeletionResult { - InPlace, Reduce { next: Option> }, Remove { next: Option> }, + Nothing, } /// Algorithm to incrementally insert and delete elememts into the @@ -65,8 +63,9 @@ impl FacetsUpdateIncremental { #[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::incremental")] pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { + let mut current_field_id = None; + let mut facet_level_may_be_updated = false; let mut iter = self.delta_data.into_stream_merger_iter()?; - while let Some((key, value)) = iter.next()? { if !valid_lmdb_key(key) { continue; @@ -74,25 +73,47 @@ impl FacetsUpdateIncremental { let key = FacetGroupKeyCodec::::bytes_decode(key) .map_err(heed::Error::Encoding)?; + + if facet_level_may_be_updated + && current_field_id.map_or(false, |fid| fid != key.field_id) + { + // Only add or remove a level after making all the field modifications. + self.inner.add_or_delete_level(wtxn, current_field_id.unwrap())?; + facet_level_may_be_updated = false; + } + current_field_id = Some(key.field_id); + let value = KvReader::new(value); let docids_to_delete = value .get(DelAdd::Deletion) .map(CboRoaringBitmapCodec::bytes_decode) - .map(|o| o.map_err(heed::Error::Encoding)); + .map(|o| o.map_err(heed::Error::Encoding)) + .transpose()?; let docids_to_add = value .get(DelAdd::Addition) .map(CboRoaringBitmapCodec::bytes_decode) - .map(|o| o.map_err(heed::Error::Encoding)); + .map(|o| o.map_err(heed::Error::Encoding)) + .transpose()?; - if let Some(docids_to_delete) = docids_to_delete { - let docids_to_delete = docids_to_delete?; - self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; + let level_size_changed = self.inner.modify( + wtxn, + key.field_id, + key.left_bound, + docids_to_add.as_ref(), + docids_to_delete.as_ref(), + )?; + + if level_size_changed { + // if a node has been added or removed from the highest level, + // we may have to update the facet level. + facet_level_may_be_updated = true; } + } - if let Some(docids_to_add) = docids_to_add { - let docids_to_add = docids_to_add?; - self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; + if let Some(field_id) = current_field_id { + if facet_level_may_be_updated { + self.inner.add_or_delete_level(wtxn, field_id)?; } } @@ -166,138 +187,78 @@ impl FacetsUpdateIncrementalInner { /// /// ## Return /// See documentation of `insert_in_level` - fn insert_in_level_0( + fn modify_in_level_0( &self, txn: &mut RwTxn, field_id: u16, facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; - let value = FacetGroupValue { bitmap: docids.clone(), size: 1 }; - let mut level0_prefix = vec![]; - level0_prefix.extend_from_slice(&field_id.to_be_bytes()); - level0_prefix.push(0); - - let mut iter = - self.db.remap_types::().prefix_iter(txn, &level0_prefix)?; - - if iter.next().is_none() { - drop(iter); - self.db.put(txn, &key, &value)?; - Ok(InsertionResult::Insert) - } else { - drop(iter); - let old_value = self.db.get(txn, &key)?; - match old_value { - Some(mut updated_value) => { - // now merge the two - updated_value.bitmap |= value.bitmap; - self.db.put(txn, &key, &updated_value)?; - Ok(InsertionResult::InPlace) - } - None => { + let old_value = self.db.get(txn, &key)?; + match (old_value, add_docids, del_docids) { + // Addition + deletion on an existing value + (Some(FacetGroupValue { bitmap, .. }), Some(add_docids), Some(del_docids)) => { + let value = FacetGroupValue { bitmap: bitmap - del_docids | add_docids, size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::InPlace) + } + // Addition on an existing value + (Some(FacetGroupValue { bitmap, .. }), Some(add_docids), None) => { + let value = FacetGroupValue { bitmap: bitmap | add_docids, size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::InPlace) + } + // Addition of a new value (ignore deletion) + (None, Some(add_docids), _) => { + let value = FacetGroupValue { bitmap: add_docids.clone(), size: 1 }; + self.db.put(txn, &key, &value)?; + Ok(ModificationResult::Insert) + } + // Deletion on an existing value, fully delete the key if the resulted value is empty. + (Some(FacetGroupValue { mut bitmap, .. }), None, Some(del_docids)) => { + bitmap -= del_docids; + if bitmap.is_empty() { + // Full deletion + let mut next_key = None; + if let Some((next, _)) = + self.db.remap_data_type::().get_greater_than(txn, &key)? + { + if next.field_id == field_id && next.level == 0 { + next_key = Some(next.left_bound.to_vec()); + } + } + self.db.delete(txn, &key)?; + Ok(ModificationResult::Remove { next: next_key }) + } else { + // Partial deletion + let value = FacetGroupValue { bitmap, size: 1 }; self.db.put(txn, &key, &value)?; - Ok(InsertionResult::Insert) + Ok(ModificationResult::InPlace) } } + // Otherwise do nothing (None + no addition + deletion == Some + no addition + no deletion == Nothing), + // may be unreachable at some point. + (None, None, _) | (Some(_), None, None) => Ok(ModificationResult::Nothing), } } - /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`. - /// This function works recursively. + /// Split a level node into two balanced nodes. /// - /// ## Return - /// Returns the effect of adding the facet value to the database on the given `level`. - /// - /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have - /// an effect on the number of keys in that level. Therefore, it did not increase the number of children - /// of the parent node. - /// - /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted - /// in the addition of a new key in that level, and that therefore the number of children - /// of the parent node should be incremented. - fn insert_in_level( + /// # Return + /// Returns `ModificationResult::Insert` if the split is successful. + fn split_group( &self, txn: &mut RwTxn, field_id: u16, level: u8, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - if level == 0 { - return self.insert_in_level_0(txn, field_id, facet_value, docids); - } - - let max_group_size = self.max_group_size; - - let result = self.insert_in_level(txn, field_id, level - 1, facet_value, docids)?; - // level below inserted an element - - let (insertion_key, insertion_value) = - self.find_insertion_key_value(field_id, level, facet_value, txn)?; - - match result { - // because we know that we inserted in place, the facet_value is not a new one - // thus it doesn't extend a group, and thus the insertion key computed above is - // still correct - InsertionResult::InPlace => { - let mut updated_value = insertion_value; - updated_value.bitmap |= docids; - self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - - return Ok(InsertionResult::InPlace); - } - InsertionResult::Expand => {} - InsertionResult::Insert => {} - } - - // Here we know that inserting the facet value in the level below resulted in the creation - // of a new key. Therefore, it may be the case that we need to modify the left bound of the - // insertion key (see documentation of `find_insertion_key_value` for an example of when that - // could happen). - let (insertion_key, insertion_key_was_modified) = { - let mut new_insertion_key = insertion_key.clone(); - let mut key_should_be_modified = false; - - if facet_value < insertion_key.left_bound.as_slice() { - new_insertion_key.left_bound = facet_value.to_vec(); - key_should_be_modified = true; - } - if key_should_be_modified { - let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; - assert!(is_deleted); - self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; - } - (new_insertion_key, key_should_be_modified) - }; - // Now we know that the insertion key contains the `facet_value`. - - // We still need to update the insertion value by: - // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`) - // 2. Merge the previous docids with the new one - let mut updated_value = insertion_value; - - if matches!(result, InsertionResult::Insert) { - updated_value.size += 1; - } - - if updated_value.size < max_group_size { - updated_value.bitmap |= docids; - self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - if insertion_key_was_modified { - return Ok(InsertionResult::Expand); - } else { - return Ok(InsertionResult::InPlace); - } - } - - // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` - // Therefore it must be split into two nodes. - - let size_left = updated_value.size / 2; - let size_right = updated_value.size - size_left; + insertion_key: FacetGroupKey>, + insertion_value: FacetGroupValue, + ) -> Result { + let size_left = insertion_value.size / 2; + let size_right = insertion_value.size - size_left; let level_below = level - 1; @@ -351,34 +312,249 @@ impl FacetsUpdateIncrementalInner { self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; - Ok(InsertionResult::Insert) + Ok(ModificationResult::Insert) } - /// Insert the given facet value and corresponding document ids in the database. - pub fn insert( + fn trim_del_docids<'a>( + &self, + txn: &mut RwTxn, + field_id: u16, + level: u8, + insertion_key: &FacetGroupKey>, + insertion_value_size: usize, + del_docids: &'a RoaringBitmap, + ) -> Result> { + let level_below = level - 1; + + let start_key = FacetGroupKey { + field_id, + level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }; + + let mut del_docids = std::borrow::Cow::Borrowed(del_docids); + let iter = self.db.range(txn, &(start_key..))?.take(insertion_value_size); + for next in iter { + let (_, value) = next?; + // if a sublevel bitmap as common docids with del_docids, + // then these docids shouldn't be removed and so, remove them from the deletion list. + if !value.bitmap.is_disjoint(&del_docids) { + *del_docids.to_mut() -= value.bitmap; + } + } + + Ok(del_docids) + } + + /// Modify the given facet value and corresponding document ids in all the levels of the database up to the given `level`. + /// This function works recursively. + /// + /// ## Return + /// Returns the effect of modifying the facet value to the database on the given `level`. + /// + /// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have + /// an effect on the number of keys in that level. Therefore, it did not increase the number of children + /// of the parent node. + /// + /// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted + /// in the addition of a new key in that level, and that therefore the number of children + /// of the parent node should be incremented. + /// + /// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the + /// number of keys in the level. For example, removing a document id from the facet value `3` could + /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted + /// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must + /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. + /// + /// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the + /// bounds of the keys of the level. For example, removing a document id from the facet value + /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, + /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). + /// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust + /// its left bound as well. + /// + /// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`. + /// This case is reachable when a document id is removed from a sub-level node but is still present in another one. + /// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` but should remain in the group node [1..4] in `level 1`. + fn modify_in_level( + &self, + txn: &mut RwTxn, + field_id: u16, + level: u8, + facet_value: &[u8], + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { + if level == 0 { + return self.modify_in_level_0(txn, field_id, facet_value, add_docids, del_docids); + } + + let result = + self.modify_in_level(txn, field_id, level - 1, facet_value, add_docids, del_docids)?; + // level below inserted an element + + if let ModificationResult::Nothing = result { + // if the previous level has not been modified, + // early return ModificationResult::Nothing. + return Ok(ModificationResult::Nothing); + } + + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; + let insertion_value_size = insertion_value.size as usize; + + let mut insertion_value_was_modified = false; + let mut updated_value = insertion_value; + + if let ModificationResult::Insert = result { + // if a key has been inserted in the sub-level raise the value size. + updated_value.size += 1; + insertion_value_was_modified = true; + } else if let ModificationResult::Remove { .. } = result { + if updated_value.size <= 1 { + // if the only remaining node is the one to delete, + // delete the key instead and early return. + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + return Ok(result); + } else { + // Reduce the value size + updated_value.size -= 1; + insertion_value_was_modified = true; + } + } + + let (insertion_key, insertion_key_modification) = + if let ModificationResult::InPlace = result { + (insertion_key, ModificationResult::InPlace) + } else { + // Inserting or deleting the facet value in the level below resulted in the creation + // of a new key. Therefore, it may be the case that we need to modify the left bound of the + // insertion key (see documentation of `find_insertion_key_value` for an example of when that + // could happen). + let mut new_insertion_key = insertion_key.clone(); + let mut key_modification = ModificationResult::InPlace; + + if let ModificationResult::Remove { next } | ModificationResult::Reduce { next } = + result + { + // if the deleted facet_value is the left_bound of the current node, + // the left_bound should be updated reducing the current node. + let reduced_range = facet_value == insertion_key.left_bound; + if reduced_range { + new_insertion_key.left_bound = next.clone().unwrap(); + key_modification = ModificationResult::Reduce { next }; + } + } else if facet_value < insertion_key.left_bound.as_slice() { + // if the added facet_value is the under the left_bound of the current node, + // the left_bound should be updated expanding the current node. + new_insertion_key.left_bound = facet_value.to_vec(); + key_modification = ModificationResult::Expand; + } + + if matches!( + key_modification, + ModificationResult::Expand | ModificationResult::Reduce { .. } + ) { + // if the node should be updated, delete it, it will be recreated using a new key later. + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + } + (new_insertion_key, key_modification) + }; + + if updated_value.size < self.max_group_size { + // If there are docids to delete, trim them avoiding unexpected removal. + let del_docids = del_docids + .map(|ids| { + self.trim_del_docids( + txn, + field_id, + level, + &insertion_key, + insertion_value_size, + ids, + ) + }) + .transpose()? + .filter(|ids| !ids.is_empty()); + if let Some(del_docids) = del_docids { + updated_value.bitmap -= &*del_docids; + insertion_value_was_modified = true; + } + + if let Some(add_docids) = add_docids { + updated_value.bitmap |= add_docids; + insertion_value_was_modified = true; + } + + if insertion_value_was_modified + || matches!( + insertion_key_modification, + ModificationResult::Expand | ModificationResult::Reduce { .. } + ) + { + // if any modification occured, insert it in the database. + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + Ok(insertion_key_modification) + } else { + // this case is reachable when a docid is removed from a sub-level node but is still present in another one. + // For instance, a document containing 2 and 3, if 2 is removed, the docid should remain in the group node [1..4]. + Ok(ModificationResult::Nothing) + } + } else { + // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` + // Therefore it must be split into two nodes. + self.split_group(txn, field_id, level, insertion_key, updated_value) + } + } + + /// Modify the given facet value and corresponding document ids in the database. + /// If no more document ids correspond to the facet value, delete it completely. + /// + /// ## Return + /// Returns `true` if some tree-nodes of the highest level have been removed or added implying a potential + /// addition or deletion of a facet level. + /// Otherwise returns `false` if the tree-nodes have been modified in place. + pub fn modify( &self, txn: &mut RwTxn, field_id: u16, facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result<()> { - if docids.is_empty() { - return Ok(()); + add_docids: Option<&RoaringBitmap>, + del_docids: Option<&RoaringBitmap>, + ) -> Result { + if add_docids.map_or(true, RoaringBitmap::is_empty) + && del_docids.map_or(true, RoaringBitmap::is_empty) + { + return Ok(false); } - let group_size = self.group_size; let highest_level = get_highest_level(txn, self.db, field_id)?; - let result = self.insert_in_level(txn, field_id, highest_level, facet_value, docids)?; + let result = self.modify_in_level( + txn, + field_id, + highest_level, + facet_value, + add_docids, + del_docids, + )?; match result { - InsertionResult::InPlace => return Ok(()), - InsertionResult::Expand => return Ok(()), - InsertionResult::Insert => {} + ModificationResult::InPlace + | ModificationResult::Expand + | ModificationResult::Nothing + | ModificationResult::Reduce { .. } => Ok(false), + ModificationResult::Insert | ModificationResult::Remove { .. } => Ok(true), } + } - // Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`. - // If it has, we must build an addition level above it. - + /// Check whether the highest level has exceeded `min_level_size` * `self.group_size`. + /// If it has, we must build an addition level above it. + /// Then check whether the highest level is under `min_level_size`. + /// If it has, we must remove the complete level. + pub(crate) fn add_or_delete_level(&self, txn: &mut RwTxn, field_id: u16) -> Result<()> { + let highest_level = get_highest_level(txn, self.db, field_id)?; let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -386,10 +562,44 @@ impl FacetsUpdateIncrementalInner { let size_highest_level = self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?.count(); - if size_highest_level < self.group_size as usize * self.min_level_size as usize { - return Ok(()); + if size_highest_level >= self.group_size as usize * self.min_level_size as usize { + self.add_level(txn, field_id, highest_level, &highest_level_prefix, size_highest_level) + } else if size_highest_level < self.min_level_size as usize && highest_level != 0 { + self.delete_level(txn, &highest_level_prefix) + } else { + Ok(()) } + } + /// Delete a level. + fn delete_level(&self, txn: &mut RwTxn, highest_level_prefix: &[u8]) -> Result<()> { + let mut to_delete = vec![]; + let mut iter = + self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?; + for el in iter.by_ref() { + let (k, _) = el?; + to_delete.push( + FacetGroupKeyCodec::::bytes_decode(k) + .map_err(Error::Encoding)? + .into_owned(), + ); + } + drop(iter); + for k in to_delete { + self.db.delete(txn, &k.as_ref())?; + } + Ok(()) + } + + /// Build an additional level for the field id. + fn add_level( + &self, + txn: &mut RwTxn, + field_id: u16, + highest_level: u8, + highest_level_prefix: &[u8], + size_highest_level: usize, + ) -> Result<()> { let mut groups_iter = self .db .remap_types::() @@ -402,7 +612,7 @@ impl FacetsUpdateIncrementalInner { for _ in 0..nbr_new_groups { let mut first_key = None; let mut values = RoaringBitmap::new(); - for _ in 0..group_size { + for _ in 0..self.group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) .map_err(Error::Encoding)?; @@ -417,7 +627,7 @@ impl FacetsUpdateIncrementalInner { level: highest_level + 1, left_bound: first_key.unwrap().left_bound, }; - let value = FacetGroupValue { size: group_size, bitmap: values }; + let value = FacetGroupValue { size: self.group_size, bitmap: values }; to_add.push((key.into_owned(), value)); } // now we add the rest of the level, in case its size is > group_size * min_level_size @@ -452,173 +662,6 @@ impl FacetsUpdateIncrementalInner { } Ok(()) } - - /// Delete the given document id from the given facet value in the database, from level 0 to the - /// the given level. - /// - /// ## Return - /// Returns the effect of removing the document id from the database on the given `level`. - /// - /// - `DeletionResult::InPlace` means that deleting the document id did not have - /// an effect on the keys in that level. - /// - /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the - /// number of keys in the level. For example, removing a document id from the facet value `3` could - /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted - /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must - /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. - /// - /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the - /// bounds of the keys of the level. For example, removing a document id from the facet value - /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, - /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). - /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust - /// its left bound as well. - fn delete_in_level( - &self, - txn: &mut RwTxn, - field_id: u16, - level: u8, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - if level == 0 { - return self.delete_in_level_0(txn, field_id, facet_value, docids); - } - let (deletion_key, mut bitmap) = - self.find_insertion_key_value(field_id, level, facet_value, txn)?; - - let result = self.delete_in_level(txn, field_id, level - 1, facet_value, docids)?; - - let mut decrease_size = false; - let next_key = match result { - DeletionResult::InPlace => { - bitmap.bitmap -= docids; - self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; - return Ok(DeletionResult::InPlace); - } - DeletionResult::Reduce { next } => next, - DeletionResult::Remove { next } => { - decrease_size = true; - next - } - }; - // If either DeletionResult::Reduce or DeletionResult::Remove was returned, - // then we may need to adjust the left_bound of the deletion key. - - // If DeletionResult::Remove was returned, then we need to decrease the group - // size of the deletion key. - let mut updated_value = bitmap; - if decrease_size { - updated_value.size -= 1; - } - - if updated_value.size == 0 { - self.db.delete(txn, &deletion_key.as_ref())?; - Ok(DeletionResult::Remove { next: next_key }) - } else { - let mut updated_deletion_key = deletion_key.clone(); - let reduced_range = facet_value == deletion_key.left_bound; - if reduced_range { - updated_deletion_key.left_bound = next_key.clone().unwrap(); - } - updated_value.bitmap -= docids; - let _ = self.db.delete(txn, &deletion_key.as_ref())?; - self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; - if reduced_range { - Ok(DeletionResult::Reduce { next: next_key }) - } else { - Ok(DeletionResult::InPlace) - } - } - } - - fn delete_in_level_0( - &self, - txn: &mut RwTxn, - field_id: u16, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; - let mut bitmap = self.db.get(txn, &key)?.unwrap().bitmap; - bitmap -= docids; - - if bitmap.is_empty() { - let mut next_key = None; - if let Some((next, _)) = - self.db.remap_data_type::().get_greater_than(txn, &key)? - { - if next.field_id == field_id && next.level == 0 { - next_key = Some(next.left_bound.to_vec()); - } - } - self.db.delete(txn, &key)?; - Ok(DeletionResult::Remove { next: next_key }) - } else { - self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; - Ok(DeletionResult::InPlace) - } - } - - pub fn delete( - &self, - txn: &mut RwTxn, - field_id: u16, - facet_value: &[u8], - docids: &RoaringBitmap, - ) -> Result<()> { - if self - .db - .remap_data_type::() - .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })? - .is_none() - { - return Ok(()); - } - let highest_level = get_highest_level(txn, self.db, field_id)?; - - let result = self.delete_in_level(txn, field_id, highest_level, facet_value, docids)?; - match result { - DeletionResult::InPlace => return Ok(()), - DeletionResult::Reduce { .. } => return Ok(()), - DeletionResult::Remove { .. } => {} - } - - // if we either removed a key from the highest level, its size may have fallen - // below `min_level_size`, in which case we need to remove the entire level - - let mut highest_level_prefix = vec![]; - highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); - highest_level_prefix.push(highest_level); - - if highest_level == 0 - || self - .db - .remap_types::() - .prefix_iter(txn, &highest_level_prefix)? - .count() - >= self.min_level_size as usize - { - return Ok(()); - } - let mut to_delete = vec![]; - let mut iter = - self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?; - for el in iter.by_ref() { - let (k, _) = el?; - to_delete.push( - FacetGroupKeyCodec::::bytes_decode(k) - .map_err(Error::Encoding)? - .into_owned(), - ); - } - drop(iter); - for k in to_delete { - self.db.delete(txn, &k.as_ref())?; - } - Ok(()) - } } impl<'a> FacetGroupKey<&'a [u8]> { diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ca5a21ce2..15a646836 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -429,7 +429,8 @@ pub(crate) mod test_helpers { max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(key).unwrap(); - update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); + update.modify(wtxn, field_id, &key_bytes, Some(docids), None).unwrap(); + update.add_or_delete_level(wtxn, field_id).unwrap(); } pub fn delete_single_docid<'a>( &self, @@ -455,7 +456,8 @@ pub(crate) mod test_helpers { max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(key).unwrap(); - update.delete(wtxn, field_id, &key_bytes, docids).unwrap(); + update.modify(wtxn, field_id, &key_bytes, None, Some(docids)).unwrap(); + update.add_or_delete_level(wtxn, field_id).unwrap(); } pub fn bulk_insert<'a, 'b>( From a493a50825fd73ef7231fbad08abd308414867fd Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 22 Feb 2024 14:53:33 +0100 Subject: [PATCH 3/4] Fix clippy --- milli/src/update/facet/incremental.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index d68540967..584870a7a 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -201,7 +201,7 @@ impl FacetsUpdateIncrementalInner { match (old_value, add_docids, del_docids) { // Addition + deletion on an existing value (Some(FacetGroupValue { bitmap, .. }), Some(add_docids), Some(del_docids)) => { - let value = FacetGroupValue { bitmap: bitmap - del_docids | add_docids, size: 1 }; + let value = FacetGroupValue { bitmap: (bitmap - del_docids) | add_docids, size: 1 }; self.db.put(txn, &key, &value)?; Ok(ModificationResult::InPlace) } @@ -575,7 +575,7 @@ impl FacetsUpdateIncrementalInner { fn delete_level(&self, txn: &mut RwTxn, highest_level_prefix: &[u8]) -> Result<()> { let mut to_delete = vec![]; let mut iter = - self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?; + self.db.remap_types::().prefix_iter(txn, highest_level_prefix)?; for el in iter.by_ref() { let (k, _) = el?; to_delete.push( @@ -603,7 +603,7 @@ impl FacetsUpdateIncrementalInner { let mut groups_iter = self .db .remap_types::() - .prefix_iter(txn, &highest_level_prefix)?; + .prefix_iter(txn, highest_level_prefix)?; let nbr_new_groups = size_highest_level / self.group_size as usize; let nbr_leftover_elements = size_highest_level % self.group_size as usize; From 5e83bac448dabea20dc9d43eb161d07de0f15d3a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 26 Feb 2024 15:40:15 +0100 Subject: [PATCH 4/4] Fix PR comments --- milli/src/update/facet/incremental.rs | 59 +++++++++++++++------------ 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 584870a7a..798e0fe3d 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -18,6 +18,32 @@ use crate::update::index_documents::valid_lmdb_key; use crate::update::MergeFn; use crate::{CboRoaringBitmapCodec, Index, Result}; +/// Enum used as a return value for the facet incremental indexing. +/// +/// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have +/// an effect on the number of keys in that level. Therefore, it did not increase the number of children +/// of the parent node. +/// +/// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted +/// in the addition of a new key in that level, and that therefore the number of children +/// of the parent node should be incremented. +/// +/// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the +/// number of keys in the level. For example, removing a document id from the facet value `3` could +/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted +/// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must +/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. +/// +/// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the +/// bounds of the keys of the level. For example, removing a document id from the facet value +/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, +/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). +/// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust +/// its left bound as well. +/// +/// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`. +/// This case is reachable when a document id is removed from a sub-level node but is still present in another one. +/// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` but should remain in the group node [1..4] in `level 1`. enum ModificationResult { InPlace, Expand, @@ -315,6 +341,9 @@ impl FacetsUpdateIncrementalInner { Ok(ModificationResult::Insert) } + /// Remove the docids still present in the related sub-level nodes from the del_docids. + /// + /// This process is needed to avoid removing docids from a group node where the docid is present in several sub-nodes. fn trim_del_docids<'a>( &self, txn: &mut RwTxn, @@ -352,30 +381,6 @@ impl FacetsUpdateIncrementalInner { /// ## Return /// Returns the effect of modifying the facet value to the database on the given `level`. /// - /// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have - /// an effect on the number of keys in that level. Therefore, it did not increase the number of children - /// of the parent node. - /// - /// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted - /// in the addition of a new key in that level, and that therefore the number of children - /// of the parent node should be incremented. - /// - /// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the - /// number of keys in the level. For example, removing a document id from the facet value `3` could - /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted - /// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must - /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. - /// - /// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the - /// bounds of the keys of the level. For example, removing a document id from the facet value - /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, - /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). - /// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust - /// its left bound as well. - /// - /// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`. - /// This case is reachable when a document id is removed from a sub-level node but is still present in another one. - /// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` but should remain in the group node [1..4] in `level 1`. fn modify_in_level( &self, txn: &mut RwTxn, @@ -465,7 +470,7 @@ impl FacetsUpdateIncrementalInner { if updated_value.size < self.max_group_size { // If there are docids to delete, trim them avoiding unexpected removal. - let del_docids = del_docids + if let Some(del_docids) = del_docids .map(|ids| { self.trim_del_docids( txn, @@ -477,8 +482,8 @@ impl FacetsUpdateIncrementalInner { ) }) .transpose()? - .filter(|ids| !ids.is_empty()); - if let Some(del_docids) = del_docids { + .filter(|ids| !ids.is_empty()) + { updated_value.bitmap -= &*del_docids; insertion_value_was_modified = true; }