2022-09-07 18:04:07 +02:00
use std ::fs ::File ;
2023-09-28 16:26:01 +02:00
use std ::io ::BufReader ;
2022-09-07 18:04:07 +02:00
2024-01-22 16:23:12 +01:00
use grenad ::Merger ;
2023-11-27 11:52:22 +01:00
use heed ::types ::{ Bytes , DecodeIgnore } ;
2022-09-07 18:04:07 +02:00
use heed ::{ BytesDecode , Error , RoTxn , RwTxn } ;
2023-10-19 12:01:12 +02:00
use obkv ::KvReader ;
2022-09-07 18:04:07 +02:00
use roaring ::RoaringBitmap ;
2022-09-05 17:31:26 +02:00
use crate ::facet ::FacetType ;
2022-09-05 13:01:36 +02:00
use crate ::heed_codec ::facet ::{
2022-10-12 09:42:55 +02:00
FacetGroupKey , FacetGroupKeyCodec , FacetGroupValue , FacetGroupValueCodec ,
2022-08-30 15:22:39 +02:00
} ;
2023-11-27 11:52:22 +01:00
use crate ::heed_codec ::BytesRefCodec ;
2022-08-31 13:03:36 +02:00
use crate ::search ::facet ::get_highest_level ;
2023-10-19 12:01:12 +02:00
use crate ::update ::del_add ::DelAdd ;
2022-11-14 14:16:14 +01:00
use crate ::update ::index_documents ::valid_lmdb_key ;
2024-01-22 16:23:12 +01:00
use crate ::update ::MergeFn ;
2023-10-23 15:19:33 +02:00
use crate ::{ CboRoaringBitmapCodec , Index , Result } ;
2022-08-30 15:22:39 +02:00
2024-02-26 15:40:15 +01:00
/// Outcome of modifying a facet value in one level of the facet tree.
///
/// Every level reports one of these values to the level above it, which uses it to decide
/// whether its own group size, left bound or bitmap must change.
///
/// `Reduce` and `Expand` both describe a change in the *bounds* of a key of the level, while
/// `Insert` and `Remove` describe a change in the *number* of keys of the level.
enum ModificationResult {
    /// Modifying the `facet_value` in the level did not change the number of keys in that
    /// level, so the number of children of the parent node is unchanged.
    InPlace,
    /// The left bound of an existing key was lowered to the modified `facet_value`, because
    /// that facet value is smaller than the previous left bound of the key.
    Expand,
    /// A new key was added to the level, therefore the number of children of the parent node
    /// should be incremented.
    Insert,
    /// The left bound of an existing key was raised to `next`.
    ///
    /// For example, removing a document id from the facet value `3` might have caused the facet
    /// value `3` to have no corresponding document in level 0. Therefore, in level 1, the key
    /// with the left bound `3` had to be changed to the next facet value (e.g. `4`).
    /// The parent of the reduced key may need to adjust its left bound as well.
    Reduce { next: Option<Vec<u8>> },
    /// A key was deleted from the level.
    ///
    /// For example, removing a document id from the facet value `3` could cause it to have no
    /// corresponding document in level 0 anymore, so the key is deleted entirely. The parent of
    /// the deleted key must then adjust its group size, and is deleted as well when the removed
    /// key was its only child.
    ///
    /// `next` is the left bound of the key that follows the deleted one in level 0 of the same
    /// field, if there is one.
    Remove { next: Option<Vec<u8>> },
    /// Modifying the `facet_value` had no impact on the level.
    ///
    /// This case is reachable when a document id is removed from a sub-level node but is still
    /// present in another one. For example, when removing `2` from a document containing `2`
    /// and `3`, the document id will be removed from level 0 but should remain in the group
    /// node `[1..4]` in level 1.
    Nothing,
}
2022-09-07 16:44:08 +02:00
/// Algorithm to incrementally insert and delete elememts into the
/// `facet_id_(string/f64)_docids` databases.
2023-10-23 15:19:33 +02:00
pub struct FacetsUpdateIncremental {
2022-09-05 17:31:26 +02:00
inner : FacetsUpdateIncrementalInner ,
2024-01-22 16:23:12 +01:00
delta_data : Merger < BufReader < File > , MergeFn > ,
2022-08-30 15:22:39 +02:00
}
2022-09-05 17:31:26 +02:00
2023-10-23 15:19:33 +02:00
impl FacetsUpdateIncremental {
2022-09-07 16:44:08 +02:00
pub fn new (
2023-10-23 15:19:33 +02:00
index : & Index ,
2022-09-07 16:44:08 +02:00
facet_type : FacetType ,
2024-01-22 16:23:12 +01:00
delta_data : Merger < BufReader < File > , MergeFn > ,
2022-09-07 16:44:08 +02:00
group_size : u8 ,
min_level_size : u8 ,
max_group_size : u8 ,
) -> Self {
2022-09-05 17:31:26 +02:00
FacetsUpdateIncremental {
inner : FacetsUpdateIncrementalInner {
db : match facet_type {
FacetType ::String = > index
. facet_id_string_docids
2023-11-27 11:52:22 +01:00
. remap_key_type ::< FacetGroupKeyCodec < BytesRefCodec > > ( ) ,
2022-09-05 17:31:26 +02:00
FacetType ::Number = > index
. facet_id_f64_docids
2023-11-27 11:52:22 +01:00
. remap_key_type ::< FacetGroupKeyCodec < BytesRefCodec > > ( ) ,
2022-09-05 17:31:26 +02:00
} ,
2022-09-07 16:44:08 +02:00
group_size ,
max_group_size ,
min_level_size ,
2022-09-05 17:31:26 +02:00
} ,
2023-10-19 12:01:12 +02:00
delta_data ,
2022-09-05 17:31:26 +02:00
}
2022-08-31 14:19:52 +02:00
}
2022-09-07 16:44:08 +02:00
2024-02-08 17:37:15 +01:00
#[ tracing::instrument(level = " trace " , skip_all, target = " indexing::facets::incremental " ) ]
2024-07-09 17:25:39 +02:00
pub fn execute ( self , wtxn : & mut RwTxn < '_ > ) -> crate ::Result < ( ) > {
2024-02-21 18:42:16 +01:00
let mut current_field_id = None ;
let mut facet_level_may_be_updated = false ;
2024-01-22 16:23:12 +01:00
let mut iter = self . delta_data . into_stream_merger_iter ( ) ? ;
while let Some ( ( key , value ) ) = iter . next ( ) ? {
2022-11-14 14:16:14 +01:00
if ! valid_lmdb_key ( key ) {
continue ;
}
2024-01-22 16:23:12 +01:00
2023-11-27 11:52:22 +01:00
let key = FacetGroupKeyCodec ::< BytesRefCodec > ::bytes_decode ( key )
2023-11-22 18:21:19 +01:00
. map_err ( heed ::Error ::Encoding ) ? ;
2024-02-21 18:42:16 +01:00
if facet_level_may_be_updated
& & current_field_id . map_or ( false , | fid | fid ! = key . field_id )
{
// Only add or remove a level after making all the field modifications.
self . inner . add_or_delete_level ( wtxn , current_field_id . unwrap ( ) ) ? ;
facet_level_may_be_updated = false ;
}
current_field_id = Some ( key . field_id ) ;
2023-10-19 12:01:12 +02:00
let value = KvReader ::new ( value ) ;
let docids_to_delete = value
. get ( DelAdd ::Deletion )
. map ( CboRoaringBitmapCodec ::bytes_decode )
2024-02-21 18:42:16 +01:00
. map ( | o | o . map_err ( heed ::Error ::Encoding ) )
. transpose ( ) ? ;
2023-10-19 12:01:12 +02:00
let docids_to_add = value
. get ( DelAdd ::Addition )
. map ( CboRoaringBitmapCodec ::bytes_decode )
2024-02-21 18:42:16 +01:00
. map ( | o | o . map_err ( heed ::Error ::Encoding ) )
. transpose ( ) ? ;
let level_size_changed = self . inner . modify (
wtxn ,
key . field_id ,
key . left_bound ,
docids_to_add . as_ref ( ) ,
docids_to_delete . as_ref ( ) ,
) ? ;
if level_size_changed {
// if a node has been added or removed from the highest level,
// we may have to update the facet level.
facet_level_may_be_updated = true ;
2023-10-19 12:01:12 +02:00
}
2024-02-21 18:42:16 +01:00
}
2023-10-19 12:01:12 +02:00
2024-02-21 18:42:16 +01:00
if let Some ( field_id ) = current_field_id {
if facet_level_may_be_updated {
self . inner . add_or_delete_level ( wtxn , field_id ) ? ;
2023-10-19 12:01:12 +02:00
}
2022-09-05 17:31:26 +02:00
}
Ok ( ( ) )
}
}
2022-09-07 16:44:08 +02:00
/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type
2022-09-05 17:31:26 +02:00
pub struct FacetsUpdateIncrementalInner {
2023-11-27 11:52:22 +01:00
pub db : heed ::Database < FacetGroupKeyCodec < BytesRefCodec > , FacetGroupValueCodec > ,
2022-09-06 11:52:57 +02:00
pub group_size : u8 ,
pub min_level_size : u8 ,
pub max_group_size : u8 ,
2022-08-31 14:19:52 +02:00
}
2022-09-05 17:31:26 +02:00
impl FacetsUpdateIncrementalInner {
2022-09-07 16:44:08 +02:00
/// Find the `FacetGroupKey`/`FacetGroupValue` in the database that
/// should be used to insert the new `facet_value` for the given `field_id` and `level`
/// where `level` must be strictly greater than 0.
///
/// For example, when inserting the facet value `4`, there are two possibilities:
///
/// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore,
/// we know that the implicit range of the first key is 3..6, which contains 4.
/// So the new facet value belongs in that first key/value pair.
///
/// 2. The first key of the level has a lower bound of `5`. We return this key/value pair
/// but will need to change the lowerbound of this key to `4` in order to insert this facet value.
2022-08-31 13:03:36 +02:00
fn find_insertion_key_value (
2022-08-30 15:22:39 +02:00
& self ,
field_id : u16 ,
level : u8 ,
2022-09-07 16:44:08 +02:00
facet_value : & [ u8 ] ,
2024-07-09 17:25:39 +02:00
txn : & RoTxn < '_ > ,
2022-09-05 13:01:36 +02:00
) -> Result < ( FacetGroupKey < Vec < u8 > > , FacetGroupValue ) > {
2022-09-07 16:44:08 +02:00
assert! ( level > 0 ) ;
2022-11-14 14:16:14 +01:00
match self . db . get_lower_than_or_equal_to (
txn ,
& FacetGroupKey { field_id , level , left_bound : facet_value } ,
) ? {
Some ( ( key , value ) ) = > {
if key . level ! = level {
let mut prefix = vec! [ ] ;
prefix . extend_from_slice ( & field_id . to_be_bytes ( ) ) ;
prefix . push ( level ) ;
2023-11-22 18:21:19 +01:00
let mut iter = self
. db
2023-11-27 11:52:22 +01:00
. remap_types ::< Bytes , FacetGroupValueCodec > ( )
2023-11-22 18:21:19 +01:00
. prefix_iter ( txn , prefix . as_slice ( ) ) ? ;
2022-11-14 14:16:14 +01:00
let ( key_bytes , value ) = iter . next ( ) . unwrap ( ) ? ;
Ok ( (
2023-11-27 11:52:22 +01:00
FacetGroupKeyCodec ::< BytesRefCodec > ::bytes_decode ( key_bytes )
2023-11-22 18:21:19 +01:00
. map_err ( Error ::Encoding ) ?
2022-11-14 14:16:14 +01:00
. into_owned ( ) ,
value ,
) )
} else {
Ok ( ( key . into_owned ( ) , value ) )
2022-08-30 15:22:39 +02:00
}
2022-11-14 14:16:14 +01:00
}
None = > {
// We checked that the level is > 0
// Since all keys of level 1 are greater than those of level 0,
// we are guaranteed that db.get_lower_than_or_equal_to(key) exists
panic! ( )
2022-08-30 15:22:39 +02:00
}
}
}
2022-09-07 16:44:08 +02:00
/// Insert the given facet value and corresponding document ids in the level 0 of the database
///
/// ## Return
/// See documentation of `insert_in_level`
2024-02-21 18:42:16 +01:00
fn modify_in_level_0 (
2022-08-30 15:22:39 +02:00
& self ,
2024-07-09 17:25:39 +02:00
txn : & mut RwTxn < '_ > ,
2022-08-30 15:22:39 +02:00
field_id : u16 ,
2022-09-07 16:44:08 +02:00
facet_value : & [ u8 ] ,
2024-02-21 18:42:16 +01:00
add_docids : Option < & RoaringBitmap > ,
del_docids : Option < & RoaringBitmap > ,
) -> Result < ModificationResult > {
2022-09-07 16:44:08 +02:00
let key = FacetGroupKey { field_id , level : 0 , left_bound : facet_value } ;
2022-08-30 15:22:39 +02:00
2024-02-21 18:42:16 +01:00
let old_value = self . db . get ( txn , & key ) ? ;
match ( old_value , add_docids , del_docids ) {
// Addition + deletion on an existing value
( Some ( FacetGroupValue { bitmap , .. } ) , Some ( add_docids ) , Some ( del_docids ) ) = > {
2024-02-22 14:53:33 +01:00
let value = FacetGroupValue { bitmap : ( bitmap - del_docids ) | add_docids , size : 1 } ;
2024-02-21 18:42:16 +01:00
self . db . put ( txn , & key , & value ) ? ;
Ok ( ModificationResult ::InPlace )
}
// Addition on an existing value
( Some ( FacetGroupValue { bitmap , .. } ) , Some ( add_docids ) , None ) = > {
let value = FacetGroupValue { bitmap : bitmap | add_docids , size : 1 } ;
self . db . put ( txn , & key , & value ) ? ;
Ok ( ModificationResult ::InPlace )
}
// Addition of a new value (ignore deletion)
( None , Some ( add_docids ) , _ ) = > {
let value = FacetGroupValue { bitmap : add_docids . clone ( ) , size : 1 } ;
self . db . put ( txn , & key , & value ) ? ;
Ok ( ModificationResult ::Insert )
}
// Deletion on an existing value, fully delete the key if the resulted value is empty.
( Some ( FacetGroupValue { mut bitmap , .. } ) , None , Some ( del_docids ) ) = > {
bitmap - = del_docids ;
if bitmap . is_empty ( ) {
// Full deletion
let mut next_key = None ;
if let Some ( ( next , _ ) ) =
self . db . remap_data_type ::< DecodeIgnore > ( ) . get_greater_than ( txn , & key ) ?
{
if next . field_id = = field_id & & next . level = = 0 {
next_key = Some ( next . left_bound . to_vec ( ) ) ;
}
}
self . db . delete ( txn , & key ) ? ;
Ok ( ModificationResult ::Remove { next : next_key } )
} else {
// Partial deletion
let value = FacetGroupValue { bitmap , size : 1 } ;
2022-08-30 15:22:39 +02:00
self . db . put ( txn , & key , & value ) ? ;
2024-02-21 18:42:16 +01:00
Ok ( ModificationResult ::InPlace )
2022-08-30 15:22:39 +02:00
}
}
2024-02-21 18:42:16 +01:00
// Otherwise do nothing (None + no addition + deletion == Some + no addition + no deletion == Nothing),
// may be unreachable at some point.
( None , None , _ ) | ( Some ( _ ) , None , None ) = > Ok ( ModificationResult ::Nothing ) ,
2022-08-30 15:22:39 +02:00
}
}
2022-09-07 16:44:08 +02:00
2024-02-21 18:42:16 +01:00
/// Split a level node into two balanced nodes.
2022-09-07 16:44:08 +02:00
///
2024-02-21 18:42:16 +01:00
/// # Return
/// Returns `ModificationResult::Insert` if the split is successful.
fn split_group (
2022-08-30 15:22:39 +02:00
& self ,
2024-07-09 17:25:39 +02:00
txn : & mut RwTxn < '_ > ,
2022-08-30 15:22:39 +02:00
field_id : u16 ,
level : u8 ,
2024-02-21 18:42:16 +01:00
insertion_key : FacetGroupKey < Vec < u8 > > ,
insertion_value : FacetGroupValue ,
) -> Result < ModificationResult > {
let size_left = insertion_value . size / 2 ;
let size_right = insertion_value . size - size_left ;
2022-08-30 15:22:39 +02:00
2022-09-07 16:44:08 +02:00
let level_below = level - 1 ;
2022-08-30 15:22:39 +02:00
2022-09-07 16:44:08 +02:00
let start_key = FacetGroupKey {
field_id ,
level : level_below ,
left_bound : insertion_key . left_bound . as_slice ( ) ,
} ;
2022-08-30 15:22:39 +02:00
2022-09-08 11:53:01 +02:00
let mut iter =
2022-10-27 16:58:13 +02:00
self . db . range ( txn , & ( start_key .. ) ) ? . take ( ( size_left as usize ) + ( size_right as usize ) ) ;
2022-08-30 15:22:39 +02:00
2022-09-07 16:44:08 +02:00
let group_left = {
let mut values_left = RoaringBitmap ::new ( ) ;
let mut i = 0 ;
2022-10-27 16:58:13 +02:00
for next in iter . by_ref ( ) {
2022-09-07 16:44:08 +02:00
let ( _key , value ) = next ? ;
i + = 1 ;
values_left | = & value . bitmap ;
if i = = size_left {
break ;
2022-08-30 15:22:39 +02:00
}
2022-09-07 16:44:08 +02:00
}
2022-08-30 15:22:39 +02:00
2022-09-07 16:44:08 +02:00
let key =
FacetGroupKey { field_id , level , left_bound : insertion_key . left_bound . clone ( ) } ;
2022-11-04 01:27:46 +01:00
let value = FacetGroupValue { size : size_left , bitmap : values_left } ;
2022-09-07 16:44:08 +02:00
( key , value )
} ;
2022-08-30 15:22:39 +02:00
2022-09-07 16:44:08 +02:00
let group_right = {
let (
FacetGroupKey { left_bound : right_left_bound , .. } ,
FacetGroupValue { bitmap : mut values_right , .. } ,
) = iter . next ( ) . unwrap ( ) ? ;
2022-08-30 15:22:39 +02:00
2022-10-27 16:58:13 +02:00
for next in iter . by_ref ( ) {
2022-09-07 16:44:08 +02:00
let ( _ , value ) = next ? ;
values_right | = & value . bitmap ;
}
2022-08-30 15:22:39 +02:00
2022-09-07 16:44:08 +02:00
let key = FacetGroupKey { field_id , level , left_bound : right_left_bound . to_vec ( ) } ;
2022-11-04 01:27:46 +01:00
let value = FacetGroupValue { size : size_right , bitmap : values_right } ;
2022-09-07 16:44:08 +02:00
( key , value )
} ;
drop ( iter ) ;
2022-08-30 15:22:39 +02:00
2022-09-07 16:44:08 +02:00
let _ = self . db . delete ( txn , & insertion_key . as_ref ( ) ) ? ;
self . db . put ( txn , & group_left . 0. as_ref ( ) , & group_left . 1 ) ? ;
self . db . put ( txn , & group_right . 0. as_ref ( ) , & group_right . 1 ) ? ;
2024-02-21 18:42:16 +01:00
Ok ( ModificationResult ::Insert )
2022-08-30 15:22:39 +02:00
}
2024-02-26 15:40:15 +01:00
/// Remove the docids still present in the related sub-level nodes from the del_docids.
///
/// This process is needed to avoid removing docids from a group node where the docid is present in several sub-nodes.
2024-02-21 18:42:16 +01:00
fn trim_del_docids < ' a > (
2022-08-30 15:22:39 +02:00
& self ,
2024-07-09 17:25:39 +02:00
txn : & mut RwTxn < '_ > ,
2022-08-30 15:22:39 +02:00
field_id : u16 ,
2024-02-21 18:42:16 +01:00
level : u8 ,
insertion_key : & FacetGroupKey < Vec < u8 > > ,
insertion_value_size : usize ,
del_docids : & ' a RoaringBitmap ,
) -> Result < std ::borrow ::Cow < ' a , RoaringBitmap > > {
let level_below = level - 1 ;
let start_key = FacetGroupKey {
field_id ,
level : level_below ,
left_bound : insertion_key . left_bound . as_slice ( ) ,
} ;
let mut del_docids = std ::borrow ::Cow ::Borrowed ( del_docids ) ;
let iter = self . db . range ( txn , & ( start_key .. ) ) ? . take ( insertion_value_size ) ;
for next in iter {
let ( _ , value ) = next ? ;
// if a sublevel bitmap as common docids with del_docids,
// then these docids shouldn't be removed and so, remove them from the deletion list.
if ! value . bitmap . is_disjoint ( & del_docids ) {
* del_docids . to_mut ( ) - = value . bitmap ;
}
}
Ok ( del_docids )
}
/// Modify the given facet value and corresponding document ids in all the levels of the database up to the given `level`.
/// This function works recursively.
///
/// ## Return
/// Returns the effect of modifying the facet value to the database on the given `level`.
///
fn modify_in_level (
& self ,
2024-07-09 17:25:39 +02:00
txn : & mut RwTxn < '_ > ,
2024-02-21 18:42:16 +01:00
field_id : u16 ,
level : u8 ,
2022-09-07 16:44:08 +02:00
facet_value : & [ u8 ] ,
2024-02-21 18:42:16 +01:00
add_docids : Option < & RoaringBitmap > ,
del_docids : Option < & RoaringBitmap > ,
) -> Result < ModificationResult > {
if level = = 0 {
return self . modify_in_level_0 ( txn , field_id , facet_value , add_docids , del_docids ) ;
}
let result =
self . modify_in_level ( txn , field_id , level - 1 , facet_value , add_docids , del_docids ) ? ;
// level below inserted an element
if let ModificationResult ::Nothing = result {
// if the previous level has not been modified,
// early return ModificationResult::Nothing.
return Ok ( ModificationResult ::Nothing ) ;
}
let ( insertion_key , insertion_value ) =
self . find_insertion_key_value ( field_id , level , facet_value , txn ) ? ;
let insertion_value_size = insertion_value . size as usize ;
let mut insertion_value_was_modified = false ;
let mut updated_value = insertion_value ;
if let ModificationResult ::Insert = result {
// if a key has been inserted in the sub-level raise the value size.
updated_value . size + = 1 ;
insertion_value_was_modified = true ;
} else if let ModificationResult ::Remove { .. } = result {
if updated_value . size < = 1 {
// if the only remaining node is the one to delete,
// delete the key instead and early return.
let is_deleted = self . db . delete ( txn , & insertion_key . as_ref ( ) ) ? ;
assert! ( is_deleted ) ;
return Ok ( result ) ;
} else {
// Reduce the value size
updated_value . size - = 1 ;
insertion_value_was_modified = true ;
}
}
let ( insertion_key , insertion_key_modification ) =
if let ModificationResult ::InPlace = result {
( insertion_key , ModificationResult ::InPlace )
} else {
// Inserting or deleting the facet value in the level below resulted in the creation
// of a new key. Therefore, it may be the case that we need to modify the left bound of the
// insertion key (see documentation of `find_insertion_key_value` for an example of when that
// could happen).
let mut new_insertion_key = insertion_key . clone ( ) ;
let mut key_modification = ModificationResult ::InPlace ;
if let ModificationResult ::Remove { next } | ModificationResult ::Reduce { next } =
result
{
// if the deleted facet_value is the left_bound of the current node,
// the left_bound should be updated reducing the current node.
let reduced_range = facet_value = = insertion_key . left_bound ;
if reduced_range {
new_insertion_key . left_bound = next . clone ( ) . unwrap ( ) ;
key_modification = ModificationResult ::Reduce { next } ;
}
} else if facet_value < insertion_key . left_bound . as_slice ( ) {
// if the added facet_value is the under the left_bound of the current node,
// the left_bound should be updated expanding the current node.
new_insertion_key . left_bound = facet_value . to_vec ( ) ;
key_modification = ModificationResult ::Expand ;
}
if matches! (
key_modification ,
ModificationResult ::Expand | ModificationResult ::Reduce { .. }
) {
// if the node should be updated, delete it, it will be recreated using a new key later.
let is_deleted = self . db . delete ( txn , & insertion_key . as_ref ( ) ) ? ;
assert! ( is_deleted ) ;
}
( new_insertion_key , key_modification )
} ;
if updated_value . size < self . max_group_size {
// If there are docids to delete, trim them avoiding unexpected removal.
2024-02-26 15:40:15 +01:00
if let Some ( del_docids ) = del_docids
2024-02-21 18:42:16 +01:00
. map ( | ids | {
self . trim_del_docids (
txn ,
field_id ,
level ,
& insertion_key ,
insertion_value_size ,
ids ,
)
} )
. transpose ( ) ?
2024-02-26 15:40:15 +01:00
. filter ( | ids | ! ids . is_empty ( ) )
{
2024-02-21 18:42:16 +01:00
updated_value . bitmap - = & * del_docids ;
insertion_value_was_modified = true ;
}
if let Some ( add_docids ) = add_docids {
updated_value . bitmap | = add_docids ;
insertion_value_was_modified = true ;
}
if insertion_value_was_modified
| | matches! (
insertion_key_modification ,
ModificationResult ::Expand | ModificationResult ::Reduce { .. }
)
{
2024-04-14 14:11:34 +02:00
// if any modification occurred, insert it in the database.
2024-02-21 18:42:16 +01:00
self . db . put ( txn , & insertion_key . as_ref ( ) , & updated_value ) ? ;
Ok ( insertion_key_modification )
} else {
// this case is reachable when a docid is removed from a sub-level node but is still present in another one.
// For instance, a document containing 2 and 3, if 2 is removed, the docid should remain in the group node [1..4].
Ok ( ModificationResult ::Nothing )
}
} else {
// We've increased the group size of the value and realised it has become greater than or equal to `max_group_size`
// Therefore it must be split into two nodes.
self . split_group ( txn , field_id , level , insertion_key , updated_value )
}
}
/// Modify the given facet value and corresponding document ids in the database.
/// If no more document ids correspond to the facet value, delete it completely.
///
/// ## Return
/// Returns `true` if some tree-nodes of the highest level have been removed or added implying a potential
/// addition or deletion of a facet level.
/// Otherwise returns `false` if the tree-nodes have been modified in place.
pub fn modify (
& self ,
2024-07-09 17:25:39 +02:00
txn : & mut RwTxn < '_ > ,
2024-02-21 18:42:16 +01:00
field_id : u16 ,
facet_value : & [ u8 ] ,
add_docids : Option < & RoaringBitmap > ,
del_docids : Option < & RoaringBitmap > ,
) -> Result < bool > {
if add_docids . map_or ( true , RoaringBitmap ::is_empty )
& & del_docids . map_or ( true , RoaringBitmap ::is_empty )
{
return Ok ( false ) ;
2022-08-30 15:22:39 +02:00
}
2022-10-27 16:58:13 +02:00
let highest_level = get_highest_level ( txn , self . db , field_id ) ? ;
2022-08-30 15:22:39 +02:00
2024-02-21 18:42:16 +01:00
let result = self . modify_in_level (
txn ,
field_id ,
highest_level ,
facet_value ,
add_docids ,
del_docids ,
) ? ;
2022-08-30 15:22:39 +02:00
match result {
2024-02-21 18:42:16 +01:00
ModificationResult ::InPlace
| ModificationResult ::Expand
| ModificationResult ::Nothing
| ModificationResult ::Reduce { .. } = > Ok ( false ) ,
ModificationResult ::Insert | ModificationResult ::Remove { .. } = > Ok ( true ) ,
2022-08-30 15:22:39 +02:00
}
2024-02-21 18:42:16 +01:00
}
2022-08-30 15:22:39 +02:00
2024-02-21 18:42:16 +01:00
/// Check whether the highest level has exceeded `min_level_size` * `self.group_size`.
/// If it has, we must build an addition level above it.
/// Then check whether the highest level is under `min_level_size`.
/// If it has, we must remove the complete level.
2024-07-09 17:25:39 +02:00
pub ( crate ) fn add_or_delete_level ( & self , txn : & mut RwTxn < '_ > , field_id : u16 ) -> Result < ( ) > {
2024-02-21 18:42:16 +01:00
let highest_level = get_highest_level ( txn , self . db , field_id ) ? ;
2022-08-30 15:22:39 +02:00
let mut highest_level_prefix = vec! [ ] ;
highest_level_prefix . extend_from_slice ( & field_id . to_be_bytes ( ) ) ;
highest_level_prefix . push ( highest_level ) ;
2023-11-27 11:52:22 +01:00
let size_highest_level =
self . db . remap_types ::< Bytes , Bytes > ( ) . prefix_iter ( txn , & highest_level_prefix ) ? . count ( ) ;
2022-08-30 15:22:39 +02:00
2024-02-21 18:42:16 +01:00
if size_highest_level > = self . group_size as usize * self . min_level_size as usize {
self . add_level ( txn , field_id , highest_level , & highest_level_prefix , size_highest_level )
} else if size_highest_level < self . min_level_size as usize & & highest_level ! = 0 {
self . delete_level ( txn , & highest_level_prefix )
} else {
Ok ( ( ) )
}
}
/// Delete a level.
2024-07-09 17:25:39 +02:00
fn delete_level ( & self , txn : & mut RwTxn < '_ > , highest_level_prefix : & [ u8 ] ) -> Result < ( ) > {
2024-02-21 18:42:16 +01:00
let mut to_delete = vec! [ ] ;
let mut iter =
2024-02-22 14:53:33 +01:00
self . db . remap_types ::< Bytes , Bytes > ( ) . prefix_iter ( txn , highest_level_prefix ) ? ;
2024-02-21 18:42:16 +01:00
for el in iter . by_ref ( ) {
let ( k , _ ) = el ? ;
to_delete . push (
FacetGroupKeyCodec ::< BytesRefCodec > ::bytes_decode ( k )
. map_err ( Error ::Encoding ) ?
. into_owned ( ) ,
) ;
2022-08-30 15:22:39 +02:00
}
2024-02-21 18:42:16 +01:00
drop ( iter ) ;
for k in to_delete {
self . db . delete ( txn , & k . as_ref ( ) ) ? ;
}
Ok ( ( ) )
}
2022-08-30 15:22:39 +02:00
2024-02-21 18:42:16 +01:00
/// Build an additional level for the field id.
fn add_level (
& self ,
2024-07-09 17:25:39 +02:00
txn : & mut RwTxn < '_ > ,
2024-02-21 18:42:16 +01:00
field_id : u16 ,
highest_level : u8 ,
highest_level_prefix : & [ u8 ] ,
size_highest_level : usize ,
) -> Result < ( ) > {
2022-08-30 15:22:39 +02:00
let mut groups_iter = self
. db
2023-11-27 11:52:22 +01:00
. remap_types ::< Bytes , FacetGroupValueCodec > ( )
2024-02-22 14:53:33 +01:00
. prefix_iter ( txn , highest_level_prefix ) ? ;
2022-08-30 15:22:39 +02:00
2022-09-08 11:53:01 +02:00
let nbr_new_groups = size_highest_level / self . group_size as usize ;
let nbr_leftover_elements = size_highest_level % self . group_size as usize ;
2022-08-30 15:22:39 +02:00
let mut to_add = vec! [ ] ;
2022-09-08 11:53:01 +02:00
for _ in 0 .. nbr_new_groups {
2022-08-30 15:22:39 +02:00
let mut first_key = None ;
let mut values = RoaringBitmap ::new ( ) ;
2024-02-21 18:42:16 +01:00
for _ in 0 .. self . group_size {
2022-08-30 15:22:39 +02:00
let ( key_bytes , value_i ) = groups_iter . next ( ) . unwrap ( ) ? ;
2023-11-27 11:52:22 +01:00
let key_i = FacetGroupKeyCodec ::< BytesRefCodec > ::bytes_decode ( key_bytes )
2023-11-22 18:21:19 +01:00
. map_err ( Error ::Encoding ) ? ;
2022-08-30 15:22:39 +02:00
if first_key . is_none ( ) {
first_key = Some ( key_i ) ;
}
values | = value_i . bitmap ;
}
2022-09-05 13:01:36 +02:00
let key = FacetGroupKey {
2022-08-30 15:22:39 +02:00
field_id ,
level : highest_level + 1 ,
left_bound : first_key . unwrap ( ) . left_bound ,
} ;
2024-02-21 18:42:16 +01:00
let value = FacetGroupValue { size : self . group_size , bitmap : values } ;
2022-08-30 15:22:39 +02:00
to_add . push ( ( key . into_owned ( ) , value ) ) ;
}
2022-09-08 11:53:01 +02:00
// now we add the rest of the level, in case its size is > group_size * min_level_size
// this can indeed happen if the min_level_size parameter changes between two calls to `insert`
if nbr_leftover_elements > 0 {
let mut first_key = None ;
let mut values = RoaringBitmap ::new ( ) ;
for _ in 0 .. nbr_leftover_elements {
let ( key_bytes , value_i ) = groups_iter . next ( ) . unwrap ( ) ? ;
2023-11-27 11:52:22 +01:00
let key_i = FacetGroupKeyCodec ::< BytesRefCodec > ::bytes_decode ( key_bytes )
2023-11-22 18:21:19 +01:00
. map_err ( Error ::Encoding ) ? ;
2022-09-08 11:53:01 +02:00
if first_key . is_none ( ) {
first_key = Some ( key_i ) ;
}
values | = value_i . bitmap ;
}
let key = FacetGroupKey {
field_id ,
level : highest_level + 1 ,
left_bound : first_key . unwrap ( ) . left_bound ,
} ;
2022-11-30 14:27:36 +01:00
// Note: nbr_leftover_elements can be casted to a u8 since it is bounded by `max_group_size`
// when it is created above.
2022-09-08 11:53:01 +02:00
let value = FacetGroupValue { size : nbr_leftover_elements as u8 , bitmap : values } ;
to_add . push ( ( key . into_owned ( ) , value ) ) ;
}
2022-08-30 15:22:39 +02:00
drop ( groups_iter ) ;
for ( key , value ) in to_add {
self . db . put ( txn , & key . as_ref ( ) , & value ) ? ;
}
Ok ( ( ) )
}
}
2022-09-01 11:33:50 +02:00
2022-09-07 16:44:08 +02:00
impl < ' a > FacetGroupKey < & ' a [ u8 ] > {
pub fn into_owned ( self ) -> FacetGroupKey < Vec < u8 > > {
FacetGroupKey {
field_id : self . field_id ,
level : self . level ,
left_bound : self . left_bound . to_vec ( ) ,
}
}
}
2022-10-27 16:58:13 +02:00
impl FacetGroupKey < Vec < u8 > > {
2022-09-07 16:44:08 +02:00
pub fn as_ref ( & self ) -> FacetGroupKey < & [ u8 ] > {
FacetGroupKey {
field_id : self . field_id ,
level : self . level ,
left_bound : self . left_bound . as_slice ( ) ,
}
}
}
2022-09-01 11:33:50 +02:00
#[ cfg(test) ]
mod tests {
2022-09-01 11:40:29 +02:00
use rand ::seq ::SliceRandom ;
use rand ::{ Rng , SeedableRng } ;
2022-09-01 11:33:50 +02:00
use roaring ::RoaringBitmap ;
2022-10-12 09:42:55 +02:00
use crate ::heed_codec ::facet ::OrderedF64Codec ;
use crate ::heed_codec ::StrRefCodec ;
2022-09-07 18:04:07 +02:00
use crate ::milli_snap ;
2022-12-05 10:33:31 +01:00
use crate ::update ::facet ::test_helpers ::FacetIndex ;
2022-09-07 18:04:07 +02:00
2022-09-01 11:33:50 +02:00
/// Inserts 256 increasing facet values in the field 0, one write transaction per value,
/// then checks the validity of the structure and snapshots the index.
#[test]
fn append() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
    for i in 0..256u16 {
        let mut bitmap = RoaringBitmap::new();
        bitmap.insert(i as u32);
        let mut txn = index.env.write_txn().unwrap();
        index.insert(&mut txn, 0, &(i as f64), &bitmap);
        txn.commit().unwrap();
    }

    let txn = index.env.read_txn().unwrap();
    index.verify_structure_validity(&txn, 0);
    txn.commit().unwrap();

    milli_snap!(format!("{index}"));
}
/// Inserts 256 increasing facet values in the fields 0, 2 and 1 (in that order),
/// one write transaction per value, then checks the validity of the structure of
/// each field and snapshots the index.
#[test]
fn many_field_ids_append() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
    for field_id in [0, 2, 1] {
        for i in 0..256u16 {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(i as u32);
            let mut txn = index.env.write_txn().unwrap();
            index.insert(&mut txn, field_id, &(i as f64), &bitmap);
            txn.commit().unwrap();
        }
    }

    let txn = index.env.read_txn().unwrap();
    for field_id in [0, 1, 2] {
        index.verify_structure_validity(&txn, field_id);
    }
    txn.commit().unwrap();

    milli_snap!(format!("{index}"));
}
/// Inserts 256 decreasing facet values in the fields 0, 2 and 1 (in that order),
/// one write transaction per value, then checks the validity of the structure of
/// each field and snapshots the index.
#[test]
fn many_field_ids_prepend() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
    for field_id in [0, 2, 1] {
        for i in (0..256).rev() {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(i as u32);
            let mut txn = index.env.write_txn().unwrap();
            index.insert(&mut txn, field_id, &(i as f64), &bitmap);
            txn.commit().unwrap();
        }
    }

    let txn = index.env.read_txn().unwrap();
    for field_id in [0, 1, 2] {
        index.verify_structure_validity(&txn, field_id);
    }
    txn.commit().unwrap();

    milli_snap!(format!("{index}"));
}
/// Inserts the values `0..256` in descending order into field id 0 within a single write
/// transaction, verifies the structure once at the end, then commits and snapshots.
#[test]
fn prepend() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
    let mut wtxn = index.env.write_txn().unwrap();

    for value in (0..256u32).rev() {
        // Each facet value is associated with a single document id equal to the value.
        let mut docids = RoaringBitmap::new();
        docids.insert(value);
        index.insert(&mut wtxn, 0, &f64::from(value), &docids);
    }

    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();

    milli_snap!(format!("{index}"));
}
/// Inserts the values `0..256` into field id 0 in a pseudo-random order (fixed seed, so the
/// order is the same on every run) within a single write transaction, verifies the
/// structure once at the end, then commits and snapshots.
#[test]
fn shuffled() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
    let mut wtxn = index.env.write_txn().unwrap();

    let mut keys: Vec<u32> = (0..256).collect();
    let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
    keys.shuffle(&mut rng);

    for key in keys {
        // Each facet value is associated with a single document id equal to the value.
        let mut docids = RoaringBitmap::new();
        docids.insert(key);
        index.insert(&mut wtxn, 0, &f64::from(key), &docids);
    }

    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();

    milli_snap!(format!("{index}"));
}
/// Inserts the values `0..256` into field id 0 in a pseudo-random order (fixed seed). Each
/// value is inserted with two document ids: the value itself and a pseudo-random id drawn
/// from `256..512`. The structure is verified before every insertion and once more at the
/// end, then the transaction is committed and the index snapshotted.
#[test]
fn merge_values() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
    let mut wtxn = index.env.write_txn().unwrap();

    let mut keys: Vec<u32> = (0..256).collect();
    let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
    keys.shuffle(&mut rng);

    for key in keys {
        let mut docids = RoaringBitmap::new();
        docids.insert(key);
        // The same generator that shuffled the keys also provides the second document id,
        // so the call order must stay: shuffle first, then one draw per key.
        docids.insert(rng.gen_range(256..512));

        index.verify_structure_validity(&wtxn, 0);
        index.insert(&mut wtxn, 0, &f64::from(key), &docids);
    }

    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();

    milli_snap!(format!("{index}"));
}
/// Fills field id 0 with the values `0..256` (one document id per value, equal to the
/// value), then deletes them from the highest value downwards in several phases. After each
/// phase the structure is verified, the transaction committed, and a snapshot taken whose
/// name is the lower bound of the range that was just deleted.
#[test]
fn delete_from_end() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);

    // Phase 1: fill the index, then delete 255 down to 200 in the same transaction.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in 0..256u32 {
        let mut docids = RoaringBitmap::new();
        docids.insert(value);
        index.verify_structure_validity(&wtxn, 0);
        index.insert(&mut wtxn, 0, &f64::from(value), &docids);
    }
    for value in (200..256u32).rev() {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 200);

    // Phase 2: delete 199 down to 150.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in (150..200u32).rev() {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 150);

    // Phase 3: delete 149 down to 100.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in (100..150u32).rev() {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 100);

    // Phase 4: delete 99 down to 17.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in (17..100u32).rev() {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 17);

    // Phase 5: delete 16 and 15. Unlike the other phases, the structure is only verified
    // once after the loop, not before each deletion.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in (15..17u32).rev() {
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 15);

    // Phase 6: delete the remaining values, 14 down to 0.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in (0..15u32).rev() {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 0);
}
/// Fills field id 0 with the values `0..256` (one document id per value, equal to the
/// value), then deletes them from the lowest value upwards in three phases. After each
/// phase the structure is verified, the transaction committed, and a snapshot taken whose
/// name is the last value that was deleted.
#[test]
fn delete_from_start() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);

    // Phase 1: fill the index, then delete 0 up to 127 in the same transaction.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in 0..256u32 {
        let mut docids = RoaringBitmap::new();
        docids.insert(value);
        index.verify_structure_validity(&wtxn, 0);
        index.insert(&mut wtxn, 0, &f64::from(value), &docids);
    }
    // In this first deletion loop the structure is only verified once after the loop.
    for value in 0..128u32 {
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 127);

    // Phase 2: delete 128 up to 215.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in 128..216u32 {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 215);

    // Phase 3: delete the remaining values, 216 up to 255.
    let mut wtxn = index.env.write_txn().unwrap();
    for value in 216..256u32 {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(value), value);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 255);
}
/// Fills field id 0 with the values `0..256` (one document id per value, equal to the
/// value), then deletes them in a pseudo-random order (fixed seed) in three phases covering
/// the first 128, the next 88 and the last 40 shuffled keys. After each phase the structure
/// is verified, the transaction committed, and a snapshot taken.
#[test]
fn delete_shuffled() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);

    let mut wtxn = index.env.write_txn().unwrap();
    for value in 0..256u32 {
        let mut docids = RoaringBitmap::new();
        docids.insert(value);
        index.verify_structure_validity(&wtxn, 0);
        index.insert(&mut wtxn, 0, &f64::from(value), &docids);
    }

    let mut keys: Vec<u32> = (0..256).collect();
    let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
    keys.shuffle(&mut rng);

    // Phase 1: delete the first 128 shuffled keys, in the transaction that filled the index.
    for &key in &keys[..128] {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(key), key);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 127);

    // Phase 2: delete shuffled keys 128 up to 215.
    let mut wtxn = index.env.write_txn().unwrap();
    for &key in &keys[128..216] {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(key), key);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    // The snapshot is taken before the next write transaction is opened, as in every
    // other phase of these tests.
    milli_snap!(format!("{index}"), 215);

    // Phase 3: delete the remaining shuffled keys.
    let mut wtxn = index.env.write_txn().unwrap();
    for &key in &keys[216..] {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(key), key);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), 255);
}
/// Inserts into the same 16 facet values four times over. The values `0..16` are shuffled
/// once (fixed seed); on round `r` (0 to 3) every value receives one pseudo-random document
/// id drawn from `r * 256..(r + 1) * 256`. The structure is verified before every insertion
/// and once more at the end, then the transaction is committed and the index snapshotted.
#[test]
fn in_place_level0_insert() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
    let mut wtxn = index.env.write_txn().unwrap();

    let mut keys: Vec<u32> = (0..16).collect();
    let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
    keys.shuffle(&mut rng);

    for round in 0..4u32 {
        for &key in &keys {
            let docid: u32 = rng.gen_range(round * 256..(round + 1) * 256);
            let mut docids = RoaringBitmap::new();
            docids.insert(docid);

            index.verify_structure_validity(&wtxn, 0);
            index.insert(&mut wtxn, 0, &f64::from(key), &docids);
        }
    }

    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();

    milli_snap!(format!("{index}"));
}
/// Inserts the values `0..64` in a pseudo-random order (fixed seed), each with two document
/// ids (`key` and `key + 100`), snapshots the index, then removes the `key + 100` document
/// id from every value and snapshots again. Every value keeps its `key` document id after
/// the deletions.
#[test]
fn in_place_level0_delete() {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);

    let mut keys: Vec<u32> = (0..64).collect();
    let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
    keys.shuffle(&mut rng);

    let mut wtxn = index.env.write_txn().unwrap();
    for &key in &keys {
        let mut docids = RoaringBitmap::new();
        docids.insert(key);
        docids.insert(key + 100);

        index.verify_structure_validity(&wtxn, 0);
        index.insert(&mut wtxn, 0, &f64::from(key), &docids);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), "before_delete");

    let mut wtxn = index.env.write_txn().unwrap();
    for &key in &keys {
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &f64::from(key), key + 100);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), "after_delete");
}
/// String-keyed variant of the in-place deletion test. The numbers `1000..1064` are
/// shuffled (fixed seed) and each is inserted under its lowercase hexadecimal rendering as
/// facet value, with two document ids (`key` and `key + 100`). After a first snapshot, the
/// `key + 100` document id is removed from every facet value and a second snapshot is taken.
#[test]
fn shuffle_merge_string_and_delete() {
    let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);

    let mut keys: Vec<u32> = (1000..1064).collect();
    let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
    keys.shuffle(&mut rng);

    let mut wtxn = index.env.write_txn().unwrap();
    for &key in &keys {
        let mut docids = RoaringBitmap::new();
        docids.insert(key);
        docids.insert(key + 100);

        let facet_value = format!("{key:x}");
        index.verify_structure_validity(&wtxn, 0);
        index.insert(&mut wtxn, 0, &facet_value.as_str(), &docids);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), "before_delete");

    let mut wtxn = index.env.write_txn().unwrap();
    for &key in &keys {
        let facet_value = format!("{key:x}");
        index.verify_structure_validity(&wtxn, 0);
        index.delete_single_docid(&mut wtxn, 0, &facet_value.as_str(), key + 100);
    }
    index.verify_structure_validity(&wtxn, 0);
    wtxn.commit().unwrap();
    milli_snap!(format!("{index}"), "after_delete");
}
}
2022-10-12 09:42:55 +02:00
2022-10-12 10:23:40 +02:00
// fuzz tests
2022-09-08 11:53:01 +02:00
#[ cfg(all(test, fuzzing)) ]
2022-10-17 12:42:12 +02:00
/**
Fuzz test for the incremental indexer.

The fuzz test uses fuzzcheck, a coverage-guided fuzzer.
See https://github.com/loiclec/fuzzcheck-rs and https://fuzzcheck.neocities.org
for more information.

It is only run when using the `cargo fuzzcheck` command line tool, which can be installed with:
```sh
cargo install cargo-fuzzcheck
```
To start the fuzz test, run (from the base folder or from milli/):
```sh
cargo fuzzcheck update::facet::incremental::fuzz::fuzz
```
and wait a couple minutes to make sure the code was thoroughly tested, then
hit `Ctrl-C` to stop the fuzzer. The corpus generated by the fuzzer is located in milli/fuzz.

To work on this module with rust-analyzer working properly, add the following to your .cargo/config.toml file:
```toml
[build]
rustflags = ["--cfg", "fuzzing"]
```

The fuzz test generates sequences of additions and deletions to the facet database and
ensures that:
1. its structure is still internally valid
2. its content is the same as a trivially correct implementation of the same database
*/
2022-09-08 11:53:01 +02:00
mod fuzz {
use std ::collections ::{ BTreeMap , HashMap } ;
2022-10-12 10:23:40 +02:00
use std ::iter ::FromIterator ;
2022-09-08 11:53:01 +02:00
use std ::rc ::Rc ;
2022-10-12 10:23:40 +02:00
use fuzzcheck ::mutators ::integer ::U8Mutator ;
2022-09-08 11:53:01 +02:00
use fuzzcheck ::mutators ::integer_within_range ::{ U16WithinRangeMutator , U8WithinRangeMutator } ;
2022-10-12 10:23:40 +02:00
use fuzzcheck ::mutators ::vector ::VecMutator ;
2022-09-08 11:53:01 +02:00
use fuzzcheck ::DefaultMutator ;
use roaring ::RoaringBitmap ;
use tempfile ::TempDir ;
use super ::* ;
2022-12-05 10:33:31 +01:00
use crate ::update ::facet ::test_helpers ::FacetIndex ;
2022-09-08 11:53:01 +02:00
#[ derive(Default) ]
pub struct TrivialDatabase < T > {
pub elements : BTreeMap < u16 , BTreeMap < T , RoaringBitmap > > ,
}
impl < T > TrivialDatabase < T >
where
2022-11-14 14:16:14 +01:00
T : Ord + Clone + Eq + std ::fmt ::Debug ,
2022-09-08 11:53:01 +02:00
{
#[ no_coverage ]
2022-11-14 14:16:14 +01:00
pub fn insert ( & mut self , field_id : u16 , new_key : & T , new_values : & RoaringBitmap ) {
2022-09-08 11:53:01 +02:00
if new_values . is_empty ( ) {
return ;
}
let values_field_id = self . elements . entry ( field_id ) . or_default ( ) ;
2022-11-14 14:16:14 +01:00
let values = values_field_id . entry ( new_key . clone ( ) ) . or_default ( ) ;
2022-09-08 11:53:01 +02:00
* values | = new_values ;
}
#[ no_coverage ]
2022-11-14 14:16:14 +01:00
pub fn delete ( & mut self , field_id : u16 , key : & T , values_to_remove : & RoaringBitmap ) {
2022-09-08 11:53:01 +02:00
if let Some ( values_field_id ) = self . elements . get_mut ( & field_id ) {
if let Some ( values ) = values_field_id . get_mut ( & key ) {
2022-10-12 10:23:40 +02:00
* values - = values_to_remove ;
2022-09-08 11:53:01 +02:00
if values . is_empty ( ) {
values_field_id . remove ( & key ) ;
}
}
if values_field_id . is_empty ( ) {
self . elements . remove ( & field_id ) ;
}
}
}
}
#[ derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize) ]
2022-11-14 14:16:14 +01:00
struct Operation {
#[ field_mutator(VecMutator<u8, U8Mutator> = { VecMutator::new(u8::default_mutator(), 0 ..= 5) }) ]
key : Vec < u8 > ,
2022-09-08 11:53:01 +02:00
#[ field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) }) ]
group_size : u8 ,
#[ field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) }) ]
max_group_size : u8 ,
#[ field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) }) ]
min_level_size : u8 ,
#[ field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) }) ]
field_id : u16 ,
kind : OperationKind ,
}
#[ derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize) ]
enum OperationKind {
2022-10-12 10:23:40 +02:00
Insert (
#[ field_mutator(VecMutator<u8, U8Mutator> = { VecMutator::new(U8Mutator::default(), 0 ..= 10) }) ]
Vec < u8 > ,
) ,
Delete (
#[ field_mutator(VecMutator<u8, U8Mutator> = { VecMutator::new(U8Mutator::default(), 0 ..= 10) }) ]
Vec < u8 > ,
) ,
2022-09-08 11:53:01 +02:00
}
#[ no_coverage ]
2022-11-14 14:16:14 +01:00
fn compare_with_trivial_database ( tempdir : Rc < TempDir > , operations : & [ Operation ] ) {
2023-11-27 11:52:22 +01:00
let index = FacetIndex ::< BytesRefCodec > ::open_from_tempdir ( tempdir , 4 , 8 , 5 ) ; // dummy params, they'll be overwritten
2022-09-08 11:53:01 +02:00
let mut txn = index . env . write_txn ( ) . unwrap ( ) ;
2022-11-14 14:16:14 +01:00
let mut trivial_db = TrivialDatabase ::< Vec < u8 > > ::default ( ) ;
let mut value_to_keys = HashMap ::< u8 , Vec < Vec < u8 > > > ::new ( ) ;
2022-09-08 11:53:01 +02:00
for Operation { key , group_size , max_group_size , min_level_size , field_id , kind } in
operations
{
index . set_group_size ( * group_size ) ;
index . set_max_group_size ( * max_group_size ) ;
index . set_min_level_size ( * min_level_size ) ;
match kind {
OperationKind ::Insert ( values ) = > {
let mut bitmap = RoaringBitmap ::new ( ) ;
for value in values {
bitmap . insert ( * value as u32 ) ;
2022-11-14 14:16:14 +01:00
value_to_keys . entry ( * value ) . or_default ( ) . push ( key . clone ( ) ) ;
2022-09-08 11:53:01 +02:00
}
2022-11-14 14:16:14 +01:00
index . insert ( & mut txn , * field_id , & key . as_slice ( ) , & bitmap ) ;
trivial_db . insert ( * field_id , & key , & bitmap ) ;
2022-09-08 11:53:01 +02:00
}
2022-10-12 10:23:40 +02:00
OperationKind ::Delete ( values ) = > {
let values = RoaringBitmap ::from_iter ( values . iter ( ) . copied ( ) . map ( | x | x as u32 ) ) ;
let mut values_per_key = HashMap ::new ( ) ;
for value in values {
if let Some ( keys ) = value_to_keys . get ( & ( value as u8 ) ) {
for key in keys {
let values : & mut RoaringBitmap =
values_per_key . entry ( key ) . or_default ( ) ;
values . insert ( value ) ;
}
2022-09-08 11:53:01 +02:00
}
}
2022-10-12 10:23:40 +02:00
for ( key , values ) in values_per_key {
2022-11-14 14:16:14 +01:00
index . delete ( & mut txn , * field_id , & key . as_slice ( ) , & values ) ;
trivial_db . delete ( * field_id , & key , & values ) ;
2022-10-12 10:23:40 +02:00
}
2022-09-08 11:53:01 +02:00
}
}
}
for ( field_id , values_field_id ) in trivial_db . elements . iter ( ) {
let level0iter = index
. content
. as_polymorph ( )
2023-11-27 11:52:22 +01:00
. prefix_iter ::< _ , Bytes , FacetGroupValueCodec > ( & mut txn , & field_id . to_be_bytes ( ) )
2022-09-08 11:53:01 +02:00
. unwrap ( ) ;
for ( ( key , values ) , group ) in values_field_id . iter ( ) . zip ( level0iter ) {
let ( group_key , group_values ) = group . unwrap ( ) ;
2022-11-14 14:16:14 +01:00
let group_key =
2023-11-27 11:52:22 +01:00
FacetGroupKeyCodec ::< BytesRefCodec > ::bytes_decode ( group_key ) . unwrap ( ) ;
2022-09-08 11:53:01 +02:00
assert_eq! ( key , & group_key . left_bound ) ;
assert_eq! ( values , & group_values . bitmap ) ;
}
}
for ( field_id , values_field_id ) in trivial_db . elements . iter ( ) {
let level0iter = index
. content
. as_polymorph ( )
2023-11-27 11:52:22 +01:00
. prefix_iter ::< _ , Bytes , FacetGroupValueCodec > ( & txn , & field_id . to_be_bytes ( ) )
2022-09-08 11:53:01 +02:00
. unwrap ( ) ;
for ( ( key , values ) , group ) in values_field_id . iter ( ) . zip ( level0iter ) {
let ( group_key , group_values ) = group . unwrap ( ) ;
2022-11-14 14:16:14 +01:00
let group_key =
2023-11-27 11:52:22 +01:00
FacetGroupKeyCodec ::< BytesRefCodec > ::bytes_decode ( group_key ) . unwrap ( ) ;
2022-09-08 11:53:01 +02:00
assert_eq! ( key , & group_key . left_bound ) ;
assert_eq! ( values , & group_values . bitmap ) ;
}
index . verify_structure_validity ( & txn , * field_id ) ;
}
txn . abort ( ) . unwrap ( ) ;
}
/// Entry point of the fuzz test, meant to be launched through `cargo fuzzcheck`.
///
/// Every generated sequence of [`Operation`]s is passed to
/// `compare_with_trivial_database`, always with the same temporary directory. The test
/// fails if the fuzzer reports a failing input.
#[test]
#[no_coverage]
fn fuzz() {
    // `tempdir` itself stays alive until the end of the test; the closure owns a clone.
    let tempdir = Rc::new(TempDir::new().unwrap());
    let tempdir_cloned = tempdir.clone();

    let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| {
        compare_with_trivial_database(tempdir_cloned.clone(), operations)
    })
    .default_mutator()
    .serde_serializer()
    // Keep only functions from this source file, and among those drop the ones whose
    // name contains one of the substrings below.
    .default_sensor_and_pool_with_custom_filter(|file, function| {
        file == std::path::Path::new("milli/src/update/facet/incremental.rs")
            && !function.contains("serde")
            && !function.contains("tests::")
            && !function.contains("fuzz::")
            && !function.contains("display_bitmap")
    })
    .arguments_from_cargo_fuzzcheck()
    .launch();

    assert!(!result.found_test_failure);
}
}