Document and refine facet indexing algorithms

This commit is contained in:
Loïc Lecrenier 2022-09-07 16:44:08 +02:00 committed by Loïc Lecrenier
parent bee3c23b45
commit 27454e9828
5 changed files with 387 additions and 291 deletions

View File

@ -29,31 +29,14 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
}
}
/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
/// databases.
///
/// A key identifies a node by the field it belongs to, its level, and the
/// left bound of the range of facet values that the node covers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct FacetGroupKey<T> {
/// The id of the field that the facet values belong to.
pub field_id: u16,
/// The level of the node identified by this key.
pub level: u8,
/// The left bound of the range of facet values covered by the node.
pub left_bound: T,
}
impl<'a> FacetGroupKey<&'a [u8]> {
    /// Converts a key that borrows its left bound into a key that owns it.
    ///
    /// The `field_id` and `level` are carried over unchanged, while the bytes of
    /// `left_bound` are copied into a newly allocated `Vec<u8>`.
    pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> {
        let Self { field_id, level, left_bound } = self;
        FacetGroupKey { field_id, level, left_bound: left_bound.to_vec() }
    }
}
impl FacetGroupKey<Vec<u8>> {
    /// Borrows this owned key as a `FacetGroupKey<&[u8]>`.
    ///
    /// The `field_id` and `level` are copied, and `left_bound` is exposed as a
    /// slice into the bytes owned by `self`, so no allocation takes place.
    pub fn as_ref(&self) -> FacetGroupKey<&[u8]> {
        FacetGroupKey {
            field_id: self.field_id,
            level: self.level,
            left_bound: self.left_bound.as_slice(),
        }
    }
}
#[derive(Debug)]
pub struct FacetGroupValue {

View File

@ -1,24 +1,30 @@
use std::borrow::Cow;
use std::cmp;
use std::fs::File;
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::{BytesEncode, Error, RoTxn, RwTxn};
use log::debug;
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use crate::facet::FacetType;
use crate::heed_codec::facet::{
ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::{BytesEncode, Error, RoTxn, RwTxn};
use log::debug;
use roaring::RoaringBitmap;
use std::borrow::Cow;
use std::fs::File;
use time::OffsetDateTime;
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
/// by rebuilding the database "from scratch".
///
/// First, the new elements are inserted into the level 0 of the database. Then, the
/// higher levels are cleared and recomputed from the content of level 0.
///
/// Finally, the `faceted_documents_ids` value in the main database of `Index`
/// is updated to contain the new set of faceted documents.
pub struct FacetsUpdateBulk<'i> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
group_size: u8,
min_level_size: u8,
facet_type: FacetType,
@ -31,22 +37,10 @@ impl<'i> FacetsUpdateBulk<'i> {
index: &'i Index,
facet_type: FacetType,
new_data: grenad::Reader<File>,
group_size: u8,
min_level_size: u8,
) -> FacetsUpdateBulk<'i> {
FacetsUpdateBulk {
index,
database: match facet_type {
FacetType::String => index
.facet_id_string_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
FacetType::Number => {
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
}
},
group_size: 4,
min_level_size: 5,
facet_type,
new_data: Some(new_data),
}
FacetsUpdateBulk { index, group_size, min_level_size, facet_type, new_data: Some(new_data) }
}
pub fn new_not_updating_level_0(
@ -55,44 +49,31 @@ impl<'i> FacetsUpdateBulk<'i> {
) -> FacetsUpdateBulk<'i> {
FacetsUpdateBulk {
index,
database: match facet_type {
FacetType::String => index
.facet_id_string_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
FacetType::Number => {
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
}
},
group_size: 4,
min_level_size: 5,
group_size: FACET_GROUP_SIZE,
min_level_size: FACET_MIN_LEVEL_SIZE,
facet_type,
new_data: None,
}
}
/// The number of elements from the level below that are represented by a single element in the level above
///
/// This setting is always greater than or equal to 2.
pub fn level_group_size(mut self, value: u8) -> Self {
self.group_size = cmp::max(value, 2);
self
}
/// The minimum number of elements that a level is allowed to have.
pub fn min_level_size(mut self, value: u8) -> Self {
self.min_level_size = cmp::max(value, 2);
self
}
#[logging_timer::time("FacetsUpdateBulk::{}")]
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
let Self { index, database, group_size, min_level_size, facet_type, new_data } = self;
let Self { index, group_size, min_level_size, facet_type, new_data } = self;
let db = match facet_type {
FacetType::String => {
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
}
FacetType::Number => {
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
}
};
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size };
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Box<[_]>>();
@ -105,6 +86,7 @@ impl<'i> FacetsUpdateBulk<'i> {
}
}
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
pub new_data: Option<grenad::Reader<R>>,

View File

@ -1,16 +1,14 @@
use std::collections::HashMap;
use std::fs::File;
use heed::types::ByteSlice;
use heed::{BytesDecode, Error, RoTxn, RwTxn};
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::{
ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::search::facet::get_highest_level;
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, Error, RoTxn, RwTxn};
use roaring::RoaringBitmap;
use std::collections::HashMap;
use std::fs::File;
enum InsertionResult {
InPlace,
@ -18,10 +16,15 @@ enum InsertionResult {
}
enum DeletionResult {
InPlace,
Reduce { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
Remove { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
Reduce { next: Option<Vec<u8>> },
Remove { next: Option<Vec<u8>> },
}
/// Algorithm to incrementally insert and delete elements into the
/// `facet_id_(string/f64)_docids` databases.
///
/// The `faceted_documents_ids` value in the main database of `Index`
/// is also updated to contain the new set of faceted documents.
pub struct FacetsUpdateIncremental<'i> {
index: &'i Index,
inner: FacetsUpdateIncrementalInner,
@ -30,7 +33,14 @@ pub struct FacetsUpdateIncremental<'i> {
}
impl<'i> FacetsUpdateIncremental<'i> {
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
pub fn new(
index: &'i Index,
facet_type: FacetType,
new_data: grenad::Reader<File>,
group_size: u8,
min_level_size: u8,
max_group_size: u8,
) -> Self {
FacetsUpdateIncremental {
index,
inner: FacetsUpdateIncrementalInner {
@ -42,26 +52,15 @@ impl<'i> FacetsUpdateIncremental<'i> {
.facet_id_f64_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
},
group_size: 4,
max_group_size: 8,
min_level_size: 5,
group_size,
max_group_size,
min_level_size,
},
facet_type,
new_data,
}
}
pub fn group_size(mut self, size: u8) -> Self {
self.inner.group_size = size;
self
}
pub fn min_level_size(mut self, size: u8) -> Self {
self.inner.min_level_size = size;
self
}
pub fn max_group_size(mut self, size: u8) -> Self {
self.inner.max_group_size = size;
self
}
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
@ -83,6 +82,7 @@ impl<'i> FacetsUpdateIncremental<'i> {
}
}
/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type
pub struct FacetsUpdateIncrementalInner {
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
pub group_size: u8,
@ -90,22 +90,36 @@ pub struct FacetsUpdateIncrementalInner {
pub max_group_size: u8,
}
impl FacetsUpdateIncrementalInner {
/// Find the `FacetGroupKey`/`FacetGroupValue` in the database that
/// should be used to insert the new `facet_value` for the given `field_id` and `level`
/// where `level` must be strictly greater than 0.
///
/// For example, when inserting the facet value `4`, there are two possibilities:
///
/// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore,
/// we know that the implicit range of the first key is 3..6, which contains 4.
/// So the new facet value belongs in that first key/value pair.
///
/// 2. The first key of the level has a lower bound of `5`. We return this key/value pair
/// but will need to change the lower bound of this key to `4` in order to insert this facet value.
fn find_insertion_key_value(
&self,
field_id: u16,
level: u8,
search_key: &[u8],
facet_value: &[u8],
txn: &RoTxn,
) -> Result<(FacetGroupKey<Vec<u8>>, FacetGroupValue)> {
assert!(level > 0);
let mut prefix = vec![];
prefix.extend_from_slice(&field_id.to_be_bytes());
prefix.push(level);
prefix.extend_from_slice(search_key);
prefix.extend_from_slice(facet_value);
let mut prefix_iter = self
.db
.as_polymorph()
.prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(txn, &prefix.as_slice())?;
.prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?;
if let Some(e) = prefix_iter.next() {
let (key_bytes, value) = e?;
Ok((
@ -115,10 +129,10 @@ impl FacetsUpdateIncrementalInner {
value,
))
} else {
let key = FacetGroupKey { field_id, level, left_bound: search_key };
let key = FacetGroupKey { field_id, level, left_bound: facet_value };
match self.db.get_lower_than(txn, &key)? {
Some((key, value)) => {
if key.level != level || key.field_id != field_id {
if key.level != level {
let mut prefix = vec![];
prefix.extend_from_slice(&field_id.to_be_bytes());
prefix.push(level);
@ -126,7 +140,7 @@ impl FacetsUpdateIncrementalInner {
let mut iter = self
.db
.as_polymorph()
.prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(
.prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(
txn,
&prefix.as_slice(),
)?;
@ -146,15 +160,19 @@ impl FacetsUpdateIncrementalInner {
}
}
/// Insert the given facet value and corresponding document ids in the level 0 of the database
///
/// ## Return
/// See documentation of `insert_in_level`
fn insert_in_level_0<'t>(
&self,
txn: &'t mut RwTxn,
field_id: u16,
new_key: &[u8],
new_values: &RoaringBitmap,
facet_value: &[u8],
docids: &RoaringBitmap,
) -> Result<InsertionResult> {
let key = FacetGroupKey { field_id, level: 0, left_bound: new_key };
let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 };
let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
let value = FacetGroupValue { bitmap: docids.clone(), size: 1 };
let mut level0_prefix = vec![];
level0_prefix.extend_from_slice(&field_id.to_be_bytes());
@ -163,7 +181,7 @@ impl FacetsUpdateIncrementalInner {
let mut iter = self
.db
.as_polymorph()
.prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level0_prefix)?;
.prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, &level0_prefix)?;
if iter.next().is_none() {
drop(iter);
@ -186,143 +204,158 @@ impl FacetsUpdateIncrementalInner {
}
}
}
/// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`.
/// This function works recursively.
///
/// ## Return
/// Returns the effect of adding the facet value to the database on the given `level`.
///
/// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have
/// an effect on the number of keys in that level. Therefore, it did not increase the number of children
/// of the parent node.
///
/// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted
/// in the addition of a new key in that level, and that therefore the number of children
/// of the parent node should be incremented.
fn insert_in_level<'t>(
&self,
txn: &'t mut RwTxn,
field_id: u16,
level: u8,
new_key: &[u8],
new_values: &RoaringBitmap,
facet_value: &[u8],
docids: &RoaringBitmap,
) -> Result<InsertionResult> {
if level == 0 {
return self.insert_in_level_0(txn, field_id, new_key, new_values);
return self.insert_in_level_0(txn, field_id, facet_value, docids);
}
let max_group_size = self.max_group_size;
let (insertion_key, insertion_value) =
self.find_insertion_key_value(field_id, level, new_key, txn)?;
let result = self.insert_in_level(txn, field_id, level - 1, new_key.clone(), new_values)?;
let result = self.insert_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?;
// level below inserted an element
let insertion_key = {
let mut new_insertion_key = insertion_key.clone();
let mut modified = false;
if new_key < insertion_key.left_bound.as_slice() {
new_insertion_key.left_bound = new_key.to_vec();
modified = true;
}
if modified {
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
assert!(is_deleted);
self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
}
new_insertion_key
};
let (insertion_key, insertion_value) =
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
match result {
// TODO: this could go above the block recomputing insertion key
// because we know that if we inserted in place, the key is not a new one
// thus it doesn't extend a group
// because we know that we inserted in place, the facet_value is not a new one
// thus it doesn't extend a group, and thus the insertion key computed above is
// still correct
InsertionResult::InPlace => {
let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
updated_value.bitmap |= new_values;
let mut updated_value = insertion_value;
updated_value.bitmap |= docids;
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
return Ok(InsertionResult::InPlace);
}
InsertionResult::Insert => {}
}
let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
// Here we know that inserting the facet value in the level below resulted in the creation
// of a new key. Therefore, it may be the case that we need to modify the left bound of the
// insertion key (see documentation of `find_insertion_key_value` for an example of when that
// could happen).
let insertion_key = {
let mut new_insertion_key = insertion_key.clone();
let mut key_should_be_modified = false;
if facet_value < insertion_key.left_bound.as_slice() {
new_insertion_key.left_bound = facet_value.to_vec();
key_should_be_modified = true;
}
if key_should_be_modified {
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
assert!(is_deleted);
self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
}
new_insertion_key
};
// Now we know that the insertion key contains the `facet_value`.
// We still need to update the insertion value by:
// 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`)
// 2. Merge the previous docids with the new one
let mut updated_value = insertion_value;
updated_value.size += 1;
if updated_value.size == max_group_size {
let size_left = max_group_size / 2;
let size_right = max_group_size - size_left;
let level_below = level - 1;
if updated_value.size < max_group_size {
updated_value.bitmap |= docids;
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
let (start_key, _) = self
.db
.get_greater_than_or_equal_to(
&txn,
&FacetGroupKey {
field_id,
level: level_below,
left_bound: insertion_key.left_bound.as_slice(),
},
)?
.unwrap();
let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize);
let group_left = {
let mut values_left = RoaringBitmap::new();
let mut i = 0;
while let Some(next) = iter.next() {
let (_key, value) = next?;
i += 1;
values_left |= &value.bitmap;
if i == size_left {
break;
}
}
let key =
FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() };
let value = FacetGroupValue { size: size_left as u8, bitmap: values_left };
(key, value)
};
let group_right = {
let mut values_right = RoaringBitmap::new();
let mut right_start_key = None;
while let Some(next) = iter.next() {
let (key, value) = next?;
if right_start_key.is_none() {
right_start_key = Some(key.left_bound);
}
values_right |= &value.bitmap;
}
let key = FacetGroupKey {
field_id,
level,
left_bound: right_start_key.unwrap().to_vec(),
};
let value = FacetGroupValue { size: size_right as u8, bitmap: values_right };
(key, value)
};
drop(iter);
let _ = self.db.delete(txn, &insertion_key.as_ref())?;
self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?;
self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;
Ok(InsertionResult::Insert)
} else {
let mut value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
value.bitmap |= new_values;
value.size += 1;
self.db.put(txn, &insertion_key.as_ref(), &value).unwrap();
Ok(InsertionResult::InPlace)
return Ok(InsertionResult::InPlace);
}
// We've increased the group size of the value and realised it has become greater than or equal to `max_group_size`
// Therefore it must be split into two nodes.
let size_left = max_group_size / 2;
let size_right = max_group_size - size_left;
let level_below = level - 1;
let start_key = FacetGroupKey {
field_id,
level: level_below,
left_bound: insertion_key.left_bound.as_slice(),
};
let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize);
let group_left = {
let mut values_left = RoaringBitmap::new();
let mut i = 0;
while let Some(next) = iter.next() {
let (_key, value) = next?;
i += 1;
values_left |= &value.bitmap;
if i == size_left {
break;
}
}
let key =
FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() };
let value = FacetGroupValue { size: size_left as u8, bitmap: values_left };
(key, value)
};
let group_right = {
let (
FacetGroupKey { left_bound: right_left_bound, .. },
FacetGroupValue { bitmap: mut values_right, .. },
) = iter.next().unwrap()?;
while let Some(next) = iter.next() {
let (_, value) = next?;
values_right |= &value.bitmap;
}
let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() };
let value = FacetGroupValue { size: size_right as u8, bitmap: values_right };
(key, value)
};
drop(iter);
let _ = self.db.delete(txn, &insertion_key.as_ref())?;
self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?;
self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;
Ok(InsertionResult::Insert)
}
/// Insert the given facet value and corresponding document ids in the database.
pub fn insert<'a, 't>(
&self,
txn: &'t mut RwTxn,
field_id: u16,
new_key: &[u8],
new_values: &RoaringBitmap,
facet_value: &[u8],
docids: &RoaringBitmap,
) -> Result<()> {
if new_values.is_empty() {
if docids.is_empty() {
return Ok(());
}
let group_size = self.group_size;
@ -330,12 +363,15 @@ impl FacetsUpdateIncrementalInner {
let highest_level = get_highest_level(&txn, self.db, field_id)?;
let result =
self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?;
self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?;
match result {
InsertionResult::InPlace => return Ok(()),
InsertionResult::Insert => {}
}
// Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`.
// If it has, we must build an additional level above it.
let mut highest_level_prefix = vec![];
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
highest_level_prefix.push(highest_level);
@ -384,36 +420,61 @@ impl FacetsUpdateIncrementalInner {
Ok(())
}
/// Delete the given document id from the given facet value in the database, from level 0 to the
/// given level.
///
/// ## Return
/// Returns the effect of removing the document id from the database on the given `level`.
///
/// - `DeletionResult::InPlace` means that deleting the document id did not have
/// an effect on the keys in that level.
///
/// - `DeletionResult::Remove` means that deleting the document id resulted in a change in the
/// number of keys in the level. For example, removing a document id from the facet value `3` could
/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted
/// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must
/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well.
///
/// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the
/// bounds of the keys of the level. For example, removing a document id from the facet value
/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore,
/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
/// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust
/// its left bound as well.
fn delete_in_level<'t>(
&self,
txn: &'t mut RwTxn,
field_id: u16,
level: u8,
key: &[u8],
value: u32,
facet_value: &[u8],
docid: u32,
) -> Result<DeletionResult> {
if level == 0 {
return self.delete_in_level_0(txn, field_id, key, value);
return self.delete_in_level_0(txn, field_id, facet_value, docid);
}
let (deletion_key, mut bitmap) =
self.find_insertion_key_value(field_id, level, key, txn)?;
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
let result = self.delete_in_level(txn, field_id, level - 1, key.clone(), value)?;
let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docid)?;
let mut decrease_size = false;
let (prev_key, next_key) = match result {
let next_key = match result {
DeletionResult::InPlace => {
bitmap.bitmap.remove(value);
bitmap.bitmap.remove(docid);
self.db.put(txn, &deletion_key.as_ref(), &bitmap)?;
return Ok(DeletionResult::InPlace);
}
DeletionResult::Reduce { prev, next } => (prev, next),
DeletionResult::Remove { prev, next } => {
DeletionResult::Reduce { next } => next,
DeletionResult::Remove { next } => {
decrease_size = true;
(prev, next)
next
}
};
// If either DeletionResult::Reduce or DeletionResult::Remove was returned,
// then we may need to adjust the left_bound of the deletion key.
// If DeletionResult::Remove was returned, then we need to decrease the group
// size of the deletion key.
let mut updated_value = bitmap;
if decrease_size {
updated_value.size -= 1;
@ -421,17 +482,21 @@ impl FacetsUpdateIncrementalInner {
if updated_value.size == 0 {
self.db.delete(txn, &deletion_key.as_ref())?;
Ok(DeletionResult::Remove { prev: prev_key, next: next_key })
Ok(DeletionResult::Remove { next: next_key })
} else {
let mut updated_deletion_key = deletion_key.clone();
if key == deletion_key.left_bound {
let reduced_range = facet_value == deletion_key.left_bound;
if reduced_range {
updated_deletion_key.left_bound = next_key.clone().unwrap();
}
updated_value.bitmap.remove(value);
updated_value.bitmap.remove(docid);
let _ = self.db.delete(txn, &deletion_key.as_ref())?;
self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?;
Ok(DeletionResult::Reduce { prev: prev_key, next: next_key })
if reduced_range {
Ok(DeletionResult::Reduce { next: next_key })
} else {
Ok(DeletionResult::InPlace)
}
}
}
@ -439,27 +504,24 @@ impl FacetsUpdateIncrementalInner {
&self,
txn: &'t mut RwTxn,
field_id: u16,
key: &[u8],
value: u32,
facet_value: &[u8],
docid: u32,
) -> Result<DeletionResult> {
let key = FacetGroupKey { field_id, level: 0, left_bound: key };
let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap;
bitmap.remove(value);
bitmap.remove(docid);
if bitmap.is_empty() {
let mut prev_key = None;
let mut next_key = None;
if let Some(prev) = self.db.get_lower_than(&txn, &key)? {
prev_key = Some(prev.0.left_bound.to_vec());
}
if let Some(next) = self.db.get_greater_than(&txn, &key)? {
if next.0.level == 0 {
next_key = Some(next.0.left_bound.to_vec());
if let Some((next, _)) =
self.db.remap_data_type::<DecodeIgnore>().get_greater_than(&txn, &key)?
{
if next.field_id == field_id && next.level == 0 {
next_key = Some(next.left_bound.to_vec());
}
}
self.db.delete(txn, &key)?;
Ok(DeletionResult::Remove { prev: prev_key, next: next_key })
Ok(DeletionResult::Remove { next: next_key })
} else {
self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?;
Ok(DeletionResult::InPlace)
@ -470,22 +532,30 @@ impl FacetsUpdateIncrementalInner {
&self,
txn: &'t mut RwTxn,
field_id: u16,
key: &[u8],
value: u32,
facet_value: &[u8],
docid: u32,
) -> Result<()> {
if self.db.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: key })?.is_none() {
if self
.db
.remap_data_type::<DecodeIgnore>()
.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })?
.is_none()
{
return Ok(());
}
let highest_level = get_highest_level(&txn, self.db, field_id)?;
// let key_bytes = BoundCodec::bytes_encode(&key).unwrap();
let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?;
let result =
self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docid)?;
match result {
DeletionResult::InPlace => return Ok(()),
DeletionResult::Reduce { .. } => {}
DeletionResult::Reduce { .. } => return Ok(()),
DeletionResult::Remove { .. } => {}
}
// If we removed a key from the highest level, its size may have fallen
// below `min_level_size`, in which case we need to remove the entire level
let mut highest_level_prefix = vec![];
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
highest_level_prefix.push(highest_level);
@ -521,6 +591,26 @@ impl FacetsUpdateIncrementalInner {
}
}
impl<'a> FacetGroupKey<&'a [u8]> {
    /// Converts a key that borrows its left bound into a key that owns it.
    ///
    /// The `field_id` and `level` are carried over unchanged, while the bytes of
    /// `left_bound` are copied into a newly allocated `Vec<u8>`.
    pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> {
        let Self { field_id, level, left_bound } = self;
        FacetGroupKey { field_id, level, left_bound: left_bound.to_vec() }
    }
}
impl FacetGroupKey<Vec<u8>> {
    /// Borrows this owned key as a `FacetGroupKey<&[u8]>`.
    ///
    /// The `field_id` and `level` are copied, and `left_bound` is exposed as a
    /// slice into the bytes owned by `self`, so no allocation takes place.
    pub fn as_ref(&self) -> FacetGroupKey<&[u8]> {
        FacetGroupKey {
            field_id: self.field_id,
            level: self.level,
            left_bound: self.left_bound.as_slice(),
        }
    }
}
#[cfg(test)]
mod tests {
use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec};

View File

@ -1,3 +1,79 @@
/*!
This module implements two different algorithms for updating the `facet_id_string_docids`
and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that
it recreates the database from scratch when new elements are added to it. The second algorithm
is incremental: it modifies the database as little as possible.
The databases must be able to return results for queries such as:
1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y
2. Min/Max : find the minimum/maximum facet value among these document ids
3. Sort : sort these document ids by increasing/decreasing facet values
4. Distribution : given some document ids, make a list of each facet value
found in these documents along with the number of documents that contain it
The algorithms that implement these queries are found in the `src/search/facet` folder.
To make these queries fast to compute, the database adopts a tree structure:
```ignore
"ab" (2) "gaf" (2) "woz" (1)
Level 2
[a, b, d, f, z] [c, d, e, f, g] [u, y]
"ab" (2) "ba" (2) "gaf" (2) "form" (2) "woz" (2)
Level 1
[a, b, d, z] [a, b, f] [c, d, g] [e, f] [u, y]
"ab" "ac" "ba" "bac" "gaf" "gal" "form" "wow" "woz" "zz"
Level 0
[a, b] [d, z] [b, f] [a, f] [c, d] [g] [e] [e, f] [y] [u]
```
In the diagram above, each cell corresponds to a node in the tree. The first line of the cell
contains the left bound of the range of facet values as well as the number of children of the node.
The second line contains the document ids which have a facet value within the range of the node.
The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range.
In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because
`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`.
These documents all contain a facet value that is contained within `ab .. gaf`.
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
[`FacetGroupValue`], which have the following format:
```ignore
FacetGroupKey:
- field id : u16
- level : u8
- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str
FacetGroupValue:
- #children : u8
- docids : RoaringBitmap
```
When the database is first created using the "bulk" method, each node has a fixed number of children
(except for possibly the last one) given by the `group_size` parameter (defaults to `FACET_GROUP_SIZE`).
The tree is also built such that the highest level has more than `min_level_size`
(defaults to `FACET_MIN_LEVEL_SIZE`) elements in it.
When the database is incrementally updated, the number of children of a node can vary between
1 and `max_group_size`. This is done so that most incremental operations do not need to change
the structure of the tree. When the number of children of a node reaches `max_group_size`,
we split the node in two and update the number of children of its parent.
When adding documents to the databases, it is important to determine which method to use to
minimise indexing time. The incremental method is faster when adding few new facet values, but the
bulk method is faster when a large part of the database is modified. Empirically, it seems that
it takes 50x more time to incrementally add N facet values to an existing database than it does to
construct a database of N facet values. This is the heuristic that is used to choose between the
two methods.
*/
/// Default `max_group_size`: during incremental updates, a node whose number
/// of children reaches this value is split in two.
pub const FACET_MAX_GROUP_SIZE: u8 = 8;
/// Default `group_size`: the number of children given to each node (except
/// possibly the last one) when the database is built with the bulk method.
pub const FACET_GROUP_SIZE: u8 = 4;
/// Default `min_level_size`: the bulk method builds the tree such that its
/// highest level has more than this number of elements.
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use self::incremental::FacetsUpdateIncremental;
use super::FacetsUpdateBulk;
use crate::facet::FacetType;
@ -13,8 +89,8 @@ pub struct FacetsUpdate<'i> {
database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
facet_type: FacetType,
new_data: grenad::Reader<File>,
level_group_size: u8,
max_level_group_size: u8,
group_size: u8,
max_group_size: u8,
min_level_size: u8,
}
impl<'i> FacetsUpdate<'i> {
@ -30,57 +106,24 @@ impl<'i> FacetsUpdate<'i> {
Self {
index,
database,
level_group_size: 4,
max_level_group_size: 8,
min_level_size: 5,
group_size: FACET_GROUP_SIZE,
max_group_size: FACET_MAX_GROUP_SIZE,
min_level_size: FACET_MIN_LEVEL_SIZE,
facet_type,
new_data,
}
}
// TODO: use the options below?
// but I don't actually see why they should be configurable
// /// The minimum number of elements that a level is allowed to have.
// pub fn level_max_group_size(mut self, value: u8) -> Self {
// self.max_level_group_size = std::cmp::max(value, 4);
// self
// }
// /// The number of elements from the level below that are represented by a single element in the level above
// ///
// /// This setting is always greater than or equal to 2.
// pub fn level_group_size(mut self, value: u8) -> Self {
// self.level_group_size = std::cmp::max(value, 2);
// self
// }
// /// The minimum number of elements that a level is allowed to have.
// pub fn min_level_size(mut self, value: u8) -> Self {
// self.min_level_size = std::cmp::max(value, 2);
// self
// }
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
if self.new_data.is_empty() {
return Ok(());
}
// here, come up with a better condition!
// ideally we'd choose which method to use for each field id individually
// but I don't think it's worth the effort yet
// As a first requirement, we ask that the length of the new data is less
// than a 1/50th of the length of the database in order to use the incremental
// method.
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data)
.level_group_size(self.level_group_size)
.min_level_size(self.min_level_size);
let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size);
bulk_update.execute(wtxn)?;
} else {
let incremental_update =
FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data)
.group_size(self.level_group_size)
.max_group_size(self.max_level_group_size)
.min_level_size(self.min_level_size);
FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size, self.max_group_size);
incremental_update.execute(wtxn)?;
}
Ok(())
@ -346,7 +389,7 @@ mod comparison_bench {
// of the incremental vs. bulk indexer.
// It appears that the incremental indexer is about 50 times slower than the
// bulk indexer.
#[test]
// #[test]
fn benchmark_facet_indexing() {
// then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it

View File

@ -7,7 +7,7 @@ mod typed_chunk;
use std::collections::HashSet;
use std::io::{Cursor, Read, Seek};
use std::iter::FromIterator;
use std::num::{NonZeroU32, NonZeroUsize};
use std::num::NonZeroU32;
use std::result::Result as StdResult;
use crossbeam_channel::{Receiver, Sender};
@ -82,8 +82,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, F> {
#[derive(Default, Debug, Clone)]
pub struct IndexDocumentsConfig {
pub facet_level_group_size: Option<NonZeroUsize>,
pub facet_min_level_size: Option<NonZeroUsize>,
pub words_prefix_threshold: Option<u32>,
pub max_prefix_length: Option<usize>,
pub words_positions_level_group_size: Option<NonZeroU32>,