mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Document and refine facet indexing algorithms
This commit is contained in:
parent
bee3c23b45
commit
27454e9828
@ -29,31 +29,14 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
|
||||||
|
/// databases.
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
pub struct FacetGroupKey<T> {
|
pub struct FacetGroupKey<T> {
|
||||||
pub field_id: u16,
|
pub field_id: u16,
|
||||||
pub level: u8,
|
pub level: u8,
|
||||||
pub left_bound: T,
|
pub left_bound: T,
|
||||||
}
|
}
|
||||||
impl<'a> FacetGroupKey<&'a [u8]> {
|
|
||||||
pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> {
|
|
||||||
FacetGroupKey {
|
|
||||||
field_id: self.field_id,
|
|
||||||
level: self.level,
|
|
||||||
left_bound: self.left_bound.to_vec(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> FacetGroupKey<Vec<u8>> {
|
|
||||||
pub fn as_ref(&self) -> FacetGroupKey<&[u8]> {
|
|
||||||
FacetGroupKey {
|
|
||||||
field_id: self.field_id,
|
|
||||||
level: self.level,
|
|
||||||
left_bound: self.left_bound.as_slice(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct FacetGroupValue {
|
pub struct FacetGroupValue {
|
||||||
|
@ -1,24 +1,30 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::cmp;
|
|
||||||
use std::fs::File;
|
|
||||||
|
|
||||||
use grenad::CompressionType;
|
|
||||||
use heed::types::ByteSlice;
|
|
||||||
use heed::{BytesEncode, Error, RoTxn, RwTxn};
|
|
||||||
use log::debug;
|
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
use time::OffsetDateTime;
|
|
||||||
|
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{
|
||||||
ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||||
};
|
};
|
||||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||||
|
use grenad::CompressionType;
|
||||||
|
use heed::types::ByteSlice;
|
||||||
|
use heed::{BytesEncode, Error, RoTxn, RwTxn};
|
||||||
|
use log::debug;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::fs::File;
|
||||||
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
|
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||||
|
|
||||||
|
/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
|
||||||
|
/// by rebuilding the database "from scratch".
|
||||||
|
///
|
||||||
|
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
||||||
|
/// higher levels are cleared and recomputed from the content of level 0.
|
||||||
|
///
|
||||||
|
/// Finally, the `faceted_documents_ids` value in the main database of `Index`
|
||||||
|
/// is updated to contain the new set of faceted documents.
|
||||||
pub struct FacetsUpdateBulk<'i> {
|
pub struct FacetsUpdateBulk<'i> {
|
||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
|
|
||||||
group_size: u8,
|
group_size: u8,
|
||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
@ -31,22 +37,10 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<File>,
|
new_data: grenad::Reader<File>,
|
||||||
|
group_size: u8,
|
||||||
|
min_level_size: u8,
|
||||||
) -> FacetsUpdateBulk<'i> {
|
) -> FacetsUpdateBulk<'i> {
|
||||||
FacetsUpdateBulk {
|
FacetsUpdateBulk { index, group_size, min_level_size, facet_type, new_data: Some(new_data) }
|
||||||
index,
|
|
||||||
database: match facet_type {
|
|
||||||
FacetType::String => index
|
|
||||||
.facet_id_string_docids
|
|
||||||
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
|
|
||||||
FacetType::Number => {
|
|
||||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
|
|
||||||
}
|
|
||||||
},
|
|
||||||
group_size: 4,
|
|
||||||
min_level_size: 5,
|
|
||||||
facet_type,
|
|
||||||
new_data: Some(new_data),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new_not_updating_level_0(
|
pub fn new_not_updating_level_0(
|
||||||
@ -55,44 +49,31 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
) -> FacetsUpdateBulk<'i> {
|
) -> FacetsUpdateBulk<'i> {
|
||||||
FacetsUpdateBulk {
|
FacetsUpdateBulk {
|
||||||
index,
|
index,
|
||||||
database: match facet_type {
|
group_size: FACET_GROUP_SIZE,
|
||||||
FacetType::String => index
|
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||||
.facet_id_string_docids
|
|
||||||
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
|
|
||||||
FacetType::Number => {
|
|
||||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
|
|
||||||
}
|
|
||||||
},
|
|
||||||
group_size: 4,
|
|
||||||
min_level_size: 5,
|
|
||||||
facet_type,
|
facet_type,
|
||||||
new_data: None,
|
new_data: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The number of elements from the level below that are represented by a single element in the level above
|
|
||||||
///
|
|
||||||
/// This setting is always greater than or equal to 2.
|
|
||||||
pub fn level_group_size(mut self, value: u8) -> Self {
|
|
||||||
self.group_size = cmp::max(value, 2);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The minimum number of elements that a level is allowed to have.
|
|
||||||
pub fn min_level_size(mut self, value: u8) -> Self {
|
|
||||||
self.min_level_size = cmp::max(value, 2);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
||||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||||
|
|
||||||
let Self { index, database, group_size, min_level_size, facet_type, new_data } = self;
|
let Self { index, group_size, min_level_size, facet_type, new_data } = self;
|
||||||
|
|
||||||
|
let db = match facet_type {
|
||||||
|
FacetType::String => {
|
||||||
|
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
|
||||||
|
}
|
||||||
|
FacetType::Number => {
|
||||||
|
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||||
|
|
||||||
let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size };
|
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
|
||||||
|
|
||||||
let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Box<[_]>>();
|
let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Box<[_]>>();
|
||||||
|
|
||||||
@ -105,6 +86,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||||
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
|
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
|
||||||
pub new_data: Option<grenad::Reader<R>>,
|
pub new_data: Option<grenad::Reader<R>>,
|
||||||
|
@ -1,16 +1,14 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::fs::File;
|
|
||||||
|
|
||||||
use heed::types::ByteSlice;
|
|
||||||
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{
|
||||||
ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||||
};
|
};
|
||||||
use crate::search::facet::get_highest_level;
|
use crate::search::facet::get_highest_level;
|
||||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||||
|
use heed::types::{ByteSlice, DecodeIgnore};
|
||||||
|
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs::File;
|
||||||
|
|
||||||
enum InsertionResult {
|
enum InsertionResult {
|
||||||
InPlace,
|
InPlace,
|
||||||
@ -18,10 +16,15 @@ enum InsertionResult {
|
|||||||
}
|
}
|
||||||
enum DeletionResult {
|
enum DeletionResult {
|
||||||
InPlace,
|
InPlace,
|
||||||
Reduce { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
|
Reduce { next: Option<Vec<u8>> },
|
||||||
Remove { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
|
Remove { next: Option<Vec<u8>> },
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Algorithm to incrementally insert and delete elements into the
|
||||||
|
/// `facet_id_(string/f64)_docids` databases.
|
||||||
|
///
|
||||||
|
/// The `faceted_documents_ids` value in the main database of `Index`
|
||||||
|
/// is also updated to contain the new set of faceted documents.
|
||||||
pub struct FacetsUpdateIncremental<'i> {
|
pub struct FacetsUpdateIncremental<'i> {
|
||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
inner: FacetsUpdateIncrementalInner,
|
inner: FacetsUpdateIncrementalInner,
|
||||||
@ -30,7 +33,14 @@ pub struct FacetsUpdateIncremental<'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'i> FacetsUpdateIncremental<'i> {
|
impl<'i> FacetsUpdateIncremental<'i> {
|
||||||
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
|
pub fn new(
|
||||||
|
index: &'i Index,
|
||||||
|
facet_type: FacetType,
|
||||||
|
new_data: grenad::Reader<File>,
|
||||||
|
group_size: u8,
|
||||||
|
min_level_size: u8,
|
||||||
|
max_group_size: u8,
|
||||||
|
) -> Self {
|
||||||
FacetsUpdateIncremental {
|
FacetsUpdateIncremental {
|
||||||
index,
|
index,
|
||||||
inner: FacetsUpdateIncrementalInner {
|
inner: FacetsUpdateIncrementalInner {
|
||||||
@ -42,26 +52,15 @@ impl<'i> FacetsUpdateIncremental<'i> {
|
|||||||
.facet_id_f64_docids
|
.facet_id_f64_docids
|
||||||
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
|
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
|
||||||
},
|
},
|
||||||
group_size: 4,
|
group_size,
|
||||||
max_group_size: 8,
|
max_group_size,
|
||||||
min_level_size: 5,
|
min_level_size,
|
||||||
},
|
},
|
||||||
facet_type,
|
facet_type,
|
||||||
new_data,
|
new_data,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn group_size(mut self, size: u8) -> Self {
|
|
||||||
self.inner.group_size = size;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
pub fn min_level_size(mut self, size: u8) -> Self {
|
|
||||||
self.inner.min_level_size = size;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
pub fn max_group_size(mut self, size: u8) -> Self {
|
|
||||||
self.inner.max_group_size = size;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
|
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
|
||||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||||
|
|
||||||
@ -83,6 +82,7 @@ impl<'i> FacetsUpdateIncremental<'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type
|
||||||
pub struct FacetsUpdateIncrementalInner {
|
pub struct FacetsUpdateIncrementalInner {
|
||||||
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
|
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
|
||||||
pub group_size: u8,
|
pub group_size: u8,
|
||||||
@ -90,22 +90,36 @@ pub struct FacetsUpdateIncrementalInner {
|
|||||||
pub max_group_size: u8,
|
pub max_group_size: u8,
|
||||||
}
|
}
|
||||||
impl FacetsUpdateIncrementalInner {
|
impl FacetsUpdateIncrementalInner {
|
||||||
|
/// Find the `FacetGroupKey`/`FacetGroupValue` in the database that
|
||||||
|
/// should be used to insert the new `facet_value` for the given `field_id` and `level`
|
||||||
|
/// where `level` must be strictly greater than 0.
|
||||||
|
///
|
||||||
|
/// For example, when inserting the facet value `4`, there are two possibilities:
|
||||||
|
///
|
||||||
|
/// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore,
|
||||||
|
/// we know that the implicit range of the first key is 3..6, which contains 4.
|
||||||
|
/// So the new facet value belongs in that first key/value pair.
|
||||||
|
///
|
||||||
|
/// 2. The first key of the level has a lower bound of `5`. We return this key/value pair
|
||||||
|
/// but will need to change the lower bound of this key to `4` in order to insert this facet value.
|
||||||
fn find_insertion_key_value(
|
fn find_insertion_key_value(
|
||||||
&self,
|
&self,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
level: u8,
|
level: u8,
|
||||||
search_key: &[u8],
|
facet_value: &[u8],
|
||||||
txn: &RoTxn,
|
txn: &RoTxn,
|
||||||
) -> Result<(FacetGroupKey<Vec<u8>>, FacetGroupValue)> {
|
) -> Result<(FacetGroupKey<Vec<u8>>, FacetGroupValue)> {
|
||||||
|
assert!(level > 0);
|
||||||
|
|
||||||
let mut prefix = vec![];
|
let mut prefix = vec![];
|
||||||
prefix.extend_from_slice(&field_id.to_be_bytes());
|
prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||||
prefix.push(level);
|
prefix.push(level);
|
||||||
prefix.extend_from_slice(search_key);
|
prefix.extend_from_slice(facet_value);
|
||||||
|
|
||||||
let mut prefix_iter = self
|
let mut prefix_iter = self
|
||||||
.db
|
.db
|
||||||
.as_polymorph()
|
.as_polymorph()
|
||||||
.prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(txn, &prefix.as_slice())?;
|
.prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?;
|
||||||
if let Some(e) = prefix_iter.next() {
|
if let Some(e) = prefix_iter.next() {
|
||||||
let (key_bytes, value) = e?;
|
let (key_bytes, value) = e?;
|
||||||
Ok((
|
Ok((
|
||||||
@ -115,10 +129,10 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
value,
|
value,
|
||||||
))
|
))
|
||||||
} else {
|
} else {
|
||||||
let key = FacetGroupKey { field_id, level, left_bound: search_key };
|
let key = FacetGroupKey { field_id, level, left_bound: facet_value };
|
||||||
match self.db.get_lower_than(txn, &key)? {
|
match self.db.get_lower_than(txn, &key)? {
|
||||||
Some((key, value)) => {
|
Some((key, value)) => {
|
||||||
if key.level != level || key.field_id != field_id {
|
if key.level != level {
|
||||||
let mut prefix = vec![];
|
let mut prefix = vec![];
|
||||||
prefix.extend_from_slice(&field_id.to_be_bytes());
|
prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||||
prefix.push(level);
|
prefix.push(level);
|
||||||
@ -126,7 +140,7 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
let mut iter = self
|
let mut iter = self
|
||||||
.db
|
.db
|
||||||
.as_polymorph()
|
.as_polymorph()
|
||||||
.prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(
|
.prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(
|
||||||
txn,
|
txn,
|
||||||
&prefix.as_slice(),
|
&prefix.as_slice(),
|
||||||
)?;
|
)?;
|
||||||
@ -146,15 +160,19 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Insert the given facet value and corresponding document ids in the level 0 of the database
|
||||||
|
///
|
||||||
|
/// ## Return
|
||||||
|
/// See documentation of `insert_in_level`
|
||||||
fn insert_in_level_0<'t>(
|
fn insert_in_level_0<'t>(
|
||||||
&self,
|
&self,
|
||||||
txn: &'t mut RwTxn,
|
txn: &'t mut RwTxn,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
new_key: &[u8],
|
facet_value: &[u8],
|
||||||
new_values: &RoaringBitmap,
|
docids: &RoaringBitmap,
|
||||||
) -> Result<InsertionResult> {
|
) -> Result<InsertionResult> {
|
||||||
let key = FacetGroupKey { field_id, level: 0, left_bound: new_key };
|
let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
|
||||||
let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 };
|
let value = FacetGroupValue { bitmap: docids.clone(), size: 1 };
|
||||||
|
|
||||||
let mut level0_prefix = vec![];
|
let mut level0_prefix = vec![];
|
||||||
level0_prefix.extend_from_slice(&field_id.to_be_bytes());
|
level0_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||||
@ -163,7 +181,7 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
let mut iter = self
|
let mut iter = self
|
||||||
.db
|
.db
|
||||||
.as_polymorph()
|
.as_polymorph()
|
||||||
.prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level0_prefix)?;
|
.prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, &level0_prefix)?;
|
||||||
|
|
||||||
if iter.next().is_none() {
|
if iter.next().is_none() {
|
||||||
drop(iter);
|
drop(iter);
|
||||||
@ -186,75 +204,102 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`.
|
||||||
|
/// This function works recursively.
|
||||||
|
///
|
||||||
|
/// ## Return
|
||||||
|
/// Returns the effect of adding the facet value to the database on the given `level`.
|
||||||
|
///
|
||||||
|
/// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have
|
||||||
|
/// an effect on the number of keys in that level. Therefore, it did not increase the number of children
|
||||||
|
/// of the parent node.
|
||||||
|
///
|
||||||
|
/// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted
|
||||||
|
/// in the addition of a new key in that level, and that therefore the number of children
|
||||||
|
/// of the parent node should be incremented.
|
||||||
fn insert_in_level<'t>(
|
fn insert_in_level<'t>(
|
||||||
&self,
|
&self,
|
||||||
txn: &'t mut RwTxn,
|
txn: &'t mut RwTxn,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
level: u8,
|
level: u8,
|
||||||
new_key: &[u8],
|
facet_value: &[u8],
|
||||||
new_values: &RoaringBitmap,
|
docids: &RoaringBitmap,
|
||||||
) -> Result<InsertionResult> {
|
) -> Result<InsertionResult> {
|
||||||
if level == 0 {
|
if level == 0 {
|
||||||
return self.insert_in_level_0(txn, field_id, new_key, new_values);
|
return self.insert_in_level_0(txn, field_id, facet_value, docids);
|
||||||
}
|
}
|
||||||
|
|
||||||
let max_group_size = self.max_group_size;
|
let max_group_size = self.max_group_size;
|
||||||
|
|
||||||
let (insertion_key, insertion_value) =
|
let result = self.insert_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?;
|
||||||
self.find_insertion_key_value(field_id, level, new_key, txn)?;
|
|
||||||
|
|
||||||
let result = self.insert_in_level(txn, field_id, level - 1, new_key.clone(), new_values)?;
|
|
||||||
// level below inserted an element
|
// level below inserted an element
|
||||||
|
|
||||||
let insertion_key = {
|
let (insertion_key, insertion_value) =
|
||||||
let mut new_insertion_key = insertion_key.clone();
|
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
|
||||||
let mut modified = false;
|
|
||||||
|
|
||||||
if new_key < insertion_key.left_bound.as_slice() {
|
|
||||||
new_insertion_key.left_bound = new_key.to_vec();
|
|
||||||
modified = true;
|
|
||||||
}
|
|
||||||
if modified {
|
|
||||||
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
|
|
||||||
assert!(is_deleted);
|
|
||||||
self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
|
|
||||||
}
|
|
||||||
new_insertion_key
|
|
||||||
};
|
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
// TODO: this could go above the block recomputing insertion key
|
// because we know that we inserted in place, the facet_value is not a new one
|
||||||
// because we know that if we inserted in place, the key is not a new one
|
// thus it doesn't extend a group, and thus the insertion key computed above is
|
||||||
// thus it doesn't extend a group
|
// still correct
|
||||||
InsertionResult::InPlace => {
|
InsertionResult::InPlace => {
|
||||||
let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
|
let mut updated_value = insertion_value;
|
||||||
updated_value.bitmap |= new_values;
|
updated_value.bitmap |= docids;
|
||||||
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
|
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
|
||||||
|
|
||||||
return Ok(InsertionResult::InPlace);
|
return Ok(InsertionResult::InPlace);
|
||||||
}
|
}
|
||||||
InsertionResult::Insert => {}
|
InsertionResult::Insert => {}
|
||||||
}
|
}
|
||||||
let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
|
|
||||||
|
// Here we know that inserting the facet value in the level below resulted in the creation
|
||||||
|
// of a new key. Therefore, it may be the case that we need to modify the left bound of the
|
||||||
|
// insertion key (see documentation of `find_insertion_key_value` for an example of when that
|
||||||
|
// could happen).
|
||||||
|
let insertion_key = {
|
||||||
|
let mut new_insertion_key = insertion_key.clone();
|
||||||
|
let mut key_should_be_modified = false;
|
||||||
|
|
||||||
|
if facet_value < insertion_key.left_bound.as_slice() {
|
||||||
|
new_insertion_key.left_bound = facet_value.to_vec();
|
||||||
|
key_should_be_modified = true;
|
||||||
|
}
|
||||||
|
if key_should_be_modified {
|
||||||
|
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
|
||||||
|
assert!(is_deleted);
|
||||||
|
self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
|
||||||
|
}
|
||||||
|
new_insertion_key
|
||||||
|
};
|
||||||
|
// Now we know that the insertion key contains the `facet_value`.
|
||||||
|
|
||||||
|
// We still need to update the insertion value by:
|
||||||
|
// 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`)
|
||||||
|
// 2. Merge the previous docids with the new one
|
||||||
|
let mut updated_value = insertion_value;
|
||||||
|
|
||||||
updated_value.size += 1;
|
updated_value.size += 1;
|
||||||
if updated_value.size == max_group_size {
|
|
||||||
|
if updated_value.size < max_group_size {
|
||||||
|
updated_value.bitmap |= docids;
|
||||||
|
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
|
||||||
|
|
||||||
|
return Ok(InsertionResult::InPlace);
|
||||||
|
}
|
||||||
|
|
||||||
|
// We've increased the group size of the value and realised it has become greater than or equal to `max_group_size`
|
||||||
|
// Therefore it must be split into two nodes.
|
||||||
|
|
||||||
let size_left = max_group_size / 2;
|
let size_left = max_group_size / 2;
|
||||||
let size_right = max_group_size - size_left;
|
let size_right = max_group_size - size_left;
|
||||||
|
|
||||||
let level_below = level - 1;
|
let level_below = level - 1;
|
||||||
|
|
||||||
let (start_key, _) = self
|
let start_key = FacetGroupKey {
|
||||||
.db
|
|
||||||
.get_greater_than_or_equal_to(
|
|
||||||
&txn,
|
|
||||||
&FacetGroupKey {
|
|
||||||
field_id,
|
field_id,
|
||||||
level: level_below,
|
level: level_below,
|
||||||
left_bound: insertion_key.left_bound.as_slice(),
|
left_bound: insertion_key.left_bound.as_slice(),
|
||||||
},
|
};
|
||||||
)?
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize);
|
let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize);
|
||||||
|
|
||||||
@ -278,22 +323,17 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let group_right = {
|
let group_right = {
|
||||||
let mut values_right = RoaringBitmap::new();
|
let (
|
||||||
let mut right_start_key = None;
|
FacetGroupKey { left_bound: right_left_bound, .. },
|
||||||
|
FacetGroupValue { bitmap: mut values_right, .. },
|
||||||
|
) = iter.next().unwrap()?;
|
||||||
|
|
||||||
while let Some(next) = iter.next() {
|
while let Some(next) = iter.next() {
|
||||||
let (key, value) = next?;
|
let (_, value) = next?;
|
||||||
if right_start_key.is_none() {
|
|
||||||
right_start_key = Some(key.left_bound);
|
|
||||||
}
|
|
||||||
values_right |= &value.bitmap;
|
values_right |= &value.bitmap;
|
||||||
}
|
}
|
||||||
|
|
||||||
let key = FacetGroupKey {
|
let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() };
|
||||||
field_id,
|
|
||||||
level,
|
|
||||||
left_bound: right_start_key.unwrap().to_vec(),
|
|
||||||
};
|
|
||||||
let value = FacetGroupValue { size: size_right as u8, bitmap: values_right };
|
let value = FacetGroupValue { size: size_right as u8, bitmap: values_right };
|
||||||
(key, value)
|
(key, value)
|
||||||
};
|
};
|
||||||
@ -305,24 +345,17 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;
|
self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;
|
||||||
|
|
||||||
Ok(InsertionResult::Insert)
|
Ok(InsertionResult::Insert)
|
||||||
} else {
|
|
||||||
let mut value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap();
|
|
||||||
value.bitmap |= new_values;
|
|
||||||
value.size += 1;
|
|
||||||
self.db.put(txn, &insertion_key.as_ref(), &value).unwrap();
|
|
||||||
|
|
||||||
Ok(InsertionResult::InPlace)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Insert the given facet value and corresponding document ids in the database.
|
||||||
pub fn insert<'a, 't>(
|
pub fn insert<'a, 't>(
|
||||||
&self,
|
&self,
|
||||||
txn: &'t mut RwTxn,
|
txn: &'t mut RwTxn,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
new_key: &[u8],
|
facet_value: &[u8],
|
||||||
new_values: &RoaringBitmap,
|
docids: &RoaringBitmap,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if new_values.is_empty() {
|
if docids.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
let group_size = self.group_size;
|
let group_size = self.group_size;
|
||||||
@ -330,12 +363,15 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
let highest_level = get_highest_level(&txn, self.db, field_id)?;
|
let highest_level = get_highest_level(&txn, self.db, field_id)?;
|
||||||
|
|
||||||
let result =
|
let result =
|
||||||
self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?;
|
self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?;
|
||||||
match result {
|
match result {
|
||||||
InsertionResult::InPlace => return Ok(()),
|
InsertionResult::InPlace => return Ok(()),
|
||||||
InsertionResult::Insert => {}
|
InsertionResult::Insert => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`.
|
||||||
|
// If it has, we must build an additional level above it.
|
||||||
|
|
||||||
let mut highest_level_prefix = vec![];
|
let mut highest_level_prefix = vec![];
|
||||||
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
|
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||||
highest_level_prefix.push(highest_level);
|
highest_level_prefix.push(highest_level);
|
||||||
@ -384,36 +420,61 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Delete the given document id from the given facet value in the database, from level 0 to the
|
||||||
|
/// given level.
|
||||||
|
///
|
||||||
|
/// ## Return
|
||||||
|
/// Returns the effect of removing the document id from the database on the given `level`.
|
||||||
|
///
|
||||||
|
/// - `DeletionResult::InPlace` means that deleting the document id did not have
|
||||||
|
/// an effect on the keys in that level.
|
||||||
|
///
|
||||||
|
/// - `DeletionResult::Remove` means that deleting the document id resulted in a change in the
|
||||||
|
/// number of keys in the level. For example, removing a document id from the facet value `3` could
|
||||||
|
/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted
|
||||||
|
/// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must
|
||||||
|
/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well.
|
||||||
|
///
|
||||||
|
/// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the
|
||||||
|
/// bounds of the keys of the level. For example, removing a document id from the facet value
|
||||||
|
/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore,
|
||||||
|
/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
|
||||||
|
/// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust
|
||||||
|
/// its left bound as well.
|
||||||
fn delete_in_level<'t>(
|
fn delete_in_level<'t>(
|
||||||
&self,
|
&self,
|
||||||
txn: &'t mut RwTxn,
|
txn: &'t mut RwTxn,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
level: u8,
|
level: u8,
|
||||||
key: &[u8],
|
facet_value: &[u8],
|
||||||
value: u32,
|
docid: u32,
|
||||||
) -> Result<DeletionResult> {
|
) -> Result<DeletionResult> {
|
||||||
if level == 0 {
|
if level == 0 {
|
||||||
return self.delete_in_level_0(txn, field_id, key, value);
|
return self.delete_in_level_0(txn, field_id, facet_value, docid);
|
||||||
}
|
}
|
||||||
let (deletion_key, mut bitmap) =
|
let (deletion_key, mut bitmap) =
|
||||||
self.find_insertion_key_value(field_id, level, key, txn)?;
|
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
|
||||||
|
|
||||||
let result = self.delete_in_level(txn, field_id, level - 1, key.clone(), value)?;
|
let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docid)?;
|
||||||
|
|
||||||
let mut decrease_size = false;
|
let mut decrease_size = false;
|
||||||
let (prev_key, next_key) = match result {
|
let next_key = match result {
|
||||||
DeletionResult::InPlace => {
|
DeletionResult::InPlace => {
|
||||||
bitmap.bitmap.remove(value);
|
bitmap.bitmap.remove(docid);
|
||||||
self.db.put(txn, &deletion_key.as_ref(), &bitmap)?;
|
self.db.put(txn, &deletion_key.as_ref(), &bitmap)?;
|
||||||
return Ok(DeletionResult::InPlace);
|
return Ok(DeletionResult::InPlace);
|
||||||
}
|
}
|
||||||
DeletionResult::Reduce { prev, next } => (prev, next),
|
DeletionResult::Reduce { next } => next,
|
||||||
DeletionResult::Remove { prev, next } => {
|
DeletionResult::Remove { next } => {
|
||||||
decrease_size = true;
|
decrease_size = true;
|
||||||
(prev, next)
|
next
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
// If either DeletionResult::Reduce or DeletionResult::Remove was returned,
|
||||||
|
// then we may need to adjust the left_bound of the deletion key.
|
||||||
|
|
||||||
|
// If DeletionResult::Remove was returned, then we need to decrease the group
|
||||||
|
// size of the deletion key.
|
||||||
let mut updated_value = bitmap;
|
let mut updated_value = bitmap;
|
||||||
if decrease_size {
|
if decrease_size {
|
||||||
updated_value.size -= 1;
|
updated_value.size -= 1;
|
||||||
@ -421,17 +482,21 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
|
|
||||||
if updated_value.size == 0 {
|
if updated_value.size == 0 {
|
||||||
self.db.delete(txn, &deletion_key.as_ref())?;
|
self.db.delete(txn, &deletion_key.as_ref())?;
|
||||||
Ok(DeletionResult::Remove { prev: prev_key, next: next_key })
|
Ok(DeletionResult::Remove { next: next_key })
|
||||||
} else {
|
} else {
|
||||||
let mut updated_deletion_key = deletion_key.clone();
|
let mut updated_deletion_key = deletion_key.clone();
|
||||||
if key == deletion_key.left_bound {
|
let reduced_range = facet_value == deletion_key.left_bound;
|
||||||
|
if reduced_range {
|
||||||
updated_deletion_key.left_bound = next_key.clone().unwrap();
|
updated_deletion_key.left_bound = next_key.clone().unwrap();
|
||||||
}
|
}
|
||||||
updated_value.bitmap.remove(value);
|
updated_value.bitmap.remove(docid);
|
||||||
let _ = self.db.delete(txn, &deletion_key.as_ref())?;
|
let _ = self.db.delete(txn, &deletion_key.as_ref())?;
|
||||||
self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?;
|
self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?;
|
||||||
|
if reduced_range {
|
||||||
Ok(DeletionResult::Reduce { prev: prev_key, next: next_key })
|
Ok(DeletionResult::Reduce { next: next_key })
|
||||||
|
} else {
|
||||||
|
Ok(DeletionResult::InPlace)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -439,27 +504,24 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
&self,
|
&self,
|
||||||
txn: &'t mut RwTxn,
|
txn: &'t mut RwTxn,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
key: &[u8],
|
facet_value: &[u8],
|
||||||
value: u32,
|
docid: u32,
|
||||||
) -> Result<DeletionResult> {
|
) -> Result<DeletionResult> {
|
||||||
let key = FacetGroupKey { field_id, level: 0, left_bound: key };
|
let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
|
||||||
let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap;
|
let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap;
|
||||||
bitmap.remove(value);
|
bitmap.remove(docid);
|
||||||
|
|
||||||
if bitmap.is_empty() {
|
if bitmap.is_empty() {
|
||||||
let mut prev_key = None;
|
|
||||||
let mut next_key = None;
|
let mut next_key = None;
|
||||||
|
if let Some((next, _)) =
|
||||||
if let Some(prev) = self.db.get_lower_than(&txn, &key)? {
|
self.db.remap_data_type::<DecodeIgnore>().get_greater_than(&txn, &key)?
|
||||||
prev_key = Some(prev.0.left_bound.to_vec());
|
{
|
||||||
}
|
if next.field_id == field_id && next.level == 0 {
|
||||||
if let Some(next) = self.db.get_greater_than(&txn, &key)? {
|
next_key = Some(next.left_bound.to_vec());
|
||||||
if next.0.level == 0 {
|
|
||||||
next_key = Some(next.0.left_bound.to_vec());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.db.delete(txn, &key)?;
|
self.db.delete(txn, &key)?;
|
||||||
Ok(DeletionResult::Remove { prev: prev_key, next: next_key })
|
Ok(DeletionResult::Remove { next: next_key })
|
||||||
} else {
|
} else {
|
||||||
self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?;
|
self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?;
|
||||||
Ok(DeletionResult::InPlace)
|
Ok(DeletionResult::InPlace)
|
||||||
@ -470,22 +532,30 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
&self,
|
&self,
|
||||||
txn: &'t mut RwTxn,
|
txn: &'t mut RwTxn,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
key: &[u8],
|
facet_value: &[u8],
|
||||||
value: u32,
|
docid: u32,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if self.db.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: key })?.is_none() {
|
if self
|
||||||
|
.db
|
||||||
|
.remap_data_type::<DecodeIgnore>()
|
||||||
|
.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })?
|
||||||
|
.is_none()
|
||||||
|
{
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
let highest_level = get_highest_level(&txn, self.db, field_id)?;
|
let highest_level = get_highest_level(&txn, self.db, field_id)?;
|
||||||
|
|
||||||
// let key_bytes = BoundCodec::bytes_encode(&key).unwrap();
|
let result =
|
||||||
|
self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docid)?;
|
||||||
let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?;
|
|
||||||
match result {
|
match result {
|
||||||
DeletionResult::InPlace => return Ok(()),
|
DeletionResult::InPlace => return Ok(()),
|
||||||
DeletionResult::Reduce { .. } => {}
|
DeletionResult::Reduce { .. } => return Ok(()),
|
||||||
DeletionResult::Remove { .. } => {}
|
DeletionResult::Remove { .. } => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if we removed a key from the highest level, its size may have fallen
|
||||||
|
// below `min_level_size`, in which case we need to remove the entire level
|
||||||
|
|
||||||
let mut highest_level_prefix = vec![];
|
let mut highest_level_prefix = vec![];
|
||||||
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
|
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||||
highest_level_prefix.push(highest_level);
|
highest_level_prefix.push(highest_level);
|
||||||
@ -521,6 +591,26 @@ impl FacetsUpdateIncrementalInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a> FacetGroupKey<&'a [u8]> {
|
||||||
|
pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> {
|
||||||
|
FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: self.level,
|
||||||
|
left_bound: self.left_bound.to_vec(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> FacetGroupKey<Vec<u8>> {
|
||||||
|
pub fn as_ref(&self) -> FacetGroupKey<&[u8]> {
|
||||||
|
FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: self.level,
|
||||||
|
left_bound: self.left_bound.as_slice(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec};
|
use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec};
|
||||||
|
@ -1,3 +1,79 @@
|
|||||||
|
/*!
|
||||||
|
This module implements two different algorithms for updating the `facet_id_string_docids`
|
||||||
|
and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that
|
||||||
|
it recreates the database from scratch when new elements are added to it. The second algorithm
|
||||||
|
is incremental: it modifies the database as little as possible.
|
||||||
|
|
||||||
|
The databases must be able to return results for queries such as:
|
||||||
|
1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y
|
||||||
|
2. Min/Max : find the minimum/maximum facet value among these document ids
|
||||||
|
3. Sort : sort these document ids by increasing/decreasing facet values
|
||||||
|
4. Distribution : given some document ids, make a list of each facet value
|
||||||
|
found in these documents along with the number of documents that contain it
|
||||||
|
|
||||||
|
The algorithms that implement these queries are found in the `src/search/facet` folder.
|
||||||
|
|
||||||
|
To make these queries fast to compute, the database adopts a tree structure:
|
||||||
|
```ignore
|
||||||
|
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
|
||||||
|
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
|
||||||
|
│Level 2│ │ │ │ │
|
||||||
|
└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │
|
||||||
|
├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤
|
||||||
|
┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │
|
||||||
|
│Level 1│ │ │ │ │ │ │
|
||||||
|
└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │
|
||||||
|
├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤
|
||||||
|
┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │
|
||||||
|
│Level 0│ │ │ │ │ │ │ │ │ │ │ │
|
||||||
|
└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │
|
||||||
|
└───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘
|
||||||
|
```
|
||||||
|
In the diagram above, each cell corresponds to a node in the tree. The first line of the cell
|
||||||
|
contains the left bound of the range of facet values as well as the number of children of the node.
|
||||||
|
The second line contains the document ids which have a facet value within the range of the node.
|
||||||
|
The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range.
|
||||||
|
|
||||||
|
In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because
|
||||||
|
`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`.
|
||||||
|
These documents all contain a facet value that is contained within `ab .. gaf`.
|
||||||
|
|
||||||
|
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
|
||||||
|
[`FacetGroupValue`], which have the following format:
|
||||||
|
|
||||||
|
```ignore
|
||||||
|
FacetGroupKey:
|
||||||
|
- field id : u16
|
||||||
|
- level : u8
|
||||||
|
- left bound: [u8] // the facet value encoded using either OrderedF64Codec or StrRefCodec
|
||||||
|
|
||||||
|
FacetGroupValue:
|
||||||
|
- #children : u8
|
||||||
|
- docids : RoaringBitmap
|
||||||
|
```
|
||||||
|
|
||||||
|
When the database is first created using the "bulk" method, each node has a fixed number of children
|
||||||
|
(except for possibly the last one) given by the `group_size` parameter (defaults to `FACET_GROUP_SIZE`).
|
||||||
|
The tree is also built such that the highest level has more than `min_level_size`
|
||||||
|
(defaults to `FACET_MIN_LEVEL_SIZE`) elements in it.
|
||||||
|
|
||||||
|
When the database is incrementally updated, the number of children of a node can vary between
|
||||||
|
1 and `max_group_size`. This is done so that most incremental operations do not need to change
|
||||||
|
the structure of the tree. When the number of children of a node reaches `max_group_size`,
|
||||||
|
we split the node in two and update the number of children of its parent.
|
||||||
|
|
||||||
|
When adding documents to the databases, it is important to determine which method to use to
|
||||||
|
minimise indexing time. The incremental method is faster when adding few new facet values, but the
|
||||||
|
bulk method is faster when a large part of the database is modified. Empirically, it seems that
|
||||||
|
it takes 50x more time to incrementally add N facet values to an existing database than it does to
|
||||||
|
construct a database of N facet values. This is the heuristic that is used to choose between the
|
||||||
|
two methods.
|
||||||
|
*/
|
||||||
|
|
||||||
|
pub const FACET_MAX_GROUP_SIZE: u8 = 8;
|
||||||
|
pub const FACET_GROUP_SIZE: u8 = 4;
|
||||||
|
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
||||||
|
|
||||||
use self::incremental::FacetsUpdateIncremental;
|
use self::incremental::FacetsUpdateIncremental;
|
||||||
use super::FacetsUpdateBulk;
|
use super::FacetsUpdateBulk;
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
@ -13,8 +89,8 @@ pub struct FacetsUpdate<'i> {
|
|||||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
|
database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
|
||||||
facet_type: FacetType,
|
facet_type: FacetType,
|
||||||
new_data: grenad::Reader<File>,
|
new_data: grenad::Reader<File>,
|
||||||
level_group_size: u8,
|
group_size: u8,
|
||||||
max_level_group_size: u8,
|
max_group_size: u8,
|
||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
}
|
}
|
||||||
impl<'i> FacetsUpdate<'i> {
|
impl<'i> FacetsUpdate<'i> {
|
||||||
@ -30,57 +106,24 @@ impl<'i> FacetsUpdate<'i> {
|
|||||||
Self {
|
Self {
|
||||||
index,
|
index,
|
||||||
database,
|
database,
|
||||||
level_group_size: 4,
|
group_size: FACET_GROUP_SIZE,
|
||||||
max_level_group_size: 8,
|
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||||
min_level_size: 5,
|
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||||
facet_type,
|
facet_type,
|
||||||
new_data,
|
new_data,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: use the options below?
|
|
||||||
// but I don't actually see why they should be configurable
|
|
||||||
// /// The minimum number of elements that a level is allowed to have.
|
|
||||||
// pub fn level_max_group_size(mut self, value: u8) -> Self {
|
|
||||||
// self.max_level_group_size = std::cmp::max(value, 4);
|
|
||||||
// self
|
|
||||||
// }
|
|
||||||
|
|
||||||
// /// The number of elements from the level below that are represented by a single element in the level above
|
|
||||||
// ///
|
|
||||||
// /// This setting is always greater than or equal to 2.
|
|
||||||
// pub fn level_group_size(mut self, value: u8) -> Self {
|
|
||||||
// self.level_group_size = std::cmp::max(value, 2);
|
|
||||||
// self
|
|
||||||
// }
|
|
||||||
|
|
||||||
// /// The minimum number of elements that a level is allowed to have.
|
|
||||||
// pub fn min_level_size(mut self, value: u8) -> Self {
|
|
||||||
// self.min_level_size = std::cmp::max(value, 2);
|
|
||||||
// self
|
|
||||||
// }
|
|
||||||
|
|
||||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||||
if self.new_data.is_empty() {
|
if self.new_data.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
// here, come up with a better condition!
|
|
||||||
// ideally we'd choose which method to use for each field id individually
|
|
||||||
// but I dont' think it's worth the effort yet
|
|
||||||
// As a first requirement, we ask that the length of the new data is less
|
|
||||||
// than a 1/50th of the length of the database in order to use the incremental
|
|
||||||
// method.
|
|
||||||
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
||||||
let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data)
|
let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size);
|
||||||
.level_group_size(self.level_group_size)
|
|
||||||
.min_level_size(self.min_level_size);
|
|
||||||
bulk_update.execute(wtxn)?;
|
bulk_update.execute(wtxn)?;
|
||||||
} else {
|
} else {
|
||||||
let incremental_update =
|
let incremental_update =
|
||||||
FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data)
|
FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size, self.max_group_size);
|
||||||
.group_size(self.level_group_size)
|
|
||||||
.max_group_size(self.max_level_group_size)
|
|
||||||
.min_level_size(self.min_level_size);
|
|
||||||
incremental_update.execute(wtxn)?;
|
incremental_update.execute(wtxn)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -346,7 +389,7 @@ mod comparison_bench {
|
|||||||
// of the incremental vs. bulk indexer.
|
// of the incremental vs. bulk indexer.
|
||||||
// It appears that the incremental indexer is about 50 times slower than the
|
// It appears that the incremental indexer is about 50 times slower than the
|
||||||
// bulk indexer.
|
// bulk indexer.
|
||||||
#[test]
|
// #[test]
|
||||||
fn benchmark_facet_indexing() {
|
fn benchmark_facet_indexing() {
|
||||||
// then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it
|
// then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ mod typed_chunk;
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::io::{Cursor, Read, Seek};
|
use std::io::{Cursor, Read, Seek};
|
||||||
use std::iter::FromIterator;
|
use std::iter::FromIterator;
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
use std::num::NonZeroU32;
|
||||||
use std::result::Result as StdResult;
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
use crossbeam_channel::{Receiver, Sender};
|
use crossbeam_channel::{Receiver, Sender};
|
||||||
@ -82,8 +82,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, F> {
|
|||||||
|
|
||||||
#[derive(Default, Debug, Clone)]
|
#[derive(Default, Debug, Clone)]
|
||||||
pub struct IndexDocumentsConfig {
|
pub struct IndexDocumentsConfig {
|
||||||
pub facet_level_group_size: Option<NonZeroUsize>,
|
|
||||||
pub facet_min_level_size: Option<NonZeroUsize>,
|
|
||||||
pub words_prefix_threshold: Option<u32>,
|
pub words_prefix_threshold: Option<u32>,
|
||||||
pub max_prefix_length: Option<usize>,
|
pub max_prefix_length: Option<usize>,
|
||||||
pub words_positions_level_group_size: Option<NonZeroU32>,
|
pub words_positions_level_group_size: Option<NonZeroU32>,
|
||||||
|
Loading…
Reference in New Issue
Block a user