mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-10 05:24:32 +01:00
Merge #5205
5205: Incremental facets on v1.12 r=curquiza a=dureuill # Pull Request ## Related issue Fixes https://github.com/meilisearch/meilisearch/issues/5213 ## What does this PR do? - Add `new_incremental`module that computes incremental facet indexing for the new indexer - Change heuristics for incremental vs bulk facet choice: under 1000 operations is now always incremental, over 100_000 operations is now always bulk) - Add sanity checks in debug ## Future improvements - Use multi ops from Roaring to decrease the number of allocations - Consider removing or adding multiple levels at once instead of max once per update - Consider using information about the tree structure + the operations that were done (e.g. only addition) to avoid recomputing the group from all children - Consider making the algorithm parallel and looking into the roaring values to know which actually changed Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
commit
dcb4c49cf2
@ -79,22 +79,29 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
|||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
|
use std::ops::Bound;
|
||||||
|
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
use heed::types::{Bytes, DecodeIgnore};
|
use heed::types::{Bytes, DecodeIgnore};
|
||||||
|
use heed::BytesDecode as _;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use self::incremental::FacetsUpdateIncremental;
|
use self::incremental::FacetsUpdateIncremental;
|
||||||
use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
|
use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
use crate::heed_codec::facet::{
|
||||||
|
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec,
|
||||||
|
};
|
||||||
use crate::heed_codec::BytesRefCodec;
|
use crate::heed_codec::BytesRefCodec;
|
||||||
|
use crate::search::facet::get_highest_level;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||||
use crate::{try_split_array_at, FieldId, Index, Result};
|
use crate::{try_split_array_at, FieldId, Index, Result};
|
||||||
|
|
||||||
pub mod bulk;
|
pub mod bulk;
|
||||||
pub mod incremental;
|
pub mod incremental;
|
||||||
|
pub mod new_incremental;
|
||||||
|
|
||||||
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||||
///
|
///
|
||||||
@ -646,3 +653,194 @@ mod comparison_bench {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Run sanity checks on the specified fid tree
|
||||||
|
///
|
||||||
|
/// 1. No "orphan" child value, any child value has a parent
|
||||||
|
/// 2. Any docid in the child appears in the parent
|
||||||
|
/// 3. No docid in the parent is missing from all its children
|
||||||
|
/// 4. no group is bigger than max_group_size
|
||||||
|
/// 5. Less than 50% of groups are bigger than group_size
|
||||||
|
/// 6. group size matches the number of children
|
||||||
|
/// 7. max_level is < 255
|
||||||
|
pub(crate) fn sanity_checks(
|
||||||
|
index: &Index,
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
field_id: FieldId,
|
||||||
|
facet_type: FacetType,
|
||||||
|
group_size: usize,
|
||||||
|
_min_level_size: usize, // might add a check on level size later
|
||||||
|
max_group_size: usize,
|
||||||
|
) -> Result<()> {
|
||||||
|
tracing::info!(%field_id, ?facet_type, "performing sanity checks");
|
||||||
|
let database = match facet_type {
|
||||||
|
FacetType::String => {
|
||||||
|
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||||
|
}
|
||||||
|
FacetType::Number => {
|
||||||
|
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let leaf_prefix: FacetGroupKey<&[u8]> = FacetGroupKey { field_id, level: 0, left_bound: &[] };
|
||||||
|
|
||||||
|
let leaf_it = database.prefix_iter(rtxn, &leaf_prefix)?;
|
||||||
|
|
||||||
|
let max_level = get_highest_level(rtxn, database, field_id)?;
|
||||||
|
if max_level == u8::MAX {
|
||||||
|
panic!("max_level == 255");
|
||||||
|
}
|
||||||
|
|
||||||
|
for leaf in leaf_it {
|
||||||
|
let (leaf_facet_value, leaf_docids) = leaf?;
|
||||||
|
let mut current_level = 0;
|
||||||
|
|
||||||
|
let mut current_parent_facet_value: Option<FacetGroupKey<&[u8]>> = None;
|
||||||
|
let mut current_parent_docids: Option<crate::heed_codec::facet::FacetGroupValue> = None;
|
||||||
|
loop {
|
||||||
|
current_level += 1;
|
||||||
|
if current_level >= max_level {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let parent_key_right_bound = FacetGroupKey {
|
||||||
|
field_id,
|
||||||
|
level: current_level,
|
||||||
|
left_bound: leaf_facet_value.left_bound,
|
||||||
|
};
|
||||||
|
let (parent_facet_value, parent_docids) = database
|
||||||
|
.get_lower_than_or_equal_to(rtxn, &parent_key_right_bound)?
|
||||||
|
.expect("no parent found");
|
||||||
|
if parent_facet_value.level != current_level {
|
||||||
|
panic!(
|
||||||
|
"wrong parent level, found_level={}, expected_level={}",
|
||||||
|
parent_facet_value.level, current_level
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if parent_facet_value.field_id != field_id {
|
||||||
|
panic!("wrong parent fid");
|
||||||
|
}
|
||||||
|
if parent_facet_value.left_bound > leaf_facet_value.left_bound {
|
||||||
|
panic!("wrong parent left bound");
|
||||||
|
}
|
||||||
|
|
||||||
|
if !leaf_docids.bitmap.is_subset(&parent_docids.bitmap) {
|
||||||
|
panic!(
|
||||||
|
"missing docids from leaf in parent, current_level={}, parent={}, child={}, missing={missing:?}, child_len={}, child={:?}",
|
||||||
|
current_level,
|
||||||
|
facet_to_string(parent_facet_value.left_bound, facet_type),
|
||||||
|
facet_to_string(leaf_facet_value.left_bound, facet_type),
|
||||||
|
leaf_docids.bitmap.len(),
|
||||||
|
leaf_docids.bitmap.clone(),
|
||||||
|
missing=leaf_docids.bitmap - parent_docids.bitmap,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(current_parent_facet_value) = current_parent_facet_value {
|
||||||
|
if current_parent_facet_value.field_id != parent_facet_value.field_id {
|
||||||
|
panic!("wrong parent parent fid");
|
||||||
|
}
|
||||||
|
if current_parent_facet_value.level + 1 != parent_facet_value.level {
|
||||||
|
panic!("wrong parent parent level");
|
||||||
|
}
|
||||||
|
if current_parent_facet_value.left_bound < parent_facet_value.left_bound {
|
||||||
|
panic!("wrong parent parent left bound");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(current_parent_docids) = current_parent_docids {
|
||||||
|
if !current_parent_docids.bitmap.is_subset(&parent_docids.bitmap) {
|
||||||
|
panic!("missing docids from intermediate node in parent, parent_level={}, parent={}, intermediate={}, missing={missing:?}, intermediate={:?}",
|
||||||
|
parent_facet_value.level,
|
||||||
|
facet_to_string(parent_facet_value.left_bound, facet_type),
|
||||||
|
facet_to_string(current_parent_facet_value.unwrap().left_bound, facet_type),
|
||||||
|
current_parent_docids.bitmap.clone(),
|
||||||
|
missing=current_parent_docids.bitmap - parent_docids.bitmap,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
current_parent_facet_value = Some(parent_facet_value);
|
||||||
|
current_parent_docids = Some(parent_docids);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tracing::info!(%field_id, ?facet_type, "checked all leaves");
|
||||||
|
|
||||||
|
let mut current_level = max_level;
|
||||||
|
let mut greater_than_group = 0usize;
|
||||||
|
let mut total = 0usize;
|
||||||
|
loop {
|
||||||
|
if current_level == 0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let child_level = current_level - 1;
|
||||||
|
tracing::info!(%field_id, ?facet_type, %current_level, "checked groups for level");
|
||||||
|
let level_groups_prefix: FacetGroupKey<&[u8]> =
|
||||||
|
FacetGroupKey { field_id, level: current_level, left_bound: &[] };
|
||||||
|
let mut level_groups_it = database.prefix_iter(rtxn, &level_groups_prefix)?.peekable();
|
||||||
|
|
||||||
|
'group_it: loop {
|
||||||
|
let Some(group) = level_groups_it.next() else { break 'group_it };
|
||||||
|
|
||||||
|
let (group_facet_value, group_docids) = group?;
|
||||||
|
let child_left_bound = group_facet_value.left_bound.to_owned();
|
||||||
|
let mut expected_docids = RoaringBitmap::new();
|
||||||
|
let mut expected_size = 0usize;
|
||||||
|
let right_bound = level_groups_it
|
||||||
|
.peek()
|
||||||
|
.and_then(|res| res.as_ref().ok())
|
||||||
|
.map(|(key, _)| key.left_bound);
|
||||||
|
let child_left_bound = FacetGroupKey {
|
||||||
|
field_id,
|
||||||
|
level: child_level,
|
||||||
|
left_bound: child_left_bound.as_slice(),
|
||||||
|
};
|
||||||
|
let child_left_bound = Bound::Included(&child_left_bound);
|
||||||
|
let child_right_bound;
|
||||||
|
let child_right_bound = if let Some(right_bound) = right_bound {
|
||||||
|
child_right_bound =
|
||||||
|
FacetGroupKey { field_id, level: child_level, left_bound: right_bound };
|
||||||
|
Bound::Excluded(&child_right_bound)
|
||||||
|
} else {
|
||||||
|
Bound::Unbounded
|
||||||
|
};
|
||||||
|
let children = database.range(rtxn, &(child_left_bound, child_right_bound))?;
|
||||||
|
for child in children {
|
||||||
|
let (child_facet_value, child_docids) = child?;
|
||||||
|
if child_facet_value.field_id != field_id {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if child_facet_value.level != child_level {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
expected_size += 1;
|
||||||
|
expected_docids |= &child_docids.bitmap;
|
||||||
|
}
|
||||||
|
assert_eq!(expected_size, group_docids.size as usize);
|
||||||
|
assert!(expected_size <= max_group_size);
|
||||||
|
assert_eq!(expected_docids, group_docids.bitmap);
|
||||||
|
total += 1;
|
||||||
|
if expected_size > group_size {
|
||||||
|
greater_than_group += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
current_level -= 1;
|
||||||
|
}
|
||||||
|
if greater_than_group * 2 > total {
|
||||||
|
panic!("too many groups have a size > group_size");
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!("sanity checks OK");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn facet_to_string(facet_value: &[u8], facet_type: FacetType) -> String {
|
||||||
|
match facet_type {
|
||||||
|
FacetType::String => bstr::BStr::new(facet_value).to_string(),
|
||||||
|
FacetType::Number => match OrderedF64Codec::bytes_decode(facet_value) {
|
||||||
|
Ok(value) => value.to_string(),
|
||||||
|
Err(e) => format!("error: {e} (bytes: {facet_value:?}"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
498
crates/milli/src/update/facet/new_incremental.rs
Normal file
498
crates/milli/src/update/facet/new_incremental.rs
Normal file
@ -0,0 +1,498 @@
|
|||||||
|
use std::ops::Bound;
|
||||||
|
|
||||||
|
use heed::types::{Bytes, DecodeIgnore};
|
||||||
|
use heed::{BytesDecode as _, Database, RwTxn};
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::facet::FacetType;
|
||||||
|
use crate::heed_codec::facet::{
|
||||||
|
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||||
|
};
|
||||||
|
use crate::heed_codec::BytesRefCodec;
|
||||||
|
use crate::search::facet::get_highest_level;
|
||||||
|
use crate::update::valid_facet_value;
|
||||||
|
use crate::{FieldId, Index, Result};
|
||||||
|
|
||||||
|
pub struct FacetsUpdateIncremental {
|
||||||
|
inner: FacetsUpdateIncrementalInner,
|
||||||
|
delta_data: Vec<FacetFieldIdChange>,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FacetsUpdateIncrementalInner {
|
||||||
|
db: Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||||
|
field_id: FieldId,
|
||||||
|
group_size: u8,
|
||||||
|
min_level_size: u8,
|
||||||
|
max_group_size: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FacetsUpdateIncremental {
|
||||||
|
pub fn new(
|
||||||
|
index: &Index,
|
||||||
|
facet_type: FacetType,
|
||||||
|
field_id: FieldId,
|
||||||
|
delta_data: Vec<FacetFieldIdChange>,
|
||||||
|
group_size: u8,
|
||||||
|
min_level_size: u8,
|
||||||
|
max_group_size: u8,
|
||||||
|
) -> Self {
|
||||||
|
FacetsUpdateIncremental {
|
||||||
|
inner: FacetsUpdateIncrementalInner {
|
||||||
|
db: match facet_type {
|
||||||
|
FacetType::String => index
|
||||||
|
.facet_id_string_docids
|
||||||
|
.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
|
||||||
|
FacetType::Number => index
|
||||||
|
.facet_id_f64_docids
|
||||||
|
.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
|
||||||
|
},
|
||||||
|
field_id,
|
||||||
|
group_size,
|
||||||
|
min_level_size,
|
||||||
|
max_group_size,
|
||||||
|
},
|
||||||
|
|
||||||
|
delta_data,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::incremental")]
|
||||||
|
pub fn execute(mut self, wtxn: &mut RwTxn) -> Result<()> {
|
||||||
|
if self.delta_data.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
self.delta_data.sort_unstable_by(
|
||||||
|
|FacetFieldIdChange { facet_value: left, .. },
|
||||||
|
FacetFieldIdChange { facet_value: right, .. }| {
|
||||||
|
left.cmp(right)
|
||||||
|
// sort in **reverse** lexicographic order
|
||||||
|
.reverse()
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
self.inner.find_changed_parents(wtxn, self.delta_data)?;
|
||||||
|
|
||||||
|
self.inner.add_or_delete_level(wtxn)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FacetsUpdateIncrementalInner {
|
||||||
|
/// WARNING: `changed_children` must be sorted in **reverse** lexicographic order.
|
||||||
|
fn find_changed_parents(
|
||||||
|
&self,
|
||||||
|
wtxn: &mut RwTxn,
|
||||||
|
mut changed_children: Vec<FacetFieldIdChange>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut changed_parents = vec![];
|
||||||
|
for child_level in 0u8..u8::MAX {
|
||||||
|
// child_level < u8::MAX by construction
|
||||||
|
let parent_level = child_level + 1;
|
||||||
|
let parent_level_left_bound: FacetGroupKey<&[u8]> =
|
||||||
|
FacetGroupKey { field_id: self.field_id, level: parent_level, left_bound: &[] };
|
||||||
|
|
||||||
|
let mut last_parent: Option<Box<[u8]>> = None;
|
||||||
|
let mut child_it = changed_children
|
||||||
|
// drain all changed children
|
||||||
|
.drain(..)
|
||||||
|
// keep only children whose value is valid in the LMDB sense
|
||||||
|
.filter(|child| valid_facet_value(&child.facet_value));
|
||||||
|
// `while let` rather than `for` because we advance `child_it` inside of the loop
|
||||||
|
'current_level: while let Some(child) = child_it.next() {
|
||||||
|
if let Some(last_parent) = &last_parent {
|
||||||
|
if &child.facet_value >= last_parent {
|
||||||
|
self.compute_parent_group(wtxn, child_level, child.facet_value)?;
|
||||||
|
continue 'current_level;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// need to find a new parent
|
||||||
|
let parent_key_prefix = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: parent_level,
|
||||||
|
left_bound: &*child.facet_value,
|
||||||
|
};
|
||||||
|
|
||||||
|
let parent = self
|
||||||
|
.db
|
||||||
|
.remap_data_type::<DecodeIgnore>()
|
||||||
|
.rev_range(
|
||||||
|
wtxn,
|
||||||
|
&(
|
||||||
|
Bound::Excluded(&parent_level_left_bound),
|
||||||
|
Bound::Included(&parent_key_prefix),
|
||||||
|
),
|
||||||
|
)?
|
||||||
|
.next();
|
||||||
|
|
||||||
|
match parent {
|
||||||
|
Some(Ok((parent_key, _parent_value))) => {
|
||||||
|
// found parent, cache it for next keys
|
||||||
|
last_parent = Some(parent_key.left_bound.to_owned().into_boxed_slice());
|
||||||
|
|
||||||
|
// add to modified list for parent level
|
||||||
|
changed_parents.push(FacetFieldIdChange {
|
||||||
|
facet_value: parent_key.left_bound.to_owned().into_boxed_slice(),
|
||||||
|
});
|
||||||
|
self.compute_parent_group(wtxn, child_level, child.facet_value)?;
|
||||||
|
}
|
||||||
|
Some(Err(err)) => return Err(err.into()),
|
||||||
|
None => {
|
||||||
|
// no parent for that key
|
||||||
|
let mut parent_it = self
|
||||||
|
.db
|
||||||
|
.remap_data_type::<DecodeIgnore>()
|
||||||
|
.prefix_iter_mut(wtxn, &parent_level_left_bound)?;
|
||||||
|
match parent_it.next() {
|
||||||
|
// 1. left of the current left bound, or
|
||||||
|
Some(Ok((first_key, _first_value))) => {
|
||||||
|
// make sure we don't spill on the neighboring fid (level also included defensively)
|
||||||
|
if first_key.field_id != self.field_id
|
||||||
|
|| first_key.level != parent_level
|
||||||
|
{
|
||||||
|
// max level reached, exit
|
||||||
|
drop(parent_it);
|
||||||
|
self.compute_parent_group(
|
||||||
|
wtxn,
|
||||||
|
child_level,
|
||||||
|
child.facet_value,
|
||||||
|
)?;
|
||||||
|
for child in child_it.by_ref() {
|
||||||
|
self.compute_parent_group(
|
||||||
|
wtxn,
|
||||||
|
child_level,
|
||||||
|
child.facet_value,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
// remove old left bound
|
||||||
|
unsafe { parent_it.del_current()? };
|
||||||
|
drop(parent_it);
|
||||||
|
changed_parents.push(FacetFieldIdChange {
|
||||||
|
facet_value: child.facet_value.clone(),
|
||||||
|
});
|
||||||
|
self.compute_parent_group(wtxn, child_level, child.facet_value)?;
|
||||||
|
// pop all elements in order to visit the new left bound
|
||||||
|
let new_left_bound =
|
||||||
|
&mut changed_parents.last_mut().unwrap().facet_value;
|
||||||
|
for child in child_it.by_ref() {
|
||||||
|
new_left_bound.clone_from(&child.facet_value);
|
||||||
|
|
||||||
|
self.compute_parent_group(
|
||||||
|
wtxn,
|
||||||
|
child_level,
|
||||||
|
child.facet_value,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(Err(err)) => return Err(err.into()),
|
||||||
|
// 2. max level reached, exit
|
||||||
|
None => {
|
||||||
|
drop(parent_it);
|
||||||
|
self.compute_parent_group(wtxn, child_level, child.facet_value)?;
|
||||||
|
for child in child_it.by_ref() {
|
||||||
|
self.compute_parent_group(
|
||||||
|
wtxn,
|
||||||
|
child_level,
|
||||||
|
child.facet_value,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if changed_parents.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
drop(child_it);
|
||||||
|
std::mem::swap(&mut changed_children, &mut changed_parents);
|
||||||
|
// changed_parents is now empty because changed_children was emptied by the drain
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn compute_parent_group(
|
||||||
|
&self,
|
||||||
|
wtxn: &mut RwTxn<'_>,
|
||||||
|
parent_level: u8,
|
||||||
|
parent_left_bound: Box<[u8]>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut range_left_bound: Vec<u8> = parent_left_bound.into();
|
||||||
|
if parent_level == 0 {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let child_level = parent_level - 1;
|
||||||
|
|
||||||
|
let parent_key = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: parent_level,
|
||||||
|
left_bound: &*range_left_bound,
|
||||||
|
};
|
||||||
|
let child_right_bound = self
|
||||||
|
.db
|
||||||
|
.remap_data_type::<DecodeIgnore>()
|
||||||
|
.get_greater_than(wtxn, &parent_key)?
|
||||||
|
.and_then(
|
||||||
|
|(
|
||||||
|
FacetGroupKey {
|
||||||
|
level: right_level,
|
||||||
|
field_id: right_fid,
|
||||||
|
left_bound: right_bound,
|
||||||
|
},
|
||||||
|
_,
|
||||||
|
)| {
|
||||||
|
if parent_level != right_level || self.field_id != right_fid {
|
||||||
|
// there was a greater key, but with a greater level or fid, so not a sibling to the parent: ignore
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(right_bound.to_owned())
|
||||||
|
},
|
||||||
|
);
|
||||||
|
let child_right_bound = match &child_right_bound {
|
||||||
|
Some(right_bound) => Bound::Excluded(FacetGroupKey {
|
||||||
|
left_bound: right_bound.as_slice(),
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: child_level,
|
||||||
|
}),
|
||||||
|
None => Bound::Unbounded,
|
||||||
|
};
|
||||||
|
|
||||||
|
let child_left_key = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: child_level,
|
||||||
|
left_bound: &*range_left_bound,
|
||||||
|
};
|
||||||
|
let mut child_left_bound = Bound::Included(child_left_key);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
// do a first pass on the range to find the number of children
|
||||||
|
let child_count = self
|
||||||
|
.db
|
||||||
|
.remap_data_type::<DecodeIgnore>()
|
||||||
|
.range(wtxn, &(child_left_bound, child_right_bound))?
|
||||||
|
.take(self.max_group_size as usize * 2)
|
||||||
|
.count();
|
||||||
|
let mut child_it = self.db.range(wtxn, &(child_left_bound, child_right_bound))?;
|
||||||
|
|
||||||
|
// pick the right group_size depending on the number of children
|
||||||
|
let group_size = if child_count >= self.max_group_size as usize * 2 {
|
||||||
|
// more than twice the max_group_size => there will be space for at least 2 groups of max_group_size
|
||||||
|
self.max_group_size as usize
|
||||||
|
} else if child_count >= self.group_size as usize {
|
||||||
|
// size in [group_size, max_group_size * 2[
|
||||||
|
// divided by 2 it is between [group_size / 2, max_group_size[
|
||||||
|
// this ensures that the tree is balanced
|
||||||
|
child_count / 2
|
||||||
|
} else {
|
||||||
|
// take everything
|
||||||
|
child_count
|
||||||
|
};
|
||||||
|
|
||||||
|
let res: Result<_> = child_it
|
||||||
|
.by_ref()
|
||||||
|
.take(group_size)
|
||||||
|
// stop if we go to the next level or field id
|
||||||
|
.take_while(|res| match res {
|
||||||
|
Ok((child_key, _)) => {
|
||||||
|
child_key.field_id == self.field_id && child_key.level == child_level
|
||||||
|
}
|
||||||
|
Err(_) => true,
|
||||||
|
})
|
||||||
|
.try_fold(
|
||||||
|
(None, FacetGroupValue { size: 0, bitmap: Default::default() }),
|
||||||
|
|(bounds, mut group_value), child_res| {
|
||||||
|
let (child_key, child_value) = child_res?;
|
||||||
|
let bounds = match bounds {
|
||||||
|
Some((left_bound, _)) => Some((left_bound, child_key.left_bound)),
|
||||||
|
None => Some((child_key.left_bound, child_key.left_bound)),
|
||||||
|
};
|
||||||
|
// max_group_size <= u8::MAX
|
||||||
|
group_value.size += 1;
|
||||||
|
group_value.bitmap |= &child_value.bitmap;
|
||||||
|
Ok((bounds, group_value))
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
let (bounds, group_value) = res?;
|
||||||
|
|
||||||
|
let Some((group_left_bound, right_bound)) = bounds else {
|
||||||
|
let update_key = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: parent_level,
|
||||||
|
left_bound: &*range_left_bound,
|
||||||
|
};
|
||||||
|
drop(child_it);
|
||||||
|
if let Bound::Included(_) = child_left_bound {
|
||||||
|
self.db.delete(wtxn, &update_key)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
drop(child_it);
|
||||||
|
let current_left_bound = group_left_bound.to_owned();
|
||||||
|
|
||||||
|
let delete_old_bound = match child_left_bound {
|
||||||
|
Bound::Included(bound) => {
|
||||||
|
if bound.left_bound != current_left_bound {
|
||||||
|
Some(range_left_bound.clone())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
range_left_bound.clear();
|
||||||
|
range_left_bound.extend_from_slice(right_bound);
|
||||||
|
let child_left_key = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: child_level,
|
||||||
|
left_bound: range_left_bound.as_slice(),
|
||||||
|
};
|
||||||
|
child_left_bound = Bound::Excluded(child_left_key);
|
||||||
|
|
||||||
|
if let Some(old_bound) = delete_old_bound {
|
||||||
|
let update_key = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: parent_level,
|
||||||
|
left_bound: old_bound.as_slice(),
|
||||||
|
};
|
||||||
|
self.db.delete(wtxn, &update_key)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let update_key = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: parent_level,
|
||||||
|
left_bound: current_left_bound.as_slice(),
|
||||||
|
};
|
||||||
|
if group_value.bitmap.is_empty() {
|
||||||
|
self.db.delete(wtxn, &update_key)?;
|
||||||
|
} else {
|
||||||
|
self.db.put(wtxn, &update_key, &group_value)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check whether the highest level has exceeded `min_level_size` * `self.group_size`.
|
||||||
|
/// If it has, we must build an addition level above it.
|
||||||
|
/// Then check whether the highest level is under `min_level_size`.
|
||||||
|
/// If it has, we must remove the complete level.
|
||||||
|
pub(crate) fn add_or_delete_level(&self, txn: &mut RwTxn<'_>) -> Result<()> {
|
||||||
|
let highest_level = get_highest_level(txn, self.db, self.field_id)?;
|
||||||
|
let mut highest_level_prefix = vec![];
|
||||||
|
highest_level_prefix.extend_from_slice(&self.field_id.to_be_bytes());
|
||||||
|
highest_level_prefix.push(highest_level);
|
||||||
|
|
||||||
|
let size_highest_level =
|
||||||
|
self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, &highest_level_prefix)?.count();
|
||||||
|
|
||||||
|
if size_highest_level >= self.group_size as usize * self.min_level_size as usize {
|
||||||
|
self.add_level(txn, highest_level, &highest_level_prefix, size_highest_level)
|
||||||
|
} else if size_highest_level < self.min_level_size as usize && highest_level != 0 {
|
||||||
|
self.delete_level(txn, &highest_level_prefix)
|
||||||
|
} else {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete a level.
|
||||||
|
fn delete_level(&self, txn: &mut RwTxn<'_>, highest_level_prefix: &[u8]) -> Result<()> {
|
||||||
|
let mut to_delete = vec![];
|
||||||
|
let mut iter =
|
||||||
|
self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, highest_level_prefix)?;
|
||||||
|
for el in iter.by_ref() {
|
||||||
|
let (k, _) = el?;
|
||||||
|
to_delete.push(
|
||||||
|
FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(k)
|
||||||
|
.map_err(heed::Error::Encoding)?
|
||||||
|
.into_owned(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
drop(iter);
|
||||||
|
for k in to_delete {
|
||||||
|
self.db.delete(txn, &k.as_ref())?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build an additional level for the field id.
|
||||||
|
fn add_level(
|
||||||
|
&self,
|
||||||
|
txn: &mut RwTxn<'_>,
|
||||||
|
highest_level: u8,
|
||||||
|
highest_level_prefix: &[u8],
|
||||||
|
size_highest_level: usize,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut groups_iter = self
|
||||||
|
.db
|
||||||
|
.remap_types::<Bytes, FacetGroupValueCodec>()
|
||||||
|
.prefix_iter(txn, highest_level_prefix)?;
|
||||||
|
|
||||||
|
let nbr_new_groups = size_highest_level / self.group_size as usize;
|
||||||
|
let nbr_leftover_elements = size_highest_level % self.group_size as usize;
|
||||||
|
|
||||||
|
let mut to_add = vec![];
|
||||||
|
for _ in 0..nbr_new_groups {
|
||||||
|
let mut first_key = None;
|
||||||
|
let mut values = RoaringBitmap::new();
|
||||||
|
for _ in 0..self.group_size {
|
||||||
|
let (key_bytes, value_i) = groups_iter.next().unwrap()?;
|
||||||
|
let key_i = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes)
|
||||||
|
.map_err(heed::Error::Encoding)?;
|
||||||
|
|
||||||
|
if first_key.is_none() {
|
||||||
|
first_key = Some(key_i);
|
||||||
|
}
|
||||||
|
values |= value_i.bitmap;
|
||||||
|
}
|
||||||
|
let key = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: highest_level + 1,
|
||||||
|
left_bound: first_key.unwrap().left_bound,
|
||||||
|
};
|
||||||
|
let value = FacetGroupValue { size: self.group_size, bitmap: values };
|
||||||
|
to_add.push((key.into_owned(), value));
|
||||||
|
}
|
||||||
|
// now we add the rest of the level, in case its size is > group_size * min_level_size
|
||||||
|
// this can indeed happen if the min_level_size parameter changes between two calls to `insert`
|
||||||
|
if nbr_leftover_elements > 0 {
|
||||||
|
let mut first_key = None;
|
||||||
|
let mut values = RoaringBitmap::new();
|
||||||
|
for _ in 0..nbr_leftover_elements {
|
||||||
|
let (key_bytes, value_i) = groups_iter.next().unwrap()?;
|
||||||
|
let key_i = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes)
|
||||||
|
.map_err(heed::Error::Encoding)?;
|
||||||
|
|
||||||
|
if first_key.is_none() {
|
||||||
|
first_key = Some(key_i);
|
||||||
|
}
|
||||||
|
values |= value_i.bitmap;
|
||||||
|
}
|
||||||
|
let key = FacetGroupKey {
|
||||||
|
field_id: self.field_id,
|
||||||
|
level: highest_level + 1,
|
||||||
|
left_bound: first_key.unwrap().left_bound,
|
||||||
|
};
|
||||||
|
// Note: nbr_leftover_elements can be casted to a u8 since it is bounded by `max_group_size`
|
||||||
|
// when it is created above.
|
||||||
|
let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values };
|
||||||
|
to_add.push((key.into_owned(), value));
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(groups_iter);
|
||||||
|
for (key, value) in to_add {
|
||||||
|
self.db.put(txn, &key.as_ref(), &value)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct FacetFieldIdChange {
|
||||||
|
pub facet_value: Box<[u8]>,
|
||||||
|
}
|
@ -10,10 +10,14 @@ use fst::{IntoStreamer, Streamer};
|
|||||||
pub use grenad_helpers::*;
|
pub use grenad_helpers::*;
|
||||||
pub use merge_functions::*;
|
pub use merge_functions::*;
|
||||||
|
|
||||||
use crate::MAX_WORD_LENGTH;
|
use crate::MAX_LMDB_KEY_LENGTH;
|
||||||
|
|
||||||
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
|
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
|
||||||
key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
|
key.as_ref().len() <= MAX_LMDB_KEY_LENGTH - 3 && !key.as_ref().is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn valid_facet_value(facet_value: impl AsRef<[u8]>) -> bool {
|
||||||
|
facet_value.as_ref().len() <= MAX_LMDB_KEY_LENGTH - 3 && !facet_value.as_ref().is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
|
/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/index_documents/mod.rs
|
source: crates/milli/src/update/index_documents/mod.rs
|
||||||
---
|
---
|
||||||
3 0 48.9021 1 [19, ]
|
3 0 48.9021 1 [19, ]
|
||||||
3 0 49.9314 1 [17, ]
|
3 0 49.9314 1 [17, ]
|
||||||
@ -15,6 +15,11 @@ source: milli/src/update/index_documents/mod.rs
|
|||||||
3 0 50.7453 1 [7, ]
|
3 0 50.7453 1 [7, ]
|
||||||
3 0 50.8466 1 [10, ]
|
3 0 50.8466 1 [10, ]
|
||||||
3 0 51.0537 1 [9, ]
|
3 0 51.0537 1 [9, ]
|
||||||
|
3 1 48.9021 2 [17, 19, ]
|
||||||
|
3 1 50.1793 3 [13, 14, 15, ]
|
||||||
|
3 1 50.4502 4 [0, 3, 8, 12, ]
|
||||||
|
3 1 50.6312 2 [1, 2, ]
|
||||||
|
3 1 50.7453 3 [7, 9, 10, ]
|
||||||
4 0 2.271 1 [17, ]
|
4 0 2.271 1 [17, ]
|
||||||
4 0 2.3708 1 [19, ]
|
4 0 2.3708 1 [19, ]
|
||||||
4 0 2.7637 1 [14, ]
|
4 0 2.7637 1 [14, ]
|
||||||
@ -28,4 +33,3 @@ source: milli/src/update/index_documents/mod.rs
|
|||||||
4 0 3.6957 1 [9, ]
|
4 0 3.6957 1 [9, ]
|
||||||
4 0 3.9623 1 [12, ]
|
4 0 3.9623 1 [12, ]
|
||||||
4 0 4.337 1 [10, ]
|
4 0 4.337 1 [10, ]
|
||||||
|
|
||||||
|
@ -36,6 +36,8 @@ use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
|
|||||||
use crate::progress::Progress;
|
use crate::progress::Progress;
|
||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::update::del_add::DelAdd;
|
use crate::update::del_add::DelAdd;
|
||||||
|
use crate::update::facet::new_incremental::FacetsUpdateIncremental;
|
||||||
|
use crate::update::facet::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||||
use crate::update::new::extract::EmbeddingExtractor;
|
use crate::update::new::extract::EmbeddingExtractor;
|
||||||
use crate::update::new::merger::merge_and_send_rtree;
|
use crate::update::new::merger::merge_and_send_rtree;
|
||||||
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
||||||
@ -203,6 +205,7 @@ where
|
|||||||
caches,
|
caches,
|
||||||
FacetDatabases::new(index),
|
FacetDatabases::new(index),
|
||||||
index,
|
index,
|
||||||
|
&rtxn,
|
||||||
extractor_sender.facet_docids(),
|
extractor_sender.facet_docids(),
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -735,27 +738,66 @@ fn compute_facet_search_database(
|
|||||||
fn compute_facet_level_database(
|
fn compute_facet_level_database(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
facet_field_ids_delta: FacetFieldIdsDelta,
|
mut facet_field_ids_delta: FacetFieldIdsDelta,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if let Some(modified_facet_string_ids) = facet_field_ids_delta.modified_facet_string_ids() {
|
for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() {
|
||||||
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string");
|
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
FacetsUpdateBulk::new_not_updating_level_0(
|
match delta {
|
||||||
index,
|
super::merger::FacetFieldIdDelta::Bulk => {
|
||||||
modified_facet_string_ids,
|
tracing::debug!(%fid, "bulk string facet processing");
|
||||||
FacetType::String,
|
FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::String)
|
||||||
)
|
.execute(wtxn)?
|
||||||
.execute(wtxn)?;
|
|
||||||
}
|
}
|
||||||
if let Some(modified_facet_number_ids) = facet_field_ids_delta.modified_facet_number_ids() {
|
super::merger::FacetFieldIdDelta::Incremental(delta_data) => {
|
||||||
|
tracing::debug!(%fid, len=%delta_data.len(), "incremental string facet processing");
|
||||||
|
FacetsUpdateIncremental::new(
|
||||||
|
index,
|
||||||
|
FacetType::String,
|
||||||
|
fid,
|
||||||
|
delta_data,
|
||||||
|
FACET_GROUP_SIZE,
|
||||||
|
FACET_MIN_LEVEL_SIZE,
|
||||||
|
FACET_MAX_GROUP_SIZE,
|
||||||
|
)
|
||||||
|
.execute(wtxn)?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (fid, delta) in facet_field_ids_delta.consume_facet_number_delta() {
|
||||||
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
|
let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
FacetsUpdateBulk::new_not_updating_level_0(
|
match delta {
|
||||||
|
super::merger::FacetFieldIdDelta::Bulk => {
|
||||||
|
tracing::debug!(%fid, "bulk number facet processing");
|
||||||
|
FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::Number)
|
||||||
|
.execute(wtxn)?
|
||||||
|
}
|
||||||
|
super::merger::FacetFieldIdDelta::Incremental(delta_data) => {
|
||||||
|
tracing::debug!(%fid, len=%delta_data.len(), "incremental number facet processing");
|
||||||
|
FacetsUpdateIncremental::new(
|
||||||
index,
|
index,
|
||||||
modified_facet_number_ids,
|
|
||||||
FacetType::Number,
|
FacetType::Number,
|
||||||
|
fid,
|
||||||
|
delta_data,
|
||||||
|
FACET_GROUP_SIZE,
|
||||||
|
FACET_MIN_LEVEL_SIZE,
|
||||||
|
FACET_MAX_GROUP_SIZE,
|
||||||
)
|
)
|
||||||
.execute(wtxn)?;
|
.execute(wtxn)?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug_assert!(crate::update::facet::sanity_checks(
|
||||||
|
index,
|
||||||
|
wtxn,
|
||||||
|
fid,
|
||||||
|
FacetType::Number,
|
||||||
|
FACET_GROUP_SIZE as usize,
|
||||||
|
FACET_MIN_LEVEL_SIZE as usize,
|
||||||
|
FACET_MAX_GROUP_SIZE as usize,
|
||||||
|
)
|
||||||
|
.is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
|
|
||||||
use hashbrown::HashSet;
|
use hashbrown::HashMap;
|
||||||
use heed::types::Bytes;
|
use heed::types::Bytes;
|
||||||
use heed::{Database, RoTxn};
|
use heed::{Database, RoTxn};
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
@ -12,6 +12,7 @@ use super::extract::{
|
|||||||
merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap,
|
merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap,
|
||||||
FacetKind, GeoExtractorData,
|
FacetKind, GeoExtractorData,
|
||||||
};
|
};
|
||||||
|
use crate::update::facet::new_incremental::FacetFieldIdChange;
|
||||||
use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
|
use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
|
||||||
|
|
||||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
||||||
@ -100,12 +101,18 @@ pub fn merge_and_send_facet_docids<'extractor>(
|
|||||||
mut caches: Vec<BalancedCaches<'extractor>>,
|
mut caches: Vec<BalancedCaches<'extractor>>,
|
||||||
database: FacetDatabases,
|
database: FacetDatabases,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
|
rtxn: &RoTxn,
|
||||||
docids_sender: FacetDocidsSender,
|
docids_sender: FacetDocidsSender,
|
||||||
) -> Result<FacetFieldIdsDelta> {
|
) -> Result<FacetFieldIdsDelta> {
|
||||||
|
let max_string_count = (index.facet_id_string_docids.len(rtxn)? / 500) as usize;
|
||||||
|
let max_number_count = (index.facet_id_f64_docids.len(rtxn)? / 500) as usize;
|
||||||
|
let max_string_count = max_string_count.clamp(1000, 100_000);
|
||||||
|
let max_number_count = max_number_count.clamp(1000, 100_000);
|
||||||
transpose_and_freeze_caches(&mut caches)?
|
transpose_and_freeze_caches(&mut caches)?
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.map(|frozen| {
|
.map(|frozen| {
|
||||||
let mut facet_field_ids_delta = FacetFieldIdsDelta::default();
|
let mut facet_field_ids_delta =
|
||||||
|
FacetFieldIdsDelta::new(max_string_count, max_number_count);
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
|
||||||
let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?;
|
let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?;
|
||||||
@ -126,7 +133,10 @@ pub fn merge_and_send_facet_docids<'extractor>(
|
|||||||
|
|
||||||
Ok(facet_field_ids_delta)
|
Ok(facet_field_ids_delta)
|
||||||
})
|
})
|
||||||
.reduce(|| Ok(FacetFieldIdsDelta::default()), |lhs, rhs| Ok(lhs?.merge(rhs?)))
|
.reduce(
|
||||||
|
|| Ok(FacetFieldIdsDelta::new(max_string_count, max_number_count)),
|
||||||
|
|lhs, rhs| Ok(lhs?.merge(rhs?)),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct FacetDatabases<'a> {
|
pub struct FacetDatabases<'a> {
|
||||||
@ -155,60 +165,131 @@ impl<'a> FacetDatabases<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default)]
|
#[derive(Debug)]
|
||||||
|
pub enum FacetFieldIdDelta {
|
||||||
|
Bulk,
|
||||||
|
Incremental(Vec<FacetFieldIdChange>),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FacetFieldIdDelta {
|
||||||
|
fn push(&mut self, facet_value: &[u8], max_count: usize) {
|
||||||
|
*self = match std::mem::replace(self, FacetFieldIdDelta::Bulk) {
|
||||||
|
FacetFieldIdDelta::Bulk => FacetFieldIdDelta::Bulk,
|
||||||
|
FacetFieldIdDelta::Incremental(mut v) => {
|
||||||
|
if v.len() >= max_count {
|
||||||
|
FacetFieldIdDelta::Bulk
|
||||||
|
} else {
|
||||||
|
v.push(FacetFieldIdChange { facet_value: facet_value.into() });
|
||||||
|
FacetFieldIdDelta::Incremental(v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn merge(&mut self, rhs: Option<Self>, max_count: usize) {
|
||||||
|
let Some(rhs) = rhs else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
*self = match (std::mem::replace(self, FacetFieldIdDelta::Bulk), rhs) {
|
||||||
|
(FacetFieldIdDelta::Bulk, _) | (_, FacetFieldIdDelta::Bulk) => FacetFieldIdDelta::Bulk,
|
||||||
|
(
|
||||||
|
FacetFieldIdDelta::Incremental(mut left),
|
||||||
|
FacetFieldIdDelta::Incremental(mut right),
|
||||||
|
) => {
|
||||||
|
if left.len() + right.len() >= max_count {
|
||||||
|
FacetFieldIdDelta::Bulk
|
||||||
|
} else {
|
||||||
|
left.append(&mut right);
|
||||||
|
FacetFieldIdDelta::Incremental(left)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub struct FacetFieldIdsDelta {
|
pub struct FacetFieldIdsDelta {
|
||||||
/// The field ids that have been modified
|
/// The field ids that have been modified
|
||||||
modified_facet_string_ids: HashSet<FieldId>,
|
modified_facet_string_ids: HashMap<FieldId, FacetFieldIdDelta, rustc_hash::FxBuildHasher>,
|
||||||
modified_facet_number_ids: HashSet<FieldId>,
|
modified_facet_number_ids: HashMap<FieldId, FacetFieldIdDelta, rustc_hash::FxBuildHasher>,
|
||||||
|
max_string_count: usize,
|
||||||
|
max_number_count: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FacetFieldIdsDelta {
|
impl FacetFieldIdsDelta {
|
||||||
fn register_facet_string_id(&mut self, field_id: FieldId) {
|
pub fn new(max_string_count: usize, max_number_count: usize) -> Self {
|
||||||
self.modified_facet_string_ids.insert(field_id);
|
Self {
|
||||||
|
max_string_count,
|
||||||
|
max_number_count,
|
||||||
|
modified_facet_string_ids: Default::default(),
|
||||||
|
modified_facet_number_ids: Default::default(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn register_facet_number_id(&mut self, field_id: FieldId) {
|
fn register_facet_string_id(&mut self, field_id: FieldId, facet_value: &[u8]) {
|
||||||
self.modified_facet_number_ids.insert(field_id);
|
self.modified_facet_string_ids
|
||||||
|
.entry(field_id)
|
||||||
|
.or_insert(FacetFieldIdDelta::Incremental(Default::default()))
|
||||||
|
.push(facet_value, self.max_string_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn register_facet_number_id(&mut self, field_id: FieldId, facet_value: &[u8]) {
|
||||||
|
self.modified_facet_number_ids
|
||||||
|
.entry(field_id)
|
||||||
|
.or_insert(FacetFieldIdDelta::Incremental(Default::default()))
|
||||||
|
.push(facet_value, self.max_number_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn register_from_key(&mut self, key: &[u8]) {
|
fn register_from_key(&mut self, key: &[u8]) {
|
||||||
let (facet_kind, field_id) = self.extract_key_data(key);
|
let (facet_kind, field_id, facet_value) = self.extract_key_data(key);
|
||||||
match facet_kind {
|
match (facet_kind, facet_value) {
|
||||||
FacetKind::Number => self.register_facet_number_id(field_id),
|
(FacetKind::Number, Some(facet_value)) => {
|
||||||
FacetKind::String => self.register_facet_string_id(field_id),
|
self.register_facet_number_id(field_id, facet_value)
|
||||||
|
}
|
||||||
|
(FacetKind::String, Some(facet_value)) => {
|
||||||
|
self.register_facet_string_id(field_id, facet_value)
|
||||||
|
}
|
||||||
_ => (),
|
_ => (),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_key_data(&self, key: &[u8]) -> (FacetKind, FieldId) {
|
fn extract_key_data<'key>(&self, key: &'key [u8]) -> (FacetKind, FieldId, Option<&'key [u8]>) {
|
||||||
let facet_kind = FacetKind::from(key[0]);
|
let facet_kind = FacetKind::from(key[0]);
|
||||||
let field_id = FieldId::from_be_bytes([key[1], key[2]]);
|
let field_id = FieldId::from_be_bytes([key[1], key[2]]);
|
||||||
(facet_kind, field_id)
|
let facet_value = if key.len() >= 4 {
|
||||||
|
// level is also stored in the key at [3] (always 0)
|
||||||
|
Some(&key[4..])
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
(facet_kind, field_id, facet_value)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn modified_facet_string_ids(&self) -> Option<Vec<FieldId>> {
|
pub fn consume_facet_string_delta(
|
||||||
if self.modified_facet_string_ids.is_empty() {
|
&mut self,
|
||||||
None
|
) -> impl Iterator<Item = (FieldId, FacetFieldIdDelta)> + '_ {
|
||||||
} else {
|
self.modified_facet_string_ids.drain()
|
||||||
Some(self.modified_facet_string_ids.iter().copied().collect())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn modified_facet_number_ids(&self) -> Option<Vec<FieldId>> {
|
pub fn consume_facet_number_delta(
|
||||||
if self.modified_facet_number_ids.is_empty() {
|
&mut self,
|
||||||
None
|
) -> impl Iterator<Item = (FieldId, FacetFieldIdDelta)> + '_ {
|
||||||
} else {
|
self.modified_facet_number_ids.drain()
|
||||||
Some(self.modified_facet_number_ids.iter().copied().collect())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn merge(mut self, rhs: Self) -> Self {
|
pub fn merge(mut self, rhs: Self) -> Self {
|
||||||
let Self { modified_facet_number_ids, modified_facet_string_ids } = rhs;
|
// rhs.max_xx_count is assumed to be equal to self.max_xx_count, and so gets unused
|
||||||
modified_facet_number_ids.into_iter().for_each(|fid| {
|
let Self { modified_facet_number_ids, modified_facet_string_ids, .. } = rhs;
|
||||||
self.modified_facet_number_ids.insert(fid);
|
modified_facet_number_ids.into_iter().for_each(|(fid, mut delta)| {
|
||||||
|
let old_delta = self.modified_facet_number_ids.remove(&fid);
|
||||||
|
delta.merge(old_delta, self.max_number_count);
|
||||||
|
self.modified_facet_number_ids.insert(fid, delta);
|
||||||
});
|
});
|
||||||
modified_facet_string_ids.into_iter().for_each(|fid| {
|
modified_facet_string_ids.into_iter().for_each(|(fid, mut delta)| {
|
||||||
self.modified_facet_string_ids.insert(fid);
|
let old_delta = self.modified_facet_string_ids.remove(&fid);
|
||||||
|
delta.merge(old_delta, self.max_string_count);
|
||||||
|
self.modified_facet_string_ids.insert(fid, delta);
|
||||||
});
|
});
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user