mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-08 20:44:30 +01:00
Add sanity checks for facet values
This commit is contained in:
parent
5908aec6cb
commit
6b3a2c7281
@ -79,17 +79,23 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
|||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
|
use std::ops::Bound;
|
||||||
|
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
use heed::types::{Bytes, DecodeIgnore};
|
use heed::types::{Bytes, DecodeIgnore};
|
||||||
|
use heed::BytesDecode as _;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use self::incremental::FacetsUpdateIncremental;
|
use self::incremental::FacetsUpdateIncremental;
|
||||||
use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
|
use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
use crate::heed_codec::facet::{
|
||||||
|
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec,
|
||||||
|
};
|
||||||
use crate::heed_codec::BytesRefCodec;
|
use crate::heed_codec::BytesRefCodec;
|
||||||
|
use crate::search::facet::get_highest_level;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||||
use crate::{try_split_array_at, FieldId, Index, Result};
|
use crate::{try_split_array_at, FieldId, Index, Result};
|
||||||
|
|
||||||
@ -646,3 +652,194 @@ mod comparison_bench {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Run sanity checks on the specified fid tree
|
||||||
|
///
|
||||||
|
/// 1. No "orphan" child value, any child value has a parent
|
||||||
|
/// 2. Any docid in the child appears in the parent
|
||||||
|
/// 3. No docid in the parent is missing from all its children
|
||||||
|
/// 4. no group is bigger than max_group_size
|
||||||
|
/// 5. Less than 50% of groups are bigger than group_size
|
||||||
|
/// 6. group size matches the number of children
|
||||||
|
/// 7. max_level is < 255
|
||||||
|
pub(crate) fn sanity_checks(
|
||||||
|
index: &Index,
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
field_id: FieldId,
|
||||||
|
facet_type: FacetType,
|
||||||
|
group_size: usize,
|
||||||
|
_min_level_size: usize, // might add a check on level size later
|
||||||
|
max_group_size: usize,
|
||||||
|
) -> Result<()> {
|
||||||
|
tracing::info!(%field_id, ?facet_type, "performing sanity checks");
|
||||||
|
let database = match facet_type {
|
||||||
|
FacetType::String => {
|
||||||
|
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||||
|
}
|
||||||
|
FacetType::Number => {
|
||||||
|
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let leaf_prefix: FacetGroupKey<&[u8]> = FacetGroupKey { field_id, level: 0, left_bound: &[] };
|
||||||
|
|
||||||
|
let leaf_it = database.prefix_iter(rtxn, &leaf_prefix)?;
|
||||||
|
|
||||||
|
let max_level = get_highest_level(rtxn, database, field_id)?;
|
||||||
|
if max_level == u8::MAX {
|
||||||
|
panic!("max_level == 255");
|
||||||
|
}
|
||||||
|
|
||||||
|
for leaf in leaf_it {
|
||||||
|
let (leaf_facet_value, leaf_docids) = leaf?;
|
||||||
|
let mut current_level = 0;
|
||||||
|
|
||||||
|
let mut current_parent_facet_value: Option<FacetGroupKey<&[u8]>> = None;
|
||||||
|
let mut current_parent_docids: Option<crate::heed_codec::facet::FacetGroupValue> = None;
|
||||||
|
loop {
|
||||||
|
current_level += 1;
|
||||||
|
if current_level >= max_level {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let parent_key_right_bound = FacetGroupKey {
|
||||||
|
field_id,
|
||||||
|
level: current_level,
|
||||||
|
left_bound: leaf_facet_value.left_bound,
|
||||||
|
};
|
||||||
|
let (parent_facet_value, parent_docids) = database
|
||||||
|
.get_lower_than_or_equal_to(rtxn, &parent_key_right_bound)?
|
||||||
|
.expect("no parent found");
|
||||||
|
if parent_facet_value.level != current_level {
|
||||||
|
panic!(
|
||||||
|
"wrong parent level, found_level={}, expected_level={}",
|
||||||
|
parent_facet_value.level, current_level
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if parent_facet_value.field_id != field_id {
|
||||||
|
panic!("wrong parent fid");
|
||||||
|
}
|
||||||
|
if parent_facet_value.left_bound > leaf_facet_value.left_bound {
|
||||||
|
panic!("wrong parent left bound");
|
||||||
|
}
|
||||||
|
|
||||||
|
if !leaf_docids.bitmap.is_subset(&parent_docids.bitmap) {
|
||||||
|
panic!(
|
||||||
|
"missing docids from leaf in parent, current_level={}, parent={}, child={}, missing={missing:?}, child_len={}, child={:?}",
|
||||||
|
current_level,
|
||||||
|
facet_to_string(parent_facet_value.left_bound, facet_type),
|
||||||
|
facet_to_string(leaf_facet_value.left_bound, facet_type),
|
||||||
|
leaf_docids.bitmap.len(),
|
||||||
|
leaf_docids.bitmap.clone(),
|
||||||
|
missing=leaf_docids.bitmap - parent_docids.bitmap,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(current_parent_facet_value) = current_parent_facet_value {
|
||||||
|
if current_parent_facet_value.field_id != parent_facet_value.field_id {
|
||||||
|
panic!("wrong parent parent fid");
|
||||||
|
}
|
||||||
|
if current_parent_facet_value.level + 1 != parent_facet_value.level {
|
||||||
|
panic!("wrong parent parent level");
|
||||||
|
}
|
||||||
|
if current_parent_facet_value.left_bound < parent_facet_value.left_bound {
|
||||||
|
panic!("wrong parent parent left bound");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(current_parent_docids) = current_parent_docids {
|
||||||
|
if !current_parent_docids.bitmap.is_subset(&parent_docids.bitmap) {
|
||||||
|
panic!("missing docids from intermediate node in parent, parent_level={}, parent={}, intermediate={}, missing={missing:?}, intermediate={:?}",
|
||||||
|
parent_facet_value.level,
|
||||||
|
facet_to_string(parent_facet_value.left_bound, facet_type),
|
||||||
|
facet_to_string(current_parent_facet_value.unwrap().left_bound, facet_type),
|
||||||
|
current_parent_docids.bitmap.clone(),
|
||||||
|
missing=current_parent_docids.bitmap - parent_docids.bitmap,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
current_parent_facet_value = Some(parent_facet_value);
|
||||||
|
current_parent_docids = Some(parent_docids);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tracing::info!(%field_id, ?facet_type, "checked all leaves");
|
||||||
|
|
||||||
|
let mut current_level = max_level;
|
||||||
|
let mut greater_than_group = 0usize;
|
||||||
|
let mut total = 0usize;
|
||||||
|
loop {
|
||||||
|
if current_level == 0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let child_level = current_level - 1;
|
||||||
|
tracing::info!(%field_id, ?facet_type, %current_level, "checked groups for level");
|
||||||
|
let level_groups_prefix: FacetGroupKey<&[u8]> =
|
||||||
|
FacetGroupKey { field_id, level: current_level, left_bound: &[] };
|
||||||
|
let mut level_groups_it = database.prefix_iter(rtxn, &level_groups_prefix)?.peekable();
|
||||||
|
|
||||||
|
'group_it: loop {
|
||||||
|
let Some(group) = level_groups_it.next() else { break 'group_it };
|
||||||
|
|
||||||
|
let (group_facet_value, group_docids) = group?;
|
||||||
|
let child_left_bound = group_facet_value.left_bound.to_owned();
|
||||||
|
let mut expected_docids = RoaringBitmap::new();
|
||||||
|
let mut expected_size = 0usize;
|
||||||
|
let right_bound = level_groups_it
|
||||||
|
.peek()
|
||||||
|
.and_then(|res| res.as_ref().ok())
|
||||||
|
.map(|(key, _)| key.left_bound);
|
||||||
|
let child_left_bound = FacetGroupKey {
|
||||||
|
field_id,
|
||||||
|
level: child_level,
|
||||||
|
left_bound: child_left_bound.as_slice(),
|
||||||
|
};
|
||||||
|
let child_left_bound = Bound::Included(&child_left_bound);
|
||||||
|
let child_right_bound;
|
||||||
|
let child_right_bound = if let Some(right_bound) = right_bound {
|
||||||
|
child_right_bound =
|
||||||
|
FacetGroupKey { field_id, level: child_level, left_bound: right_bound };
|
||||||
|
Bound::Excluded(&child_right_bound)
|
||||||
|
} else {
|
||||||
|
Bound::Unbounded
|
||||||
|
};
|
||||||
|
let children = database.range(rtxn, &(child_left_bound, child_right_bound))?;
|
||||||
|
for child in children {
|
||||||
|
let (child_facet_value, child_docids) = child?;
|
||||||
|
if child_facet_value.field_id != field_id {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if child_facet_value.level != child_level {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
expected_size += 1;
|
||||||
|
expected_docids |= &child_docids.bitmap;
|
||||||
|
}
|
||||||
|
assert_eq!(expected_size, group_docids.size as usize);
|
||||||
|
assert!(expected_size <= max_group_size);
|
||||||
|
assert_eq!(expected_docids, group_docids.bitmap);
|
||||||
|
total += 1;
|
||||||
|
if expected_size > group_size {
|
||||||
|
greater_than_group += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
current_level -= 1;
|
||||||
|
}
|
||||||
|
if greater_than_group * 2 > total {
|
||||||
|
panic!("too many groups have a size > group_size");
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!("sanity checks OK");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn facet_to_string(facet_value: &[u8], facet_type: FacetType) -> String {
|
||||||
|
match facet_type {
|
||||||
|
FacetType::String => bstr::BStr::new(facet_value).to_string(),
|
||||||
|
FacetType::Number => match OrderedF64Codec::bytes_decode(facet_value) {
|
||||||
|
Ok(value) => value.to_string(),
|
||||||
|
Err(e) => format!("error: {e} (bytes: {facet_value:?}"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user