Refactor Settings Indexing process

**Changes:**
The transform structure now relies on FieldIdMapWithMetadata and AttributePatterns to prepare
the obkv documents during a settings reindexing.
The InnerIndexSettingsDiff and InnerIndexSettings structs now rely on FieldIdMapWithMetadata, FilterableAttributesRule and AttributePatterns to define the fields and the databases that should be reindexed.
The faceted_fields_ids, localized_searchable_fields_ids and localized_faceted_fields_ids have been removed in favor of the FieldIdMapWithMetadata.
We now rely on the FieldIdMapWithMetadata to retain vectors_fids from the facets and the searchables.

The searchable database computation now relies on the FieldIdMapWithMetadata to know whether a field is searchable and to retrieve its locales.

The facet database computation now relies on the FieldIdMapWithMetadata to compute the facet databases and the facet-search, and to retrieve the locales.

The facet level database computation now relies on the FieldIdMapWithMetadata, and the facet level databases are cleared depending on the settings differences (clear_facet_levels_based_on_settings_diff).

The vector point extraction uses the FieldIdMapWithMetadata instead of FieldsIdsMapWithMetadata.

**Impact:**
- Dump import
- Settings update
This commit is contained in:
ManyTheFish 2025-03-03 10:32:02 +01:00
parent 286d310287
commit 659855c88e
12 changed files with 375 additions and 272 deletions

View file

@ -6,7 +6,7 @@ use heed::types::Bytes;
use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn};
use roaring::RoaringBitmap;
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use super::{clear_facet_levels, FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::facet::FacetType;
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
@ -97,9 +97,7 @@ pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
pub fn update(mut self, wtxn: &mut RwTxn<'_>, field_ids: &[u16]) -> Result<()> {
self.update_level0(wtxn)?;
for &field_id in field_ids.iter() {
self.clear_levels(wtxn, field_id)?;
}
clear_facet_levels(wtxn, &self.db.remap_data_type(), field_ids)?;
for &field_id in field_ids.iter() {
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
@ -114,14 +112,6 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
Ok(())
}
/// Deletes, for `field_id`, every entry of `self.db` whose key lies in the
/// inclusive range from level 1 to level `u8::MAX` (both with an empty left bound).
///
/// NOTE(review): this presumably leaves level 0 untouched because keys sort by
/// field id, then level, then left bound — confirm against the key codec.
///
/// # Errors
///
/// Propagates any error returned by the range deletion.
fn clear_levels(&self, wtxn: &mut heed::RwTxn<'_>, field_id: FieldId) -> Result<()> {
    let lowest = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] };
    let highest = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] };
    // The number of deleted entries is not needed by the caller.
    self.db.delete_range(wtxn, &(lowest..=highest))?;
    Ok(())
}
fn update_level0(&mut self, wtxn: &mut RwTxn<'_>) -> Result<()> {
let delta_data = match self.delta_data.take() {
Some(x) => x,
@ -365,8 +355,6 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
mod tests {
use std::iter::once;
use big_s::S;
use maplit::hashset;
use roaring::RoaringBitmap;
use crate::documents::mmap_from_objects;
@ -374,7 +362,7 @@ mod tests {
use crate::heed_codec::StrRefCodec;
use crate::index::tests::TempIndex;
use crate::update::facet::test_helpers::{ordered_string, FacetIndex};
use crate::{db_snap, milli_snap};
use crate::{db_snap, milli_snap, FilterableAttributesRule};
#[test]
fn insert() {
@ -474,7 +462,8 @@ mod tests {
index
.update_settings(|settings| {
settings.set_primary_key("id".to_owned());
settings.set_filterable_fields(hashset! { S("id") });
settings
.set_filterable_fields(vec![FilterableAttributesRule::Field("id".to_string())]);
})
.unwrap();

View file

@ -89,6 +89,7 @@ use time::OffsetDateTime;
use tracing::debug;
use self::incremental::FacetsUpdateIncremental;
use super::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
use crate::facet::FacetType;
use crate::heed_codec::facet::{
@ -147,7 +148,11 @@ impl<'i> FacetsUpdate<'i> {
}
}
pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> {
pub fn execute(
self,
wtxn: &mut heed::RwTxn<'_>,
new_settings: &InnerIndexSettings,
) -> Result<()> {
if self.data_size == 0 {
return Ok(());
}
@ -156,8 +161,7 @@ impl<'i> FacetsUpdate<'i> {
// See self::comparison_bench::benchmark_facet_indexing
if self.data_size >= (self.database.len(wtxn)? / 500) {
let field_ids =
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
let field_ids = facet_levels_field_ids(new_settings);
let bulk_update = FacetsUpdateBulk::new(
self.index,
field_ids,
@ -291,6 +295,53 @@ fn index_facet_search(
Ok(())
}
/// Clear all the levels greater than 0 for the given field ids.
///
/// For each field id, the entries of `db` whose key lies in the inclusive range
/// from level 1 to level `u8::MAX` (both with an empty left bound) are deleted.
///
/// NOTE(review): level 0 is presumably preserved because keys sort by field id,
/// then level, then left bound — confirm against `FacetGroupKeyCodec`.
///
/// # Errors
///
/// Stops at the first field id whose range deletion fails and returns that error.
pub fn clear_facet_levels<'a, I>(
    wtxn: &mut heed::RwTxn<'_>,
    db: &heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DecodeIgnore>,
    field_ids: I,
) -> Result<()>
where
    I: IntoIterator<Item = &'a FieldId>,
{
    for &field_id in field_ids {
        let lowest = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] };
        let highest = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] };
        // The count of removed entries is deliberately discarded.
        db.delete_range(wtxn, &(lowest..=highest))?;
    }

    Ok(())
}
/// Clears the facet levels of the fields that required a facet level database
/// under the old settings but no longer require one under the new settings.
///
/// The stale field ids are computed as `old - new` from `settings_diff`, then
/// their levels greater than 0 are removed from both the string and the f64
/// facet databases of `index`.
///
/// # Errors
///
/// Returns the first error raised while clearing either database.
pub fn clear_facet_levels_based_on_settings_diff(
    wtxn: &mut heed::RwTxn<'_>,
    index: &Index,
    settings_diff: &InnerIndexSettingsDiff,
) -> Result<()> {
    let previously_leveled: BTreeSet<_> = facet_levels_field_ids(&settings_diff.old);
    let still_leveled: BTreeSet<_> = facet_levels_field_ids(&settings_diff.new);

    // Only fields that lost their facet levels need to be cleaned up.
    let stale_field_ids: Vec<_> =
        previously_leveled.difference(&still_leveled).copied().collect();

    clear_facet_levels(wtxn, &index.facet_id_string_docids.remap_types(), &stale_field_ids)?;
    clear_facet_levels(wtxn, &index.facet_id_f64_docids.remap_types(), &stale_field_ids)?;

    Ok(())
}
/// Collects the ids of the fields that require a facet level database.
///
/// Walks every `(id, metadata)` pair of `settings.fields_ids_map` and keeps the
/// ids for which `metadata.require_facet_level_database` returns `true` given
/// the filterable attributes rules of `settings`. The output collection type is
/// chosen by the caller through `B`.
fn facet_levels_field_ids<B>(settings: &InnerIndexSettings) -> B
where
    B: FromIterator<FieldId>,
{
    let rules = &settings.filterable_attributes_rules;

    settings
        .fields_ids_map
        .iter_id_metadata()
        .filter_map(|(id, metadata)| metadata.require_facet_level_database(rules).then_some(id))
        .collect()
}
#[cfg(test)]
pub(crate) mod test_helpers {
use std::cell::Cell;