Refactor Document indexing process (Facets)

**Changes:**
The Documents changes now take a selector closure instead of a list of fields to match the fields to extract.
The seek_leaf_values_in_object function now uses a selector closure instead of a list of fields to match the fields to extract.
The facet database extraction is now relying on the FilterableAttributesRule to match the field to extract.
The facet-search database extraction is now relying on the FieldIdMapWithMetadata to select the field to index.
The facet level database extraction is now relying on the FieldIdMapWithMetadata to select the field to index.

**Important:**
Because the filterable attributes are now patterns,
the fieldIdMap will only register the fields that exist in at least one document.
If a field doesn't exist in any document, it will not be registered, even if it has been specified in the filterable fields.

**Impact:**
- Document Addition/modification facet indexing
- Document deletion facet indexing
This commit is contained in:
ManyTheFish 2025-03-03 10:30:42 +01:00
parent 659855c88e
commit 95bccaf5f5
8 changed files with 233 additions and 179 deletions

View file

@ -9,12 +9,14 @@ use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn};
use super::fst_merger_builder::FstMergerBuilder;
use super::KvReaderDelAdd;
use crate::attribute_patterns::PatternMatch;
use crate::heed_codec::facet::FacetGroupKey;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::{create_sorter, MergeDeladdBtreesetString};
use crate::{
BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result,
MAX_FACET_VALUE_LENGTH,
BEU16StrCodec, FieldId, FieldIdMapMissingEntry, FilterableAttributesFeatures,
FilterableAttributesRule, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule,
Result, MAX_FACET_VALUE_LENGTH,
};
pub struct FacetSearchBuilder<'indexer> {
@ -22,6 +24,7 @@ pub struct FacetSearchBuilder<'indexer> {
normalized_facet_string_docids_sorter: Sorter<MergeDeladdBtreesetString>,
global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
localized_attributes_rules: Vec<LocalizedAttributesRule>,
filterable_attributes_rules: Vec<FilterableAttributesRule>,
// Buffered data below
buffer: Vec<u8>,
localized_field_ids: HashMap<FieldId, Option<Vec<Language>>>,
@ -31,6 +34,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
pub fn new(
global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
localized_attributes_rules: Vec<LocalizedAttributesRule>,
filterable_attributes_rules: Vec<FilterableAttributesRule>,
) -> Self {
let registered_facets = HashMap::new();
let normalized_facet_string_docids_sorter = create_sorter(
@ -49,6 +53,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
buffer: Vec::new(),
global_fields_ids_map,
localized_attributes_rules,
filterable_attributes_rules,
localized_field_ids: HashMap::new(),
}
}
@ -60,6 +65,13 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
) -> Result<()> {
let FacetGroupKey { field_id, level: _level, left_bound } = facet_key;
let filterable_attributes_features = self.filterable_attributes_features(field_id)?;
// if facet search is disabled, we don't need to register the facet
if !filterable_attributes_features.is_facet_searchable() {
return Ok(());
};
if deladd == DelAdd::Addition {
self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1);
}
@ -83,6 +95,24 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
Ok(())
}
/// Resolves the filterable-attribute features that apply to `field_id`.
///
/// The field's metadata is looked up in the global fields ids map and matched
/// against the builder's filterable attributes rules.
///
/// # Errors
///
/// Returns `InternalError::FieldIdMapMissingEntry` when the global fields ids
/// map holds no metadata for `field_id`.
fn filterable_attributes_features(
    &mut self,
    field_id: u16,
) -> Result<FilterableAttributesFeatures> {
    match self.global_fields_ids_map.metadata(field_id) {
        Some(metadata) => {
            Ok(metadata.filterable_attributes_features(&self.filterable_attributes_rules))
        }
        // No metadata for this id: report it as an internal inconsistency.
        None => Err(InternalError::FieldIdMapMissingEntry(FieldIdMapMissingEntry::FieldId {
            field_id,
            process: "facet_search_builder::register_from_key",
        })
        .into()),
    }
}
fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> {
if let Entry::Vacant(e) = self.localized_field_ids.entry(field_id) {
let Some(field_name) = self.global_fields_ids_map.name(field_id) else {
@ -92,7 +122,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
let locales = self
.localized_attributes_rules
.iter()
.find(|rule| rule.match_str(field_name))
.find(|rule| rule.match_str(field_name) == PatternMatch::Match)
.map(|rule| rule.locales.clone());
e.insert(locales);