Refactor Settings Indexing process

**Changes:**
The transform structure is now relying on FieldIdMapWithMetadata and AttributePatterns to prepare
the obkv documents during a settings reindexing.
The InnerIndexSettingsDiff and InnerIndexSettings structs are now relying on FieldIdMapWithMetadata, FilterableAttributesRule and AttributePatterns to define the field and the databases that should be reindexed.
The faceted_fields_ids, localized_searchable_fields_ids and localized_faceted_fields_ids have been removed in favor of the FieldIdMapWithMetadata.
We are now relying on the FieldIdMapWithMetadata to retain vectors_fids from the facets and the searchables.

The searchable database computing is now relying on the FieldIdMapWithMetadata to know if a field is searchable and retrieve the locales.

The facet database computing is now relying on the FieldIdMapWithMetadata to compute the facet databases, the facet-search and retrieve the locales.

The facet level database computing is now relying on the FieldIdMapWithMetadata and the facet level database are cleared depending on the settings differences (clear_facet_levels_based_on_settings_diff).

The vector point extraction uses the FieldIdMapWithMetadata instead of FieldsIdsMapWithMetadata.

**Impact:**
- Dump import
- Settings update
This commit is contained in:
ManyTheFish 2025-03-03 10:32:02 +01:00
parent 286d310287
commit 659855c88e
12 changed files with 375 additions and 272 deletions

View file

@ -6,17 +6,20 @@ use std::sync::Arc;
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, Deserr};
use itertools::{EitherOrBoth, Itertools};
use itertools::{merge_join_by, EitherOrBoth, Itertools};
use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;
use super::del_add::DelAddOperation;
use super::del_add::{DelAdd, DelAddOperation};
use super::index_documents::{IndexDocumentsConfig, Transform};
use super::IndexerConfig;
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::attribute_patterns::PatternMatch;
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::criterion::Criterion;
use crate::error::UserError;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::filterable_attributes_rules::match_faceted_field;
use crate::index::{
IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO,
DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
@ -31,7 +34,7 @@ use crate::vector::settings::{
WriteBackToDocuments,
};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldId, FieldsIdsMap, Index, LocalizedAttributesRule, LocalizedFieldIds, Result};
use crate::{FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result};
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum Setting<T> {
@ -155,7 +158,7 @@ pub struct Settings<'a, 't, 'i> {
searchable_fields: Setting<Vec<String>>,
displayed_fields: Setting<Vec<String>>,
filterable_fields: Setting<HashSet<String>>,
filterable_fields: Setting<Vec<FilterableAttributesRule>>,
sortable_fields: Setting<HashSet<String>>,
criteria: Setting<Vec<Criterion>>,
stop_words: Setting<BTreeSet<String>>,
@ -241,8 +244,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.filterable_fields = Setting::Reset;
}
pub fn set_filterable_fields(&mut self, names: HashSet<String>) {
self.filterable_fields = Setting::Set(names);
pub fn set_filterable_fields(&mut self, rules: Vec<FilterableAttributesRule>) {
self.filterable_fields = Setting::Set(rules);
}
pub fn set_sortable_fields(&mut self, names: HashSet<String>) {
@ -516,7 +519,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
}
/// Updates the index's searchable attributes.
fn update_searchable(&mut self) -> Result<bool> {
fn update_user_defined_searchable_attributes(&mut self) -> Result<bool> {
match self.searchable_fields {
Setting::Set(ref fields) => {
// Check to see if the searchable fields changed before doing anything else
@ -529,26 +532,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
return Ok(false);
}
// Since we're updating the settings we can only add new fields at the end of the field id map
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
// fields are deduplicated, only the first occurrence is taken into account
let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>();
// Add all the searchable attributes to the field map, and then add the
// remaining fields from the old field map to the new one
for name in names.iter() {
// The fields ids map won't change the field id of already present elements thus only the
// new fields will be inserted.
fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
}
self.index.put_all_searchable_fields_from_fields_ids_map(
self.wtxn,
&names,
&fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME),
&fields_ids_map,
)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
self.index.put_user_defined_searchable_fields(self.wtxn, &names)?;
Ok(true)
}
Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?),
@ -760,14 +747,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
fn update_filterable(&mut self) -> Result<()> {
match self.filterable_fields {
Setting::Set(ref fields) => {
let mut new_facets = HashSet::new();
for name in fields {
new_facets.insert(name.clone());
}
self.index.put_filterable_fields(self.wtxn, &new_facets)?;
self.index.put_filterable_attributes_rules(self.wtxn, fields)?;
}
Setting::Reset => {
self.index.delete_filterable_fields(self.wtxn)?;
self.index.delete_filterable_attributes_rules(self.wtxn)?;
}
Setting::NotSet => (),
}
@ -1257,7 +1240,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_separator_tokens()?;
self.update_dictionary()?;
self.update_synonyms()?;
self.update_searchable()?;
self.update_user_defined_searchable_attributes()?;
self.update_exact_attributes()?;
self.update_proximity_precision()?;
self.update_prefix_search()?;
@ -1267,7 +1250,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
let embedding_config_updates = self.update_embedding_configs()?;
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
new_inner_settings.recompute_searchables(self.wtxn, self.index)?;
let primary_key_id = self
.index
@ -1319,8 +1302,8 @@ impl InnerIndexSettingsDiff {
settings_update_only: bool,
) -> Self {
let only_additional_fields = match (
&old_settings.user_defined_searchable_fields,
&new_settings.user_defined_searchable_fields,
&old_settings.user_defined_searchable_attributes,
&new_settings.user_defined_searchable_attributes,
) {
(None, None) | (Some(_), None) | (None, Some(_)) => None, // None means *
(Some(old), Some(new)) => {
@ -1342,14 +1325,14 @@ impl InnerIndexSettingsDiff {
|| old_settings.dictionary != new_settings.dictionary
|| old_settings.proximity_precision != new_settings.proximity_precision
|| old_settings.prefix_search != new_settings.prefix_search
|| old_settings.localized_searchable_fields_ids
!= new_settings.localized_searchable_fields_ids
|| old_settings.localized_attributes_rules
!= new_settings.localized_attributes_rules
};
let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
let cache_user_defined_searchables = old_settings.user_defined_searchable_fields
!= new_settings.user_defined_searchable_fields;
let cache_user_defined_searchables = old_settings.user_defined_searchable_attributes
!= new_settings.user_defined_searchable_attributes;
// if the user-defined searchables changed, then we need to reindex prompts.
if cache_user_defined_searchables {
@ -1432,30 +1415,70 @@ impl InnerIndexSettingsDiff {
}
}
/// List the faceted fields from the inner fid map.
/// This is used to list the faceted fields when we are reindexing,
/// but it can't be used in document addition because the field id map must be exhaustive.
pub fn list_faceted_fields_from_fid_map(&self, del_add: DelAdd) -> BTreeSet<FieldId> {
let settings = match del_add {
DelAdd::Deletion => &self.old,
DelAdd::Addition => &self.new,
};
settings
.fields_ids_map
.iter_id_metadata()
.filter(|(_, metadata)| metadata.is_faceted(&settings.filterable_attributes_rules))
.map(|(id, _)| id)
.collect()
}
pub fn facet_fids_changed(&self) -> bool {
let existing_fields = &self.new.existing_fields;
if existing_fields.iter().any(|field| field.contains('.')) {
return true;
for eob in merge_join_by(
self.old.fields_ids_map.iter().filter(|(_, _, metadata)| {
metadata.is_faceted(&self.old.filterable_attributes_rules)
}),
self.new.fields_ids_map.iter().filter(|(_, _, metadata)| {
metadata.is_faceted(&self.new.filterable_attributes_rules)
}),
|(old_fid, _, _), (new_fid, _, _)| old_fid.cmp(new_fid),
) {
match eob {
// If there is a difference, we need to reindex facet databases.
EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => return true,
// If the field is faceted in both old and new settings, we check the facet-searchable and facet level database.
EitherOrBoth::Both((_, _, old_metadata), (_, _, new_metadata)) => {
// Check if the field is facet-searchable in the old and new settings.
// If there is a difference, we need to reindex facet-search database.
let old_filterable_features = old_metadata
.filterable_attributes_features(&self.old.filterable_attributes_rules);
let new_filterable_features = new_metadata
.filterable_attributes_features(&self.new.filterable_attributes_rules);
let is_old_facet_searchable =
old_filterable_features.is_facet_searchable() && self.old.facet_search;
let is_new_facet_searchable =
new_filterable_features.is_facet_searchable() && self.new.facet_search;
if is_old_facet_searchable != is_new_facet_searchable {
return true;
}
// Check if the field needs a facet level database in the old and new settings.
// If there is a difference, we need to reindex facet level databases.
let old_facet_level_database = old_metadata
.require_facet_level_database(&self.old.filterable_attributes_rules);
let new_facet_level_database = new_metadata
.require_facet_level_database(&self.new.filterable_attributes_rules);
if old_facet_level_database != new_facet_level_database {
return true;
}
}
}
}
let old_faceted_fields = &self.old.user_defined_faceted_fields;
if old_faceted_fields.iter().any(|field| field.contains('.')) {
return true;
}
// If there is new faceted fields we indicate that we must reindex as we must
// index new fields as facets. It means that the distinct attribute,
// an Asc/Desc criterion or a filtered attribute as be added or removed.
let new_faceted_fields = &self.new.user_defined_faceted_fields;
if new_faceted_fields.iter().any(|field| field.contains('.')) {
return true;
}
(existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields)
false
}
pub fn global_facet_settings_changed(&self) -> bool {
self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids
self.old.localized_attributes_rules != self.new.localized_attributes_rules
|| self.old.facet_search != self.new.facet_search
}
@ -1475,10 +1498,6 @@ impl InnerIndexSettingsDiff {
self.old.geo_fields_ids != self.new.geo_fields_ids
|| (!self.settings_update_only && self.new.geo_fields_ids.is_some())
}
pub fn modified_faceted_fields(&self) -> HashSet<String> {
&self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields
}
}
#[derive(Clone)]
@ -1486,20 +1505,17 @@ pub(crate) struct InnerIndexSettings {
pub stop_words: Option<fst::Set<Vec<u8>>>,
pub allowed_separators: Option<BTreeSet<String>>,
pub dictionary: Option<BTreeSet<String>>,
pub fields_ids_map: FieldsIdsMap,
pub user_defined_faceted_fields: HashSet<String>,
pub user_defined_searchable_fields: Option<Vec<String>>,
pub faceted_fields_ids: HashSet<FieldId>,
pub searchable_fields_ids: Vec<FieldId>,
pub fields_ids_map: FieldIdMapWithMetadata,
pub localized_attributes_rules: Vec<LocalizedAttributesRule>,
pub filterable_attributes_rules: Vec<FilterableAttributesRule>,
pub asc_desc_fields: HashSet<String>,
pub distinct_field: Option<String>,
pub user_defined_searchable_attributes: Option<Vec<String>>,
pub sortable_fields: HashSet<String>,
pub exact_attributes: HashSet<FieldId>,
pub proximity_precision: ProximityPrecision,
pub embedding_configs: EmbeddingConfigs,
pub existing_fields: HashSet<String>,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
pub non_searchable_fields_ids: Vec<FieldId>,
pub non_faceted_fields_ids: Vec<FieldId>,
pub localized_searchable_fields_ids: LocalizedFieldIds,
pub localized_faceted_fields_ids: LocalizedFieldIds,
pub prefix_search: PrefixSearch,
pub facet_search: bool,
}
@ -1515,12 +1531,6 @@ impl InnerIndexSettings {
let allowed_separators = index.allowed_separators(rtxn)?;
let dictionary = index.dictionary(rtxn)?;
let mut fields_ids_map = index.fields_ids_map(rtxn)?;
let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
let user_defined_searchable_fields =
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
let exact_attributes = index.exact_attributes_ids(rtxn)?;
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
let embedding_configs = match embedding_configs {
@ -1529,87 +1539,57 @@ impl InnerIndexSettings {
};
let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default();
let facet_search = index.facet_search(rtxn)?;
let existing_fields: HashSet<_> = index
.field_distribution(rtxn)?
.into_iter()
.filter_map(|(field, count)| (count != 0).then_some(field))
.collect();
// index.fields_ids_map($a)? ==>> fields_ids_map
let geo_fields_ids = match fields_ids_map.id(RESERVED_GEO_FIELD_NAME) {
Some(gfid) => {
let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid);
let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid);
Some(_) if index.is_geo_enabled(rtxn)? => {
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = fields_ids_map
.insert("_geo.lat")
.zip(fields_ids_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
let field_ids = fields_ids_map
.insert("_geo.lat")
.zip(fields_ids_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
}
None => None,
_ => None,
};
let localized_attributes_rules = index.localized_attributes_rules(rtxn)?;
let localized_searchable_fields_ids = LocalizedFieldIds::new(
&localized_attributes_rules,
&fields_ids_map,
searchable_fields_ids.iter().cloned(),
);
let localized_faceted_fields_ids = LocalizedFieldIds::new(
&localized_attributes_rules,
&fields_ids_map,
faceted_fields_ids.iter().cloned(),
);
let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME);
searchable_fields_ids.retain(|id| !vectors_fids.contains(id));
faceted_fields_ids.retain(|id| !vectors_fids.contains(id));
let localized_attributes_rules =
index.localized_attributes_rules(rtxn)?.unwrap_or_default();
let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?;
let sortable_fields = index.sortable_fields(rtxn)?;
let asc_desc_fields = index.asc_desc_fields(rtxn)?;
let distinct_field = index.distinct_field(rtxn)?.map(|f| f.to_string());
let user_defined_searchable_attributes = index
.user_defined_searchable_fields(rtxn)?
.map(|fields| fields.into_iter().map(|f| f.to_string()).collect());
let builder = MetadataBuilder::from_index(index, rtxn)?;
let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
Ok(Self {
stop_words,
allowed_separators,
dictionary,
fields_ids_map,
user_defined_faceted_fields,
user_defined_searchable_fields,
faceted_fields_ids,
searchable_fields_ids,
localized_attributes_rules,
filterable_attributes_rules,
asc_desc_fields,
distinct_field,
user_defined_searchable_attributes,
sortable_fields,
exact_attributes,
proximity_precision,
embedding_configs,
existing_fields,
geo_fields_ids,
non_searchable_fields_ids: vectors_fids.clone(),
non_faceted_fields_ids: vectors_fids.clone(),
localized_searchable_fields_ids,
localized_faceted_fields_ids,
prefix_search,
facet_search,
})
}
// find and insert the new field ids
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn<'_>, index: &Index) -> Result<()> {
let new_facets = self
.fields_ids_map
.iter()
.filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid))
.filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields))
.map(|(_fid, field)| field.to_string())
.collect();
index.put_faceted_fields(wtxn, &new_facets)?;
self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?;
let localized_attributes_rules = index.localized_attributes_rules(wtxn)?;
self.localized_faceted_fields_ids = LocalizedFieldIds::new(
&localized_attributes_rules,
&self.fields_ids_map,
self.faceted_fields_ids.iter().cloned(),
);
Ok(())
pub fn match_faceted_field(&self, field: &str) -> PatternMatch {
match_faceted_field(
field,
&self.filterable_attributes_rules,
&self.sortable_fields,
&self.asc_desc_fields,
&self.distinct_field,
)
}
// find and insert the new field ids
@ -1619,7 +1599,7 @@ impl InnerIndexSettings {
index: &Index,
) -> Result<()> {
let searchable_fields = self
.user_defined_searchable_fields
.user_defined_searchable_attributes
.as_ref()
.map(|searchable| searchable.iter().map(|s| s.as_str()).collect::<Vec<_>>());
@ -1628,17 +1608,9 @@ impl InnerIndexSettings {
index.put_all_searchable_fields_from_fields_ids_map(
wtxn,
&searchable_fields,
&self.non_searchable_fields_ids,
&self.fields_ids_map,
)?;
}
self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
let localized_attributes_rules = index.localized_attributes_rules(wtxn)?;
self.localized_searchable_fields_ids = LocalizedFieldIds::new(
&localized_attributes_rules,
&self.fields_ids_map,
self.searchable_fields_ids.iter().cloned(),
);
Ok(())
}