diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 31cd3028e..e494e6b9f 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -226,8 +226,8 @@ pub(crate) mod test { use meilisearch_types::features::RuntimeTogglableFeatures; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::{Action, Key}; - use meilisearch_types::milli; use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::{self, FilterableAttributesRule}; use meilisearch_types::settings::{Checked, FacetingSettings, Settings}; use meilisearch_types::tasks::{Details, Status}; use serde_json::{json, Map, Value}; @@ -271,7 +271,10 @@ pub(crate) mod test { let settings = Settings { displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(), searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(), - filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }), + filterable_attributes: Setting::Set(vec![ + FilterableAttributesRule::Field(S("race")), + FilterableAttributesRule::Field(S("age")), + ]), sortable_attributes: Setting::Set(btreeset! { S("age") }), ranking_rules: Setting::NotSet, stop_words: Setting::NotSet, diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index 6b2655bdf..99eeead56 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -318,7 +318,16 @@ impl From> for v6::Settings { v6::Settings { displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(), searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(), - filterable_attributes: settings.filterable_attributes.into(), + filterable_attributes: match settings.filterable_attributes { + v5::settings::Setting::Set(filterable_attributes) => v6::Setting::Set( + filterable_attributes + .into_iter() + .map(|attr| v6::FilterableAttributesRule::Field(attr)) + .collect(), + ), + v5::settings::Setting::Reset => v6::Setting::Reset, + v5::settings::Setting::NotSet => v6::Setting::NotSet, + }, sortable_attributes: settings.sortable_attributes.into(), ranking_rules: { match settings.ranking_rules { diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 50b9751a2..7fc05884f 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -43,6 +43,8 @@ pub type ResponseError = meilisearch_types::error::ResponseError; pub type Code = meilisearch_types::error::Code; pub type RankingRuleView = meilisearch_types::settings::RankingRuleView; +pub type FilterableAttributesRule = meilisearch_types::milli::FilterableAttributesRule; + pub struct V6Reader { dump: TempDir, instance_uid: Option, diff --git a/crates/milli/src/attribute_patterns.rs b/crates/milli/src/attribute_patterns.rs index 717db3042..26aea8ace 100644 --- a/crates/milli/src/attribute_patterns.rs +++ b/crates/milli/src/attribute_patterns.rs @@ -74,6 +74,21 @@ pub fn match_field_legacy(pattern: &str, field: &str) -> PatternMatch { } } +/// Match a field against a distinct field. +pub fn match_distinct_field(distinct_field: Option<&str>, field: &str) -> PatternMatch { + if let Some(distinct_field) = distinct_field { + if field == distinct_field { + // If the field matches exactly the distinct field, return Match + return PatternMatch::Match; + } else if is_faceted_by(distinct_field, field) { + // If the field is a parent field of the distinct field, return Parent + return PatternMatch::Parent; + } + } + // If the field does not match the distinct field and is not a parent of a nested field that matches the distinct field, return NoMatch + PatternMatch::NoMatch +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum PatternMatch { /// The field is a parent of the of a nested field that matches the pattern diff --git a/crates/milli/src/fields_ids_map/metadata.rs b/crates/milli/src/fields_ids_map/metadata.rs index 2138e8ad3..ee65dc8f2 100644 --- a/crates/milli/src/fields_ids_map/metadata.rs +++ b/crates/milli/src/fields_ids_map/metadata.rs @@ -6,6 +6,7 @@ use heed::RoTxn; use super::FieldsIdsMap; use crate::attribute_patterns::PatternMatch; +use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::{ is_faceted_by, FieldId, FilterableAttributesFeatures, FilterableAttributesRule, Index, LocalizedAttributesRule, Result, @@ -13,8 +14,11 @@ use crate::{ #[derive(Debug, Clone, Copy)] pub struct Metadata { - pub searchable: bool, + pub searchable: Option, pub sortable: bool, + pub distinct: bool, + pub asc_desc: bool, + pub geo: bool, localized_attributes_rule_id: Option, filterable_attributes_rule_id: Option, } @@ -136,21 +140,33 @@ impl Metadata { } pub fn is_searchable(&self) -> bool { + self.searchable.is_some() + } + + pub fn searchable_weight(&self) -> Option { self.searchable } - /// Returns `true` if the field is part of the facet databases. (sortable, filterable, or facet searchable) + pub fn is_distinct(&self) -> bool { + self.distinct + } + + pub fn is_asc_desc(&self) -> bool { + self.asc_desc + } + + pub fn is_geo(&self) -> bool { + self.geo + } + + /// Returns `true` if the field is part of the facet databases. (sortable, distinct, asc_desc, filterable or facet searchable) pub fn is_faceted(&self, rules: &[FilterableAttributesRule]) -> bool { - if self.is_sortable() { + if self.is_distinct() || self.is_sortable() || self.is_asc_desc() { return true; } let features = self.filterable_attributes_features(&rules); - if features.is_filterable() { - return true; - } - - if features.is_facet_searchable() { + if features.is_filterable() || features.is_facet_searchable() { return true; } @@ -164,6 +180,8 @@ pub struct MetadataBuilder { filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, + distinct_attribute: Option, + asc_desc_attributes: HashSet, } impl MetadataBuilder { @@ -176,45 +194,98 @@ impl MetadataBuilder { let filterable_attributes = index.filterable_attributes_rules(rtxn)?; let sortable_attributes = index.sortable_fields(rtxn)?; let localized_attributes = index.localized_attributes_rules(rtxn)?; + let distinct_attribute = index.distinct_field(rtxn)?.map(|s| s.to_string()); + let asc_desc_attributes = index.asc_desc_fields(rtxn)?; Ok(Self { searchable_attributes, filterable_attributes, sortable_attributes, localized_attributes, + distinct_attribute, + asc_desc_attributes, }) } - // pub fn new( - // searchable_attributes: Option>, - // filterable_attributes: Vec, - // sortable_attributes: HashSet, - // localized_attributes: Option>, - // ) -> Self { - // let searchable_attributes = match searchable_attributes { - // Some(fields) if fields.iter().any(|f| f == "*") => None, - // None => None, - // Some(fields) => Some(fields), - // }; + #[cfg(test)] + /// Build a new `MetadataBuilder` from the given parameters. + /// + /// This is used for testing, prefer using `MetadataBuilder::from_index` instead. + pub fn new( + searchable_attributes: Option>, + filterable_attributes: Vec, + sortable_attributes: HashSet, + localized_attributes: Option>, + distinct_attribute: Option, + asc_desc_attributes: HashSet, + ) -> Self { + let searchable_attributes = match searchable_attributes { + Some(fields) if fields.iter().any(|f| f == "*") => None, + None => None, + Some(fields) => Some(fields), + }; - // Self { - // searchable_attributes, - // filterable_attributes, - // sortable_attributes, - // localized_attributes, - // } - // } + Self { + searchable_attributes, + filterable_attributes, + sortable_attributes, + localized_attributes, + distinct_attribute, + asc_desc_attributes, + } + } pub fn metadata_for_field(&self, field: &str) -> Metadata { - let searchable = match &self.searchable_attributes { - // A field is searchable if it is faceted by a searchable attribute - Some(attributes) => attributes.iter().any(|pattern| is_faceted_by(field, pattern)), - None => true, - }; + if is_faceted_by(field, RESERVED_VECTORS_FIELD_NAME) { + // Vectors fields are not searchable, filterable, distinct or asc_desc + return Metadata { + searchable: None, + sortable: false, + distinct: false, + asc_desc: false, + geo: false, + localized_attributes_rule_id: None, + filterable_attributes_rule_id: None, + }; + } // A field is sortable if it is faceted by a sortable attribute let sortable = self.sortable_attributes.iter().any(|pattern| is_faceted_by(field, pattern)); + let filterable_attributes_rule_id = self + .filterable_attributes + .iter() + .position(|attribute| attribute.match_str(field) == PatternMatch::Match) + // saturating_add(1): make `id` `NonZero` + .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); + + if is_faceted_by(field, RESERVED_GEO_FIELD_NAME) { + // Geo fields are not searchable, distinct or asc_desc + return Metadata { + searchable: None, + sortable, + distinct: false, + asc_desc: false, + geo: true, + localized_attributes_rule_id: None, + filterable_attributes_rule_id, + }; + } + + let searchable = match &self.searchable_attributes { + // A field is searchable if it is faceted by a searchable attribute + Some(attributes) => attributes + .iter() + .enumerate() + .find(|(_i, pattern)| is_faceted_by(field, pattern)) + .map(|(i, _)| i as u16), + None => None, + }; + + let distinct = + self.distinct_attribute.as_ref().is_some_and(|distinct_field| field == distinct_field); + let asc_desc = self.asc_desc_attributes.contains(field); + let localized_attributes_rule_id = self .localized_attributes .iter() @@ -223,16 +294,12 @@ impl MetadataBuilder { // saturating_add(1): make `id` `NonZero` .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - let filterable_attributes_rule_id = self - .filterable_attributes - .iter() - .position(|attribute| attribute.match_str(field) == PatternMatch::Match) - // saturating_add(1): make `id` `NonZero` - .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - Metadata { searchable, sortable, + distinct, + asc_desc, + geo: false, localized_attributes_rule_id, filterable_attributes_rule_id, } diff --git a/crates/milli/src/filterable_attributes_rules.rs b/crates/milli/src/filterable_attributes_rules.rs index f5291b5b8..42c8de74a 100644 --- a/crates/milli/src/filterable_attributes_rules.rs +++ b/crates/milli/src/filterable_attributes_rules.rs @@ -4,7 +4,7 @@ use std::collections::{BTreeSet, HashSet}; use utoipa::ToSchema; use crate::{ - attribute_patterns::{match_field_legacy, PatternMatch}, + attribute_patterns::{match_distinct_field, match_field_legacy, PatternMatch}, constants::RESERVED_GEO_FIELD_NAME, AttributePatterns, FieldId, FieldsIdsMap, }; @@ -210,3 +210,39 @@ pub fn match_pattern_by_features( selection } + +/// Match a field against a set of filterable, facet searchable fields, distinct field, sortable fields, and asc_desc fields. +pub fn match_faceted_field( + field_name: &str, + filterable_fields: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, +) -> PatternMatch { + // Check if the field matches any filterable or facet searchable field + let mut selection = match_pattern_by_features(field_name, &filterable_fields, &|features| { + features.is_facet_searchable() || features.is_filterable() + }); + + // If the field matches the pattern, return Match + if selection == PatternMatch::Match { + return selection; + } + + match match_distinct_field(distinct_field.as_deref(), field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), + } + + // Otherwise, check if the field matches any sortable/asc_desc field + for pattern in sortable_fields.iter().chain(asc_desc_fields.iter()) { + match match_field_legacy(&pattern, field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), + } + } + + selection +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 52021d919..cb9d3bcb8 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1,6 +1,5 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; -use std::convert::TryInto; use std::fs::File; use std::path::Path; @@ -14,6 +13,7 @@ use crate::attribute_patterns::PatternMatch; use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::fields_ids_map::FieldsIdsMap; use crate::filterable_attributes_rules::match_pattern_by_features; use crate::heed_codec::facet::{ @@ -648,8 +648,7 @@ impl Index { &self, wtxn: &mut RwTxn<'_>, user_fields: &[&str], - non_searchable_fields_ids: &[FieldId], - fields_ids_map: &FieldsIdsMap, + fields_ids_map: &FieldIdMapWithMetadata, ) -> Result<()> { // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; @@ -657,29 +656,17 @@ impl Index { let mut weights = FieldidsWeightsMap::default(); // Now we generate the real searchable fields: - // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. - // 2. Iterate over the user defined searchable fields. - // 3. If a user defined field is a subset of a field defined in the fields_ids_map - // (ie doggo.name is a subset of doggo) right after doggo and with the same weight. let mut real_fields = Vec::new(); - - for (id, field_from_map) in fields_ids_map.iter() { - for (weight, user_field) in user_fields.iter().enumerate() { - if crate::is_faceted_by(field_from_map, user_field) - && !real_fields.contains(&field_from_map) - && !non_searchable_fields_ids.contains(&id) - { - real_fields.push(field_from_map); - - let weight: u16 = - weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; - weights.insert(id, weight); - } + for (id, field_from_map, metadata) in fields_ids_map.iter() { + if let Some(weight) = metadata.searchable_weight() { + real_fields.push(field_from_map); + weights.insert(id, weight); } } self.put_searchable_fields(wtxn, &real_fields)?; self.put_fieldids_weights_map(wtxn, &weights)?; + Ok(()) } @@ -978,6 +965,19 @@ impl Index { // Ok(faceted_fields) // } + pub fn asc_desc_fields(&self, rtxn: &RoTxn<'_>) -> Result> { + let asc_desc_fields = self + .criteria(rtxn)? + .into_iter() + .filter_map(|criterion| match criterion { + Criterion::Asc(field) | Criterion::Desc(field) => Some(field), + _otherwise => None, + }) + .collect(); + + Ok(asc_desc_fields) + } + /* faceted documents ids */ /// Retrieve all the documents which contain this field id set as null diff --git a/crates/milli/src/prompt/fields.rs b/crates/milli/src/prompt/fields.rs index ab15c31b0..831f17b0d 100644 --- a/crates/milli/src/prompt/fields.rs +++ b/crates/milli/src/prompt/fields.rs @@ -7,14 +7,14 @@ use liquid::model::{ }; use liquid::{ObjectView, ValueView}; -use super::{FieldMetadata, FieldsIdsMapWithMetadata}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, Metadata}; use crate::GlobalFieldsIdsMap; #[derive(Debug, Clone, Copy)] pub struct FieldValue<'a, D: ObjectView> { name: &'a str, document: &'a D, - metadata: FieldMetadata, + metadata: Metadata, } impl<'a, D: ObjectView> ValueView for FieldValue<'a, D> { @@ -67,7 +67,10 @@ impl<'a, D: ObjectView> FieldValue<'a, D> { } pub fn is_searchable(&self) -> &bool { - &self.metadata.searchable + match self.metadata.is_searchable() { + true => &true, + false => &false, + } } pub fn is_empty(&self) -> bool { @@ -125,15 +128,11 @@ pub struct BorrowedFields<'a, 'map, D: ObjectView> { } impl<'a, D: ObjectView> OwnedFields<'a, D> { - pub fn new(document: &'a D, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { + pub fn new(document: &'a D, field_id_map: &'a FieldIdMapWithMetadata) -> Self { Self( std::iter::repeat(document) .zip(field_id_map.iter()) - .map(|(document, (fid, name))| FieldValue { - document, - name, - metadata: field_id_map.metadata(fid).unwrap_or_default(), - }) + .map(|(document, (_fid, name, metadata))| FieldValue { document, name, metadata }) .collect(), ) } @@ -187,7 +186,7 @@ impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { let fv = self.doc_alloc.alloc(FieldValue { name: self.doc_alloc.alloc_str(&k), document: self.document, - metadata: FieldMetadata { searchable: metadata.searchable }, + metadata, }); fv as _ })) @@ -207,7 +206,7 @@ impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { let fv = self.doc_alloc.alloc(FieldValue { name: self.doc_alloc.alloc_str(&key), document: self.document, - metadata: FieldMetadata { searchable: metadata.searchable }, + metadata, }); Some(fv as _) } diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index 3eb91611e..a5cb8de48 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -5,11 +5,9 @@ mod fields; mod template_checker; use std::cell::RefCell; -use std::collections::BTreeMap; use std::convert::TryFrom; use std::fmt::Debug; use std::num::NonZeroUsize; -use std::ops::Deref; use bumpalo::Bump; use document::ParseableDocument; @@ -18,8 +16,9 @@ use fields::{BorrowedFields, OwnedFields}; use self::context::Context; use self::document::Document; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::del_add::DelAdd; -use crate::{FieldId, FieldsIdsMap, GlobalFieldsIdsMap}; +use crate::GlobalFieldsIdsMap; pub struct Prompt { template: liquid::Template, @@ -145,9 +144,9 @@ impl Prompt { &self, document: &obkv::KvReaderU16, side: DelAdd, - field_id_map: &FieldsIdsMapWithMetadata, + field_id_map: &FieldIdMapWithMetadata, ) -> Result { - let document = Document::new(document, side, field_id_map); + let document = Document::new(document, side, field_id_map.as_fields_ids_map()); let fields = OwnedFields::new(&document, field_id_map); let context = Context::new(&document, &fields); @@ -172,40 +171,6 @@ fn truncate(s: &mut String, max_bytes: usize) { } } -pub struct FieldsIdsMapWithMetadata<'a> { - fields_ids_map: &'a FieldsIdsMap, - metadata: BTreeMap, -} - -impl<'a> FieldsIdsMapWithMetadata<'a> { - pub fn new(fields_ids_map: &'a FieldsIdsMap, searchable_fields_ids: &'_ [FieldId]) -> Self { - let mut metadata: BTreeMap = - fields_ids_map.ids().map(|id| (id, Default::default())).collect(); - for searchable_field_id in searchable_fields_ids { - let Some(metadata) = metadata.get_mut(searchable_field_id) else { continue }; - metadata.searchable = true; - } - Self { fields_ids_map, metadata } - } - - pub fn metadata(&self, field_id: FieldId) -> Option { - self.metadata.get(&field_id).copied() - } -} - -impl<'a> Deref for FieldsIdsMapWithMetadata<'a> { - type Target = FieldsIdsMap; - - fn deref(&self) -> &Self::Target { - self.fields_ids_map - } -} - -#[derive(Debug, Default, Clone, Copy)] -pub struct FieldMetadata { - pub searchable: bool, -} - #[cfg(test)] mod test { use super::Prompt; diff --git a/crates/milli/src/update/facet/mod.rs b/crates/milli/src/update/facet/mod.rs index 6743dfc3f..6ddbc87cf 100644 --- a/crates/milli/src/update/facet/mod.rs +++ b/crates/milli/src/update/facet/mod.rs @@ -159,7 +159,7 @@ impl<'i> FacetsUpdate<'i> { let field_ids = self.index.facet_leveled_field_ids(wtxn)?; let bulk_update = FacetsUpdateBulk::new( self.index, - field_ids, + field_ids.into_iter().collect(), self.facet_type, self.delta_data, self.group_size, diff --git a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 606ae6b54..e64bbfd50 100644 --- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -150,9 +150,12 @@ fn searchable_fields_changed( obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, ) -> bool { - let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { - if searchable_fields.contains(&field_id) { + let Some(metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else { + debug_assert!(false, "Field id {} is not in the fields ids map", field_id); + continue; + }; + if metadata.is_searchable() { let del_add = KvReaderDelAdd::from_slice(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. @@ -200,8 +203,12 @@ fn tokens_from_document<'a>( buffers.obkv_buffer.clear(); let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { + let Some(metadata) = settings.fields_ids_map.metadata(field_id) else { + debug_assert!(false, "Field id {} is not in the fields ids map", field_id); + continue; + }; // if field is searchable. - if settings.searchable_fields_ids.contains(&field_id) { + if metadata.is_searchable() { // extract deletion or addition only. if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) { // parse json. @@ -216,7 +223,7 @@ fn tokens_from_document<'a>( buffers.field_buffer.clear(); if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { // create an iterator of token with their positions. - let locales = settings.localized_searchable_fields_ids.locales(field_id); + let locales = metadata.locales(&settings.localized_attributes_rules); let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); diff --git a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index d330ea5a0..7cab156ad 100644 --- a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -12,12 +12,11 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; -use crate::localized_attributes_rules::LocalizedFieldIds; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps, }; -use crate::update::settings::InnerIndexSettingsDiff; +use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -33,13 +32,10 @@ pub fn extract_facet_string_docids( if settings_diff.settings_update_only() { extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) } else { - let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; - let facet_search = settings_diff.new.facet_search; extract_facet_string_docids_document_update( docid_fid_facet_string, indexer, - localized_field_ids, - facet_search, + &settings_diff.new, ) } } @@ -52,8 +48,7 @@ pub fn extract_facet_string_docids( fn extract_facet_string_docids_document_update( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, - localized_field_ids: &LocalizedFieldIds, - facet_search: bool, + settings: &InnerIndexSettings, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -92,6 +87,14 @@ fn extract_facet_string_docids_document_update( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + let Some(metadata) = settings.fields_ids_map.metadata(field_id) else { + unreachable!("metadata not found for field_id: {}", field_id) + }; + + if !metadata.is_faceted(&settings.filterable_attributes_rules) { + continue; + } + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -99,8 +102,10 @@ fn extract_facet_string_docids_document_update( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - if facet_search { - let locales = localized_field_ids.locales(field_id); + let features = + metadata.filterable_attributes_features(&settings.filterable_attributes_rules); + if features.is_facet_searchable() { + let locales = metadata.locales(&settings.localized_attributes_rules); let hyper_normalized_value = normalize_facet_string(normalized_value, locales); let set = BTreeSet::from_iter(std::iter::once(normalized_value)); @@ -178,8 +183,15 @@ fn extract_facet_string_docids_settings( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); - let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); + let Some(old_metadata) = settings_diff.old.fields_ids_map.metadata(field_id) else { + unreachable!("old metadata not found for field_id: {}", field_id) + }; + let Some(new_metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else { + unreachable!("new metadata not found for field_id: {}", field_id) + }; + + let old_locales = old_metadata.locales(&settings_diff.old.localized_attributes_rules); + let new_locales = new_metadata.locales(&settings_diff.new.localized_attributes_rules); let are_same_locales = old_locales == new_locales; let reindex_facet_search = diff --git a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 88c02fe70..de87c5a7c 100644 --- a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -76,9 +76,9 @@ pub fn extract_fid_docid_facet_values( let mut strings_key_buffer = Vec::new(); let old_faceted_fids: BTreeSet<_> = - settings_diff.old.faceted_fields_ids.iter().copied().collect(); + settings_diff.list_faceted_fields_from_fid_map(DelAdd::Deletion); let new_faceted_fids: BTreeSet<_> = - settings_diff.new.faceted_fields_ids.iter().copied().collect(); + settings_diff.list_faceted_fields_from_fid_map(DelAdd::Addition); if !settings_diff.settings_update_only || settings_diff.reindex_facets() { let mut cursor = obkv_documents.into_cursor()?; diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 9103e8324..560b73834 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -15,8 +15,9 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::error::FaultSource; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::index::IndexEmbeddingConfig; -use crate::prompt::{FieldsIdsMapWithMetadata, Prompt}; +use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; @@ -190,12 +191,8 @@ pub fn extract_vector_points( let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; - let old_fields_ids_map = - FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids); let new_fields_ids_map = &settings_diff.new.fields_ids_map; - let new_fields_ids_map = - FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids); // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); @@ -383,7 +380,7 @@ pub fn extract_vector_points( ); continue; } - regenerate_prompt(obkv, prompt, &new_fields_ids_map)? + regenerate_prompt(obkv, prompt, new_fields_ids_map)? } }, // prompt regeneration is only triggered for existing embedders @@ -400,7 +397,7 @@ pub fn extract_vector_points( regenerate_if_prompt_changed( obkv, (old_prompt, prompt), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), )? } else { // we can simply ignore user provided vectors as they are not regenerated and are @@ -416,7 +413,7 @@ pub fn extract_vector_points( prompt, (add_to_user_provided, remove_from_user_provided), (old, new), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), document_id, embedder_name, embedder_is_manual, @@ -486,10 +483,7 @@ fn extract_vector_document_diff( prompt: &Prompt, (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), (old, new): (VectorState, VectorState), - (old_fields_ids_map, new_fields_ids_map): ( - &FieldsIdsMapWithMetadata, - &FieldsIdsMapWithMetadata, - ), + (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), document_id: impl Fn() -> Value, embedder_name: &str, embedder_is_manual: bool, @@ -611,10 +605,7 @@ fn extract_vector_document_diff( fn regenerate_if_prompt_changed( obkv: &obkv::KvReader, (old_prompt, new_prompt): (&Prompt, &Prompt), - (old_fields_ids_map, new_fields_ids_map): ( - &FieldsIdsMapWithMetadata, - &FieldsIdsMapWithMetadata, - ), + (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), ) -> Result { let old_prompt = old_prompt .render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) @@ -630,7 +621,7 @@ fn regenerate_if_prompt_changed( fn regenerate_prompt( obkv: &obkv::KvReader, prompt: &Prompt, - new_fields_ids_map: &FieldsIdsMapWithMetadata, + new_fields_ids_map: &FieldIdMapWithMetadata, ) -> Result { let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index b645ca0bd..87ad20cc2 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -217,7 +217,7 @@ where // update the internal facet and searchable list, // because they might have changed due to the nested documents flattening. - settings_diff.new.recompute_facets(self.wtxn, self.index)?; + // settings_diff.new.recompute_facets(self.wtxn, self.index)?; settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index d87524a34..56b968c40 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::fs::File; use std::io::{Read, Seek}; @@ -18,8 +18,10 @@ use super::helpers::{ ObkvsMergeAdditionsAndDeletions, }; use super::{create_writer, IndexDocumentsMethod, IndexerConfig, KeepFirst}; +use crate::attribute_patterns::PatternMatch; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::{db_name, main_key}; use crate::update::del_add::{ into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation, @@ -31,9 +33,7 @@ use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use crate::vector::settings::WriteBackToDocuments; use crate::vector::ArroyWrapper; -use crate::{ - is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, -}; +use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result}; pub struct TransformOutput { pub primary_key: String, @@ -52,7 +52,7 @@ pub struct TransformOutput { /// containing all those documents. pub struct Transform<'a, 'i> { pub index: &'i Index, - fields_ids_map: FieldsIdsMap, + fields_ids_map: FieldIdMapWithMetadata, indexer_settings: &'a IndexerConfig, pub index_documents_method: IndexDocumentsMethod, @@ -84,7 +84,7 @@ pub enum Operation { /// /// If new fields are present in the addition, they are added to the index field ids map. fn create_fields_mapping( - index_field_map: &mut FieldsIdsMap, + index_field_map: &mut FieldIdMapWithMetadata, batch_field_map: &DocumentsBatchIndex, ) -> Result> { batch_field_map @@ -141,10 +141,13 @@ impl<'a, 'i> Transform<'a, 'i> { true, ); let documents_ids = index.documents_ids(wtxn)?; + let fields_ids_map = index.fields_ids_map(wtxn)?; + let builder = MetadataBuilder::from_index(index, wtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); Ok(Transform { index, - fields_ids_map: index.fields_ids_map(wtxn)?, + fields_ids_map, indexer_settings, available_documents_ids: AvailableIds::new(&documents_ids), original_sorter, @@ -354,7 +357,7 @@ impl<'a, 'i> Transform<'a, 'i> { documents_seen: documents_count, }); - self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; + self.index.put_fields_ids_map(wtxn, self.fields_ids_map.as_fields_ids_map())?; self.index.put_primary_key(wtxn, &primary_key)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we @@ -371,7 +374,7 @@ impl<'a, 'i> Transform<'a, 'i> { )] fn flatten_from_fields_ids_map( obkv: &KvReader, - fields_ids_map: &mut FieldsIdsMap, + fields_ids_map: &mut FieldIdMapWithMetadata, ) -> Result>> { if obkv .iter() @@ -657,7 +660,6 @@ impl<'a, 'i> Transform<'a, 'i> { fn rebind_existing_document( old_obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, - modified_faceted_fields: &HashSet, mut injected_vectors: serde_json::Map, old_vectors_fid: Option, original_obkv_buffer: Option<&mut Vec>, @@ -667,23 +669,26 @@ impl<'a, 'i> Transform<'a, 'i> { let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; // If only a faceted field has been added, keep only this field. - let global_facet_settings_changed = settings_diff.global_facet_settings_changed(); let facet_fids_changed = settings_diff.facet_fids_changed(); - let necessary_faceted_field = - |id: FieldId| -> bool { + + let necessary_faceted_field = |id: FieldId| -> Option { + if facet_fids_changed { let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); - if global_facet_settings_changed { - settings_diff.new.user_defined_faceted_fields.iter().any(|long| { - is_faceted_by(long, field_name) || is_faceted_by(field_name, long) - }) - } else if facet_fids_changed { - modified_faceted_fields.iter().any(|long| { - is_faceted_by(long, field_name) || is_faceted_by(field_name, long) - }) - } else { - false + // if the faceted fields changed, we need to keep all the field that are + // faceted in the old or new settings. + match ( + settings_diff.old.match_faceted_field(field_name), + settings_diff.new.match_faceted_field(field_name), + ) { + (PatternMatch::NoMatch, PatternMatch::NoMatch) => None, + (PatternMatch::NoMatch, _) => Some(DelAddOperation::Addition), + (_, PatternMatch::NoMatch) => Some(DelAddOperation::Deletion), + (_, _) => Some(DelAddOperation::DeletionAndAddition), } - }; + } else { + None + } + }; // Alway provide all fields when vectors are involved because // we need the fields for the prompt/templating. @@ -734,9 +739,12 @@ impl<'a, 'i> Transform<'a, 'i> { } } - if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { + if is_primary_key(id) || reindex_vectors { operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; + } else if let Some(operation) = necessary_faceted_field(id) { + operations.insert(id, operation); + obkv_writer.insert(id, val)?; } else if let Some(operation) = settings_diff.reindex_searchable_id(id) { operations.insert(id, operation); obkv_writer.insert(id, val)?; @@ -856,7 +864,6 @@ impl<'a, 'i> Transform<'a, 'i> { }; if original_sorter.is_some() || flattened_sorter.is_some() { - let modified_faceted_fields = settings_diff.modified_faceted_fields(); let mut original_obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new(); @@ -897,7 +904,6 @@ impl<'a, 'i> Transform<'a, 'i> { Self::rebind_existing_document( old_obkv, &settings_diff, - &modified_faceted_fields, injected_vectors, old_vectors_fid, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 4433c6e75..796312df3 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -9,8 +9,9 @@ use serde_json::Value; use super::super::cache::BalancedCaches; use super::facet_document::extract_document_facets; -use super::{match_faceted_field, FacetKind}; +use super::FacetKind; use crate::fields_ids_map::metadata::Metadata; +use crate::filterable_attributes_rules::match_faceted_field; use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; @@ -34,6 +35,8 @@ pub struct FacetedExtractorData<'a, 'b> { buckets: usize, filterable_attributes: Vec, sortable_fields: HashSet, + asc_desc_fields: HashSet, + distinct_field: Option, } impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> { @@ -58,6 +61,8 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> context, &self.filterable_attributes, &self.sortable_fields, + &self.asc_desc_fields, + &self.distinct_field, change, self.sender, )? @@ -73,6 +78,8 @@ impl FacetedDocidsExtractor { context: &DocumentChangeContext>, filterable_attributes: &[FilterableAttributesRule], sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, document_change: DocumentChange, sender: &FieldIdDocidFacetSender, ) -> Result<()> { @@ -89,6 +96,8 @@ impl FacetedDocidsExtractor { new_fields_ids_map.deref_mut(), filterable_attributes, sortable_fields, + asc_desc_fields, + distinct_field, &mut |fid, meta, depth, value| { let features = meta.filterable_attributes_features(filterable_attributes); Self::facet_fn_with_options( @@ -109,7 +118,13 @@ impl FacetedDocidsExtractor { DocumentChange::Update(inner) => { if !inner.has_changed_for_fields( &mut |field_name| { - match_faceted_field(field_name, filterable_attributes, sortable_fields) + match_faceted_field( + field_name, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + ) }, rtxn, index, @@ -124,6 +139,8 @@ impl FacetedDocidsExtractor { new_fields_ids_map.deref_mut(), filterable_attributes, sortable_fields, + asc_desc_fields, + distinct_field, &mut |fid, meta, depth, value| { let features = meta.filterable_attributes_features(filterable_attributes); Self::facet_fn_with_options( @@ -148,6 +165,8 @@ impl FacetedDocidsExtractor { new_fields_ids_map.deref_mut(), filterable_attributes, sortable_fields, + asc_desc_fields, + distinct_field, &mut |fid, meta, depth, value| { let features = meta.filterable_attributes_features(filterable_attributes); Self::facet_fn_with_options( @@ -172,6 +191,8 @@ impl FacetedDocidsExtractor { new_fields_ids_map.deref_mut(), filterable_attributes, sortable_fields, + asc_desc_fields, + distinct_field, &mut |fid, meta, depth, value| { let features = meta.filterable_attributes_features(filterable_attributes); Self::facet_fn_with_options( @@ -426,6 +447,8 @@ impl FacetedDocidsExtractor { let rtxn = index.read_txn()?; let filterable_attributes = index.filterable_attributes_rules(&rtxn)?; let sortable_fields = index.sortable_fields(&rtxn)?; + let asc_desc_fields = index.asc_desc_fields(&rtxn)?; + let distinct_field = index.distinct_field(&rtxn)?.map(|s| s.to_string()); let datastore = ThreadLocal::new(); { @@ -439,6 +462,8 @@ impl FacetedDocidsExtractor { sender, filterable_attributes, sortable_fields, + asc_desc_fields, + distinct_field, }; extract( document_changes, diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index 235804204..7df741fa4 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -8,22 +8,30 @@ use crate::update::new::document::Document; use crate::update::new::extract::geo::extract_geo_coordinates; use crate::update::new::extract::perm_json_p; use crate::{ - FieldId, FilterableAttributesSettings, GlobalFieldsIdsMap, InternalError, Result, UserError, + FieldId, FilterableAttributesRule, GlobalFieldsIdsMap, InternalError, Result, UserError, }; -use super::match_faceted_field; +use crate::filterable_attributes_rules::match_faceted_field; pub fn extract_document_facets<'doc>( document: impl Document<'doc>, external_document_id: &str, field_id_map: &mut GlobalFieldsIdsMap, - filterable_attributes: &[FilterableAttributesSettings], + filterable_attributes: &[FilterableAttributesRule], sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, facet_fn: &mut impl FnMut(FieldId, Metadata, perm_json_p::Depth, &Value) -> Result<()>, ) -> Result<()> { // return the match result for the given field name. let match_field = |field_name: &str| -> PatternMatch { - match_faceted_field(field_name, filterable_attributes, sortable_fields) + match_faceted_field( + field_name, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + ) }; // extract the field if it is faceted (facet searchable, filterable, sortable) diff --git a/crates/milli/src/update/new/extract/faceted/mod.rs b/crates/milli/src/update/new/extract/faceted/mod.rs index a92a3da08..0c012d739 100644 --- a/crates/milli/src/update/new/extract/faceted/mod.rs +++ b/crates/milli/src/update/new/extract/faceted/mod.rs @@ -1,16 +1,8 @@ mod extract_facets; mod facet_document; -use std::collections::HashSet; - pub use extract_facets::FacetedDocidsExtractor; -use crate::{ - attribute_patterns::{match_field_legacy, PatternMatch}, - filterable_fields::match_pattern_by_features, - FilterableAttributesSettings, -}; - #[repr(u8)] #[derive(Debug, Clone, Copy)] pub enum FacetKind { @@ -39,30 +31,3 @@ impl FacetKind { (FacetKind::from(key[0]), &key[1..]) } } - -pub fn match_faceted_field( - field_name: &str, - filterable_fields: &[FilterableAttributesSettings], - sortable_fields: &HashSet, -) -> PatternMatch { - // Check if the field matches any filterable or facet searchable field - let mut selection = match_pattern_by_features(field_name, &filterable_fields, &|features| { - features.is_facet_searchable() || features.is_filterable() - }); - - // If the field matches the pattern, return Match - if selection == PatternMatch::Match { - return selection; - } - - // Otherwise, check if the field matches any sortable field - for pattern in sortable_fields { - match match_field_legacy(&pattern, field_name) { - PatternMatch::Match => return PatternMatch::Match, - PatternMatch::Parent => selection = PatternMatch::Parent, - PatternMatch::NoMatch => (), - } - } - - selection -} diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 1cc78aab3..0d64e4b23 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -29,7 +29,7 @@ impl GeoExtractor { grenad_parameters: GrenadParameters, ) -> Result> { if index.is_geo_activated(rtxn)? { - Ok(Some(GeoExtractor {filterable_attributes_ruless })) + Ok(Some(GeoExtractor { grenad_parameters })) } else { Ok(None) } diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index 6220956fb..100737e32 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -227,7 +227,14 @@ mod test { let fields_ids_map = FieldIdMapWithMetadata::new( fields_ids_map, - MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None), + MetadataBuilder::new( + Default::default(), + Default::default(), + Default::default(), + None, + None, + Default::default(), + ), ); let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map); diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index fc647cfa5..3a258ce7a 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -114,7 +114,7 @@ pub(super) fn update_index( index.put_primary_key(wtxn, new_primary_key.name())?; } let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn, Some(embedders))?; - inner_index_settings.recompute_facets(wtxn, index)?; + // inner_index_settings.recompute_facets(wtxn, index)?; inner_index_settings.recompute_searchables(wtxn, index)?; index.put_field_distribution(wtxn, &field_distribution)?; index.put_documents_ids(wtxn, &document_ids)?; diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index c780f3711..a53c5968b 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -11,12 +11,15 @@ use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; -use super::del_add::DelAddOperation; +use super::del_add::{DelAdd, DelAddOperation}; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; -use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; +use crate::attribute_patterns::PatternMatch; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::criterion::Criterion; use crate::error::UserError; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; +use crate::filterable_attributes_rules::match_faceted_field; use crate::index::{ IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, @@ -31,10 +34,7 @@ use crate::vector::settings::{ WriteBackToDocuments, }; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; -use crate::{ - FieldId, FieldsIdsMap, FilterableAttributesSettings, Index, LocalizedAttributesRule, - LocalizedFieldIds, Result, -}; +use crate::{FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -158,7 +158,7 @@ pub struct Settings<'a, 't, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, - filterable_fields: Setting>, + filterable_fields: Setting>, sortable_fields: Setting>, criteria: Setting>, stop_words: Setting>, @@ -244,7 +244,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.filterable_fields = Setting::Reset; } - pub fn set_filterable_fields(&mut self, rules: Vec) { + pub fn set_filterable_fields(&mut self, rules: Vec) { self.filterable_fields = Setting::Set(rules); } @@ -519,7 +519,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } /// Updates the index's searchable attributes. - fn update_searchable(&mut self) -> Result { + fn update_user_defined_searchable_attributes(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else @@ -532,26 +532,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { return Ok(false); } - // Since we're updating the settings we can only add new fields at the end of the field id map - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account let names = fields.iter().unique().map(String::as_str).collect::>(); - // Add all the searchable attributes to the field map, and then add the - // remaining fields from the old field map to the new one - for name in names.iter() { - // The fields ids map won't change the field id of already present elements thus only the - // new fields will be inserted. - fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } - - self.index.put_all_searchable_fields_from_fields_ids_map( - self.wtxn, - &names, - &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), - &fields_ids_map, - )?; - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + self.index.put_user_defined_searchable_fields(self.wtxn, &names)?; Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), @@ -763,10 +747,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { - self.index.put_filterable_fields(self.wtxn, fields)?; + self.index.put_filterable_attributes_rules(self.wtxn, fields)?; } Setting::Reset => { - self.index.delete_filterable_fields(self.wtxn)?; + self.index.delete_filterable_attributes_rules(self.wtxn)?; } Setting::NotSet => (), } @@ -1256,7 +1240,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_separator_tokens()?; self.update_dictionary()?; self.update_synonyms()?; - self.update_searchable()?; + self.update_user_defined_searchable_attributes()?; self.update_exact_attributes()?; self.update_proximity_precision()?; self.update_prefix_search()?; @@ -1266,7 +1250,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?; - new_inner_settings.recompute_facets(self.wtxn, self.index)?; + // new_inner_settings.recompute_facets(self.wtxn, self.index)?; + new_inner_settings.recompute_searchables(self.wtxn, self.index)?; let primary_key_id = self .index @@ -1318,8 +1303,8 @@ impl InnerIndexSettingsDiff { settings_update_only: bool, ) -> Self { let only_additional_fields = match ( - &old_settings.user_defined_searchable_fields, - &new_settings.user_defined_searchable_fields, + &old_settings.user_defined_searchable_attributes, + &new_settings.user_defined_searchable_attributes, ) { (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * (Some(old), Some(new)) => { @@ -1341,14 +1326,14 @@ impl InnerIndexSettingsDiff { || old_settings.dictionary != new_settings.dictionary || old_settings.proximity_precision != new_settings.proximity_precision || old_settings.prefix_search != new_settings.prefix_search - || old_settings.localized_searchable_fields_ids - != new_settings.localized_searchable_fields_ids + || old_settings.localized_attributes_rules + != new_settings.localized_attributes_rules }; let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; - let cache_user_defined_searchables = old_settings.user_defined_searchable_fields - != new_settings.user_defined_searchable_fields; + let cache_user_defined_searchables = old_settings.user_defined_searchable_attributes + != new_settings.user_defined_searchable_attributes; // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { @@ -1431,30 +1416,39 @@ impl InnerIndexSettingsDiff { } } + /// List the faceted fields from the inner fid map. + /// This is used to list the faceted fields when we are reindexing, + /// but it can't be used in document addition because the field id map must be exhaustive. + pub fn list_faceted_fields_from_fid_map(&self, del_add: DelAdd) -> BTreeSet { + let settings = match del_add { + DelAdd::Deletion => &self.old, + DelAdd::Addition => &self.new, + }; + + settings + .fields_ids_map + .iter_id_metadata() + .filter(|(_, metadata)| metadata.is_faceted(&settings.filterable_attributes_rules)) + .map(|(id, _)| id) + .collect() + } + pub fn facet_fids_changed(&self) -> bool { let existing_fields = &self.new.existing_fields; - if existing_fields.iter().any(|field| field.contains('.')) { - return true; - } + let new_faceted_fields: HashSet<_> = existing_fields + .iter() + .filter(|field| self.new.match_faceted_field(field) == PatternMatch::Match) + .collect(); + let old_faceted_fields: HashSet<_> = existing_fields + .iter() + .filter(|field| self.old.match_faceted_field(field) == PatternMatch::Match) + .collect(); - let old_faceted_fields = &self.old.user_defined_faceted_fields; - if old_faceted_fields.iter().any(|field| field.contains('.')) { - return true; - } - - // If there is new faceted fields we indicate that we must reindex as we must - // index new fields as facets. It means that the distinct attribute, - // an Asc/Desc criterion or a filtered attribute as be added or removed. - let new_faceted_fields = &self.new.user_defined_faceted_fields; - if new_faceted_fields.iter().any(|field| field.contains('.')) { - return true; - } - - (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) + old_faceted_fields != new_faceted_fields } pub fn global_facet_settings_changed(&self) -> bool { - self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids + self.old.localized_attributes_rules != self.new.localized_attributes_rules || self.old.facet_search != self.new.facet_search } @@ -1475,9 +1469,9 @@ impl InnerIndexSettingsDiff { || (!self.settings_update_only && self.new.geo_fields_ids.is_some()) } - pub fn modified_faceted_fields(&self) -> HashSet { - &self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields - } + // pub fn modified_faceted_fields(&self) -> HashSet { + // &self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields + // } } #[derive(Clone)] @@ -1485,20 +1479,18 @@ pub(crate) struct InnerIndexSettings { pub stop_words: Option>>, pub allowed_separators: Option>, pub dictionary: Option>, - pub fields_ids_map: FieldsIdsMap, - pub user_defined_faceted_fields: HashSet, - pub user_defined_searchable_fields: Option>, - pub faceted_fields_ids: HashSet, - pub searchable_fields_ids: Vec, + pub fields_ids_map: FieldIdMapWithMetadata, + pub localized_attributes_rules: Vec, + pub filterable_attributes_rules: Vec, + pub asc_desc_fields: HashSet, + pub distinct_field: Option, + pub user_defined_searchable_attributes: Option>, + pub sortable_fields: HashSet, pub exact_attributes: HashSet, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, pub existing_fields: HashSet, pub geo_fields_ids: Option<(FieldId, FieldId)>, - pub non_searchable_fields_ids: Vec, - pub non_faceted_fields_ids: Vec, - pub localized_searchable_fields_ids: LocalizedFieldIds, - pub localized_faceted_fields_ids: LocalizedFieldIds, pub prefix_search: PrefixSearch, pub facet_search: bool, } @@ -1514,12 +1506,6 @@ impl InnerIndexSettings { let allowed_separators = index.allowed_separators(rtxn)?; let dictionary = index.dictionary(rtxn)?; let mut fields_ids_map = index.fields_ids_map(rtxn)?; - let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?; - let user_defined_searchable_fields = - user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); - let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = match embedding_configs { @@ -1535,82 +1521,73 @@ impl InnerIndexSettings { .collect(); // index.fields_ids_map($a)? ==>> fields_ids_map let geo_fields_ids = match fields_ids_map.id(RESERVED_GEO_FIELD_NAME) { - Some(gfid) => { - let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid); - let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid); + Some(_) if index.is_geo_activated(rtxn)? => { // if `_geo` is faceted then we get the `lat` and `lng` - if is_sortable || is_filterable { - let field_ids = fields_ids_map - .insert("_geo.lat") - .zip(fields_ids_map.insert("_geo.lng")) - .ok_or(UserError::AttributeLimitReached)?; - Some(field_ids) - } else { - None - } + let field_ids = fields_ids_map + .insert("_geo.lat") + .zip(fields_ids_map.insert("_geo.lng")) + .ok_or(UserError::AttributeLimitReached)?; + Some(field_ids) } - None => None, + _ => None, }; - let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; - let localized_searchable_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &fields_ids_map, - searchable_fields_ids.iter().cloned(), - ); - let localized_faceted_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &fields_ids_map, - faceted_fields_ids.iter().cloned(), - ); - - let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); - searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); - faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + let localized_attributes_rules = + index.localized_attributes_rules(rtxn)?.unwrap_or_default(); + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + let sortable_fields = index.sortable_fields(rtxn)?; + let asc_desc_fields = index.asc_desc_fields(rtxn)?; + let distinct_field = index.distinct_field(rtxn)?.map(|f| f.to_string()); + let user_defined_searchable_attributes = index + .user_defined_searchable_fields(rtxn)? + .map(|fields| fields.into_iter().map(|f| f.to_string()).collect()); + let builder = MetadataBuilder::from_index(index, rtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); Ok(Self { stop_words, allowed_separators, dictionary, fields_ids_map, - user_defined_faceted_fields, - user_defined_searchable_fields, - faceted_fields_ids, - searchable_fields_ids, + localized_attributes_rules, + filterable_attributes_rules, + asc_desc_fields, + distinct_field, + user_defined_searchable_attributes, + sortable_fields, exact_attributes, proximity_precision, embedding_configs, existing_fields, geo_fields_ids, - non_searchable_fields_ids: vectors_fids.clone(), - non_faceted_fields_ids: vectors_fids.clone(), - localized_searchable_fields_ids, - localized_faceted_fields_ids, prefix_search, facet_search, }) } - // find and insert the new field ids - pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn<'_>, index: &Index) -> Result<()> { - let new_facets = self - .fields_ids_map - .iter() - .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) - .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|(_fid, field)| field.to_string()) - .collect(); - index.put_faceted_fields(wtxn, &new_facets)?; - - self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?; - let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; - self.localized_faceted_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &self.fields_ids_map, - self.faceted_fields_ids.iter().cloned(), - ); - Ok(()) + pub fn match_faceted_field(&self, field: &str) -> PatternMatch { + match_faceted_field( + field, + &self.filterable_attributes_rules, + &self.sortable_fields, + &self.asc_desc_fields, + &self.distinct_field, + ) } + // find and insert the new field ids + // pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn<'_>, index: &Index) -> Result<()> { + // let new_facets = self + // .fields_ids_map + // .iter() + // .filter(|(_fid, field, metadata)| { + // metadata.is_faceted(&self.filterable_attributes_rules) + // }) + // .map(|(_fid, field, _metadata)| field.to_string()) + // .collect(); + // index.put_faceted_fields(wtxn, &new_facets)?; + // Ok(()) + // } + // find and insert the new field ids pub fn recompute_searchables( &mut self, @@ -1618,7 +1595,7 @@ impl InnerIndexSettings { index: &Index, ) -> Result<()> { let searchable_fields = self - .user_defined_searchable_fields + .user_defined_searchable_attributes .as_ref() .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); @@ -1627,17 +1604,9 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, - &self.non_searchable_fields_ids, &self.fields_ids_map, )?; } - self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?; - let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; - self.localized_searchable_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &self.fields_ids_map, - self.searchable_fields_ids.iter().cloned(), - ); Ok(()) } @@ -2104,7 +2073,7 @@ mod tests { // Set the filterable fields to be the age. index .update_settings(|settings| { - settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( "age".to_string(), )]); }) @@ -2121,8 +2090,8 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, vec![FilterableAttributesSettings::Field("age".to_string(),)]); + let fields_ids = index.filterable_attributes_rules(&rtxn).unwrap(); + assert_eq!(fields_ids, vec![FilterableAttributesRule::Field("age".to_string(),)]); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. let fidmap = index.fields_ids_map(&rtxn).unwrap(); @@ -2165,20 +2134,20 @@ mod tests { index .update_settings(|settings| { settings.set_filterable_fields(vec![ - FilterableAttributesSettings::Field("age".to_string()), - FilterableAttributesSettings::Field("name".to_string()), + FilterableAttributesRule::Field("age".to_string()), + FilterableAttributesRule::Field("name".to_string()), ]); }) .unwrap(); // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); + let fields_ids = index.filterable_attributes_rules(&rtxn).unwrap(); assert_eq!( fields_ids, vec![ - FilterableAttributesSettings::Field("age".to_string()), - FilterableAttributesSettings::Field("name".to_string()), + FilterableAttributesRule::Field("age".to_string()), + FilterableAttributesRule::Field("name".to_string()), ] ); @@ -2205,7 +2174,7 @@ mod tests { // Remove the age from the filterable fields. index .update_settings(|settings| { - settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( "name".to_string(), )]); }) @@ -2213,8 +2182,8 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, vec![FilterableAttributesSettings::Field("name".to_string())]); + let fields_ids = index.filterable_attributes_rules(&rtxn).unwrap(); + assert_eq!(fields_ids, vec![FilterableAttributesRule::Field("name".to_string())]); let rtxn = index.read_txn().unwrap(); // Only count the field_id 2 and level 0 facet values. @@ -2545,8 +2514,8 @@ mod tests { .update_settings(|settings| { settings.set_displayed_fields(vec!["hello".to_string()]); settings.set_filterable_fields(vec![ - FilterableAttributesSettings::Field("age".to_string()), - FilterableAttributesSettings::Field("toto".to_string()), + FilterableAttributesRule::Field("age".to_string()), + FilterableAttributesRule::Field("toto".to_string()), ]); settings.set_criteria(vec![Criterion::Asc(S("toto"))]); }) @@ -2664,7 +2633,7 @@ mod tests { // Set the genres setting index .update_settings(|settings| { - settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( "genres".to_string(), )]); })