diff --git a/crates/meilisearch-types/src/locales.rs b/crates/meilisearch-types/src/locales.rs index 945c38cc3..b3fb90493 100644 --- a/crates/meilisearch-types/src/locales.rs +++ b/crates/meilisearch-types/src/locales.rs @@ -1,5 +1,5 @@ use deserr::Deserr; -use milli::LocalizedAttributesRule; +use milli::{AttributePatterns, LocalizedAttributesRule}; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; @@ -7,7 +7,7 @@ use utoipa::ToSchema; #[deserr(rename_all = camelCase)] #[serde(rename_all = "camelCase")] pub struct LocalizedAttributesRuleView { - pub attribute_patterns: Vec, + pub attribute_patterns: AttributePatterns, pub locales: Vec, } diff --git a/crates/milli/src/attribute_patterns.rs b/crates/milli/src/attribute_patterns.rs index 836ea4465..717db3042 100644 --- a/crates/milli/src/attribute_patterns.rs +++ b/crates/milli/src/attribute_patterns.rs @@ -2,6 +2,8 @@ use deserr::Deserr; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; +use crate::is_faceted_by; + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Deserr, ToSchema)] #[repr(transparent)] #[serde(transparent)] @@ -17,23 +19,69 @@ impl From> for AttributePatterns { } impl AttributePatterns { - pub fn match_str(&self, str: &str) -> bool { - self.patterns.iter().any(|pattern| match_pattern(pattern, str)) + pub fn match_str(&self, str: &str) -> PatternMatch { + let mut pattern_match = PatternMatch::NoMatch; + for pattern in &self.patterns { + match match_pattern(pattern, str) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => pattern_match = PatternMatch::Parent, + PatternMatch::NoMatch => {} + } + } + pattern_match } } -fn match_pattern(pattern: &str, str: &str) -> bool { +fn match_pattern(pattern: &str, str: &str) -> PatternMatch { if pattern == "*" { - true + return PatternMatch::Match; } else if pattern.starts_with('*') && pattern.ends_with('*') { - str.contains(&pattern[1..pattern.len() - 1]) + if str.contains(&pattern[1..pattern.len() - 1]) { + return PatternMatch::Match; + } } else if let Some(pattern) = pattern.strip_prefix('*') { - str.ends_with(pattern) + if str.ends_with(pattern) { + return PatternMatch::Match; + } } else if let Some(pattern) = pattern.strip_suffix('*') { - str.starts_with(pattern) + if str.starts_with(pattern) { + return PatternMatch::Match; + } } else { - pattern == str + if pattern == str { + return PatternMatch::Match; + } } + + // If the field is a parent field of the pattern, return Parent + if is_faceted_by(pattern, str) { + PatternMatch::Parent + } else { + PatternMatch::NoMatch + } +} + +pub fn match_field_legacy(pattern: &str, field: &str) -> PatternMatch { + if is_faceted_by(field, pattern) { + // If the field matches the pattern or is a nested field of the pattern, return Match (legacy behavior) + PatternMatch::Match + } else if is_faceted_by(pattern, field) { + // If the field is a parent field of the pattern, return Parent + PatternMatch::Parent + } else { + // If the field does not match the pattern and is not a parent of a nested field that matches the pattern, return NoMatch + PatternMatch::NoMatch + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PatternMatch { + /// The field is a parent of the of a nested field that matches the pattern + Parent, + /// The field matches the pattern + Match, + /// The field does not match the pattern + NoMatch, } #[cfg(test)] @@ -42,17 +90,17 @@ mod tests { #[test] fn test_match_pattern() { - assert!(match_pattern("*", "test")); - assert!(match_pattern("test*", "test")); - assert!(match_pattern("test*", "testa")); - assert!(match_pattern("*test", "test")); - assert!(match_pattern("*test", "atest")); - assert!(match_pattern("*test*", "test")); - assert!(match_pattern("*test*", "atesta")); - assert!(match_pattern("*test*", "atest")); - assert!(match_pattern("*test*", "testa")); - assert!(!match_pattern("test*test", "test")); - assert!(!match_pattern("*test", "testa")); - assert!(!match_pattern("test*", "atest")); + assert_eq!(match_pattern("*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("test*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("test*", "testa"), PatternMatch::Match); + assert_eq!(match_pattern("*test", "test"), PatternMatch::Match); + assert_eq!(match_pattern("*test", "atest"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "atesta"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "atest"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "testa"), PatternMatch::Match); + assert_eq!(match_pattern("test*test", "test"), PatternMatch::NoMatch); + assert_eq!(match_pattern("*test", "testa"), PatternMatch::NoMatch); + assert_eq!(match_pattern("test*", "atest"), PatternMatch::NoMatch); } } diff --git a/crates/milli/src/fields_ids_map/metadata.rs b/crates/milli/src/fields_ids_map/metadata.rs index 3bc7f9ca1..05a509baf 100644 --- a/crates/milli/src/fields_ids_map/metadata.rs +++ b/crates/milli/src/fields_ids_map/metadata.rs @@ -5,7 +5,11 @@ use charabia::Language; use heed::RoTxn; use super::FieldsIdsMap; -use crate::{FieldId, FilterableAttributesSettings, Index, LocalizedAttributesRule, Result}; +use crate::attribute_patterns::PatternMatch; +use crate::{ + FieldId, FilterableAttributesFeatures, FilterableAttributesSettings, Index, + LocalizedAttributesRule, Result, +}; #[derive(Debug, Clone, Copy)] pub struct Metadata { @@ -106,6 +110,34 @@ impl Metadata { let rule = rules.get((localized_attributes_rule_id - 1) as usize).unwrap(); Some(rule.locales()) } + + pub fn filterable_attributes<'rules>( + &self, + rules: &'rules [FilterableAttributesSettings], + ) -> Option<&'rules FilterableAttributesSettings> { + let filterable_attributes_rule_id = self.filterable_attributes_rule_id?.get(); + // - 1: `filterable_attributes_rule_id` is NonZero + let rule = rules.get((filterable_attributes_rule_id - 1) as usize).unwrap(); + Some(rule) + } + + pub fn filterable_attributes_features( + &self, + rules: &[FilterableAttributesSettings], + ) -> FilterableAttributesFeatures { + self.filterable_attributes(rules) + .map(|rule| rule.features()) + // if there is no filterable attributes rule, return no features + .unwrap_or_else(FilterableAttributesFeatures::no_features) + } + + pub fn is_sortable(&self) -> bool { + self.sortable + } + + pub fn is_searchable(&self) -> bool { + self.searchable + } } #[derive(Debug, Clone)] @@ -158,19 +190,14 @@ impl MetadataBuilder { .localized_attributes .iter() .flat_map(|v| v.iter()) - .position(|rule| rule.match_str(field)) + .position(|rule| rule.match_str(field) == PatternMatch::Match) // saturating_add(1): make `id` `NonZero` .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); let filterable_attributes_rule_id = self .filterable_attributes .iter() - .position(|attribute| match attribute { - FilterableAttributesSettings::Field(field_name) => field_name == field, - FilterableAttributesSettings::Pattern(patterns) => { - patterns.patterns.match_str(field) - } - }) + .position(|attribute| attribute.match_str(field) == PatternMatch::Match) // saturating_add(1): make `id` `NonZero` .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); diff --git a/crates/milli/src/filterable_fields.rs b/crates/milli/src/filterable_fields.rs index b381cd924..10329f966 100644 --- a/crates/milli/src/filterable_fields.rs +++ b/crates/milli/src/filterable_fields.rs @@ -1,10 +1,12 @@ use deserr::{DeserializeError, Deserr, ValuePointerRef}; use serde::{Deserialize, Serialize}; -use std::collections::HashSet; +use std::collections::{BTreeSet, HashSet}; use utoipa::ToSchema; use crate::{ - constants::RESERVED_GEO_FIELD_NAME, is_faceted_by, AttributePatterns, FieldId, FieldsIdsMap, + attribute_patterns::{match_field_legacy, PatternMatch}, + constants::RESERVED_GEO_FIELD_NAME, + AttributePatterns, FieldId, FieldsIdsMap, }; #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, ToSchema)] @@ -15,17 +17,25 @@ pub enum FilterableAttributesSettings { } impl FilterableAttributesSettings { - pub fn match_str(&self, field: &str) -> bool { + pub fn match_str(&self, field: &str) -> PatternMatch { match self { - FilterableAttributesSettings::Field(field_name) => is_faceted_by(field, field_name), - FilterableAttributesSettings::Pattern(patterns) => patterns.patterns.match_str(field), + FilterableAttributesSettings::Field(pattern) => match_field_legacy(pattern, field), + FilterableAttributesSettings::Pattern(patterns) => patterns.match_str(field), } } pub fn has_geo(&self) -> bool { - /// TODO: This is a temporary solution to check if the geo field is activated. matches!(self, FilterableAttributesSettings::Field(field_name) if field_name == RESERVED_GEO_FIELD_NAME) } + + pub fn features(&self) -> FilterableAttributesFeatures { + match self { + FilterableAttributesSettings::Field(_) => { + FilterableAttributesFeatures::legacy_default() + } + FilterableAttributesSettings::Pattern(patterns) => patterns.features(), + } + } } #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)] @@ -36,12 +46,62 @@ pub struct FilterableAttributesPatterns { pub features: Option, } +impl FilterableAttributesPatterns { + pub fn match_str(&self, field: &str) -> PatternMatch { + self.patterns.match_str(field) + } + + pub fn features(&self) -> FilterableAttributesFeatures { + self.features.clone().unwrap_or_default() + } +} + #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub struct FilterableAttributesFeatures { - facet_search: Option, - filter: Option, + facet_search: bool, + filter: FilterFeature, +} + +impl Default for FilterableAttributesFeatures { + fn default() -> Self { + Self { facet_search: false, filter: FilterFeature::Equal } + } +} + +impl FilterableAttributesFeatures { + pub fn legacy_default() -> Self { + Self { facet_search: true, filter: FilterFeature::Order } + } + + pub fn no_features() -> Self { + Self { facet_search: false, filter: FilterFeature::Disabled } + } + + pub fn is_filterable(&self) -> bool { + self.filter != FilterFeature::Disabled + } + + /// Check if `IS NULL` is allowed + pub fn is_filterable_null(&self) -> bool { + self.filter != FilterFeature::Disabled + } + + /// Check if `IS EMPTY` is allowed + pub fn is_filterable_empty(&self) -> bool { + self.filter != FilterFeature::Disabled + } + + /// Check if `<`, `>`, `<=`, `>=` or `TO` are allowed + pub fn is_filterable_order(&self) -> bool { + self.filter == FilterFeature::Order + } + + /// Check if the facet search is allowed + pub fn is_facet_searchable(&self) -> bool { + self.facet_search + } } impl Deserr for FilterableAttributesSettings { @@ -59,6 +119,13 @@ impl Deserr for FilterableAttributesSettings { } } +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)] +pub enum FilterFeature { + Disabled, + Equal, + Order, +} + pub fn matching_field_ids( filterable_attributes: &[FilterableAttributesSettings], fields_ids_map: &FieldsIdsMap, @@ -66,10 +133,77 @@ pub fn matching_field_ids( let mut result = HashSet::new(); for (field_id, field_name) in fields_ids_map.iter() { for filterable_attribute in filterable_attributes { - if filterable_attribute.match_str(field_name) { + if filterable_attribute.match_str(field_name) == PatternMatch::Match { result.insert(field_id); } } } result } + +pub fn matching_field_names<'fim>( + filterable_attributes: &[FilterableAttributesSettings], + fields_ids_map: &'fim FieldsIdsMap, +) -> BTreeSet<&'fim str> { + filtered_matching_field_names(filterable_attributes, fields_ids_map, &|_| true) +} + +pub fn filtered_matching_field_names<'fim>( + filterable_attributes: &[FilterableAttributesSettings], + fields_ids_map: &'fim FieldsIdsMap, + filter: &impl Fn(&FilterableAttributesFeatures) -> bool, +) -> BTreeSet<&'fim str> { + let mut result = BTreeSet::new(); + for (_, field_name) in fields_ids_map.iter() { + for filterable_attribute in filterable_attributes { + if filterable_attribute.match_str(field_name) == PatternMatch::Match { + let features = filterable_attribute.features(); + if filter(&features) { + result.insert(field_name); + } + } + } + } + result +} + +pub fn matching_features( + field_name: &str, + filterable_attributes: &[FilterableAttributesSettings], +) -> Option { + for filterable_attribute in filterable_attributes { + if filterable_attribute.match_str(field_name) == PatternMatch::Match { + return Some(filterable_attribute.features()); + } + } + None +} + +pub fn is_field_filterable( + field_name: &str, + filterable_attributes: &[FilterableAttributesSettings], +) -> bool { + matching_features(field_name, filterable_attributes) + .map_or(false, |features| features.is_filterable()) +} + +pub fn match_pattern_by_features( + field_name: &str, + filterable_attributes: &[FilterableAttributesSettings], + filter: &impl Fn(&FilterableAttributesFeatures) -> bool, +) -> PatternMatch { + let mut selection = PatternMatch::NoMatch; + // Check if the field name matches any pattern that is facet searchable or filterable + for pattern in filterable_attributes { + let features = pattern.features(); + if filter(&features) { + match pattern.match_str(field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), + } + } + } + + selection +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index f7f591539..2e6b16fb6 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -803,7 +803,7 @@ impl Index { self.main.remap_key_type::().delete(wtxn, main_key::FILTERABLE_FIELDS_KEY) } - /// Returns the filterable fields names. + /// Returns the filterable fields setting value. pub fn filterable_fields( &self, rtxn: &RoTxn<'_>, @@ -815,7 +815,7 @@ impl Index { .unwrap_or_default()) } - /// Identical to `filterable_fields`, but returns ids instead. + /// Returns the filterable fields ids. pub fn filterable_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { let fields = self.filterable_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; @@ -878,67 +878,79 @@ impl Index { /// Returns true if the geo feature is activated. pub fn is_geo_activated(&self, rtxn: &RoTxn<'_>) -> Result { - let geo_filter = self.filterable_fields(rtxn)?.iter().any(|field| field.has_geo()); - let geo_sortable = self.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); + let geo_filter = self.is_geo_filtering_activated(rtxn)?; + let geo_sortable = self.is_geo_sorting_activated(rtxn)?; Ok(geo_filter || geo_sortable) } - /// Returns the faceted fields names. - pub fn faceted_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result> { - Ok(self - .main - .remap_types::>() - .get(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? - .unwrap_or_default()) + /// Returns true if the geo sorting feature is activated. + pub fn is_geo_sorting_activated(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_sortable = self.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); + Ok(geo_sortable) } - /// Identical to `faceted_fields`, but returns ids instead. - pub fn faceted_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields = self.faceted_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } - - Ok(fields_ids) + /// Returns true if the geo filtering feature is activated. + pub fn is_geo_filtering_activated(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_filter = self.filterable_fields(rtxn)?.iter().any(|field| field.has_geo()); + Ok(geo_filter) } + // /// Returns the faceted fields names. + // pub fn faceted_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + // Ok(self + // .main + // .remap_types::>() + // .get(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? + // .unwrap_or_default()) + // } + + // /// Identical to `faceted_fields`, but returns ids instead. + // pub fn faceted_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { + // let fields = self.faceted_fields(rtxn)?; + // let fields_ids_map = self.fields_ids_map(rtxn)?; + + // let mut fields_ids = HashSet::new(); + // for name in fields { + // if let Some(field_id) = fields_ids_map.id(&name) { + // fields_ids.insert(field_id); + // } + // } + + // Ok(fields_ids) + // } + /* faceted documents ids */ /// Returns the user defined faceted fields names. /// /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. - pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields_ids_map = self.fields_ids_map(rtxn)?; - let filterable_fields = self.filterable_fields_ids(rtxn)?; - let sortable_fields = self.sortable_fields(rtxn)?; - let distinct_field = self.distinct_field(rtxn)?; - let asc_desc_fields = - self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { - Criterion::Asc(field) | Criterion::Desc(field) => Some(field), - _otherwise => None, - }); + // pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn<'_>) -> Result> { + // let fields_ids_map = self.fields_ids_map(rtxn)?; + // let filterable_fields = self.filterable_fields_ids(rtxn)?; + // let sortable_fields = self.sortable_fields(rtxn)?; + // let distinct_field = self.distinct_field(rtxn)?; + // let asc_desc_fields = + // self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { + // Criterion::Asc(field) | Criterion::Desc(field) => Some(field), + // _otherwise => None, + // }); - let mut faceted_fields: HashSet<_> = filterable_fields - .into_iter() - .filter_map(|field_id| { - let field_name = fields_ids_map.name(field_id); - debug_assert!(field_name.is_some(), "field name not found for {field_id}"); - field_name.map(|field| field.to_string()) - }) - .collect(); - faceted_fields.extend(sortable_fields); - faceted_fields.extend(asc_desc_fields); - if let Some(field) = distinct_field { - faceted_fields.insert(field.to_owned()); - } + // let mut faceted_fields: HashSet<_> = filterable_fields + // .into_iter() + // .filter_map(|field_id| { + // let field_name = fields_ids_map.name(field_id); + // debug_assert!(field_name.is_some(), "field name not found for {field_id}"); + // field_name.map(|field| field.to_string()) + // }) + // .collect(); + // faceted_fields.extend(sortable_fields); + // faceted_fields.extend(asc_desc_fields); + // if let Some(field) = distinct_field { + // faceted_fields.insert(field.to_owned()); + // } - Ok(faceted_fields) - } + // Ok(faceted_fields) + // } /* faceted documents ids */ @@ -1728,7 +1740,7 @@ pub(crate) mod tests { use big_s::S; use bumpalo::Bump; use heed::{EnvOpenOptions, RwTxn}; - use maplit::{btreemap, hashset}; + use maplit::btreemap; use memmap2::Mmap; use tempfile::TempDir; @@ -1744,7 +1756,8 @@ pub(crate) mod tests { use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::vector::EmbeddingConfigs; use crate::{ - db_snap, obkv_to_json, Filter, Index, Search, SearchResult, ThreadPoolNoAbortBuilder, + db_snap, obkv_to_json, Filter, FilterableAttributesSettings, Index, Search, SearchResult, + ThreadPoolNoAbortBuilder, }; pub(crate) struct TempIndex { @@ -2174,7 +2187,9 @@ pub(crate) mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME) }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); index @@ -2282,7 +2297,9 @@ pub(crate) mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("doggo") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "doggo".to_string(), + )]); }) .unwrap(); index @@ -2319,15 +2336,14 @@ pub(crate) mod tests { #[test] fn replace_documents_external_ids_and_soft_deletion_check() { - use big_s::S; - use maplit::hashset; - let index = TempIndex::new(); index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("doggo") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "doggo".to_string(), + )]); }) .unwrap(); @@ -2860,8 +2876,9 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_string()); - settings - .set_filterable_fields(HashSet::from([RESERVED_GEO_FIELD_NAME.to_string()])); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); @@ -2895,8 +2912,9 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_string()); - settings - .set_filterable_fields(HashSet::from([RESERVED_GEO_FIELD_NAME.to_string()])); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); @@ -2929,7 +2947,9 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("name")]); - settings.set_filterable_fields(HashSet::from([S("age")])); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "age".to_string(), + )]); }) .unwrap(); @@ -2951,7 +2971,9 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("name"), S("realName")]); - settings.set_filterable_fields(HashSet::from([S("age")])); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "age".to_string(), + )]); }) .unwrap(); @@ -3050,7 +3072,10 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]); - settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]); + settings.set_filterable_fields(vec![ + FilterableAttributesSettings::Field("_vectors".to_string()), + FilterableAttributesSettings::Field("_vectors.doggo".to_string()), + ]); }) .unwrap(); diff --git a/crates/milli/src/localized_attributes_rules.rs b/crates/milli/src/localized_attributes_rules.rs index 16dd3fd07..df130b873 100644 --- a/crates/milli/src/localized_attributes_rules.rs +++ b/crates/milli/src/localized_attributes_rules.rs @@ -4,6 +4,7 @@ use charabia::Language; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; +use crate::attribute_patterns::PatternMatch; use crate::fields_ids_map::FieldsIdsMap; use crate::{AttributePatterns, FieldId}; @@ -27,7 +28,7 @@ impl LocalizedAttributesRule { Self { attribute_patterns: AttributePatterns::from(attribute_patterns), locales } } - pub fn match_str(&self, str: &str) -> bool { + pub fn match_str(&self, str: &str) -> PatternMatch { self.attribute_patterns.match_str(str) } @@ -57,7 +58,7 @@ impl LocalizedFieldIds { for (field_id, field_name) in fields { let mut locales = Vec::new(); for rule in rules { - if rule.match_str(field_name) { + if rule.match_str(field_name) == PatternMatch::Match { locales.extend(rule.locales.iter()); // Take the first rule that matches break; diff --git a/crates/milli/src/search/facet/facet_distribution.rs b/crates/milli/src/search/facet/facet_distribution.rs index 027f8b176..fa9e64a1c 100644 --- a/crates/milli/src/search/facet/facet_distribution.rs +++ b/crates/milli/src/search/facet/facet_distribution.rs @@ -11,6 +11,7 @@ use serde::{Deserialize, Serialize}; use crate::error::UserError; use crate::facet::FacetType; +use crate::filterable_fields::{is_field_filterable, matching_field_names}; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec, }; @@ -18,7 +19,7 @@ use crate::heed_codec::{BytesRefCodec, StrRefCodec}; use crate::search::facet::facet_distribution_iter::{ count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution, }; -use crate::{FieldId, Index, Result}; +use crate::{FieldId, FieldsIdsMap, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -281,32 +282,13 @@ impl<'a> FacetDistribution<'a> { pub fn compute_stats(&self) -> Result> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let filterable_fields = self.index.filterable_fields(self.rtxn)?; let candidates = if let Some(candidates) = self.candidates.clone() { candidates } else { return Ok(Default::default()); }; - let fields = match &self.facets { - Some(facets) => { - let invalid_fields: HashSet<_> = facets - .iter() - .map(|(name, _)| name) - .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) - .collect(); - if !invalid_fields.is_empty() { - return Err(UserError::InvalidFacetsDistribution { - invalid_facets_name: invalid_fields.into_iter().cloned().collect(), - valid_facets_name: filterable_fields.into_iter().collect(), - } - .into()); - } else { - facets.iter().map(|(name, _)| name).cloned().collect() - } - } - None => filterable_fields, - }; + let fields = self.faceted_fields_names(&fields_ids_map)?; let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { @@ -341,27 +323,8 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let filterable_fields = self.index.filterable_fields(self.rtxn)?; - let fields = match self.facets { - Some(ref facets) => { - let invalid_fields: HashSet<_> = facets - .iter() - .map(|(name, _)| name) - .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) - .collect(); - if !invalid_fields.is_empty() { - return Err(UserError::InvalidFacetsDistribution { - invalid_facets_name: invalid_fields.into_iter().cloned().collect(), - valid_facets_name: filterable_fields.into_iter().collect(), - } - .into()); - } else { - facets.iter().map(|(name, _)| name).cloned().collect() - } - } - None => filterable_fields, - }; + let fields = self.faceted_fields_names(&fields_ids_map)?; let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { @@ -378,6 +341,40 @@ impl<'a> FacetDistribution<'a> { Ok(distribution) } + + fn faceted_fields_names(&self, fields_ids_map: &FieldsIdsMap) -> Result> { + /// TODO: @many: this is a bit of a mess, we should refactor it + let filterable_fields = self.index.filterable_fields(self.rtxn)?; + let fields = match &self.facets { + Some(facets) => { + let invalid_fields: HashSet<_> = facets + .iter() + .map(|(name, _)| name) + .filter(|facet| !is_field_filterable(facet, &filterable_fields)) + .collect(); + if !invalid_fields.is_empty() { + let valid_facets_name = + matching_field_names(&filterable_fields, &fields_ids_map); + return Err(UserError::InvalidFacetsDistribution { + invalid_facets_name: invalid_fields.into_iter().cloned().collect(), + valid_facets_name: valid_facets_name + .into_iter() + .map(String::from) + .collect(), + } + .into()); + } else { + facets.iter().map(|(name, _)| name).cloned().collect() + } + } + None => matching_field_names(&filterable_fields, &fields_ids_map) + .into_iter() + .map(String::from) + .collect(), + }; + + Ok(fields) + } } impl fmt::Debug for FacetDistribution<'_> { @@ -405,11 +402,10 @@ mod tests { use std::iter; use big_s::S; - use maplit::hashset; use crate::documents::mmap_from_objects; use crate::index::tests::TempIndex; - use crate::{milli_snap, FacetDistribution, OrderBy}; + use crate::{milli_snap, FacetDistribution, FilterableAttributesSettings, OrderBy}; #[test] fn few_candidates_few_facet_values() { @@ -419,7 +415,10 @@ mod tests { let index = TempIndex::new(); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings + .set_filterable_fields(vec![FilterableAttributesSettings::Field(S("colour"))]) + }) .unwrap(); let documents = documents!([ @@ -490,7 +489,10 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings + .set_filterable_fields(vec![FilterableAttributesSettings::Field(S("colour"))]) + }) .unwrap(); let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"]; @@ -575,7 +577,10 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings + .set_filterable_fields(vec![FilterableAttributesSettings::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::>(); @@ -634,7 +639,10 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings + .set_filterable_fields(vec![FilterableAttributesSettings::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -685,7 +693,10 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings + .set_filterable_fields(vec![FilterableAttributesSettings::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -736,7 +747,10 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings + .set_filterable_fields(vec![FilterableAttributesSettings::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -787,7 +801,10 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings + .set_filterable_fields(vec![FilterableAttributesSettings::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 76f9ed6ff..97e59fc0a 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; @@ -12,13 +12,16 @@ use serde_json::Value; use super::facet_range_search; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::{Error, UserError}; +use crate::filterable_fields::{ + filtered_matching_field_names, is_field_filterable, matching_features, +}; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, OrderedF64Codec, }; use crate::index::db_name::FACET_ID_STRING_DOCIDS; use crate::{ - distance_between_two_points, lat_lng_to_xyz, FieldId, Index, InternalError, Result, - SerializationError, + distance_between_two_points, lat_lng_to_xyz, FieldId, FilterableAttributesFeatures, + FilterableAttributesSettings, Index, InternalError, Result, SerializationError, }; /// The maximum number of filters the filter AST can process. @@ -60,7 +63,7 @@ impl Display for BadGeoError { #[derive(Debug)] enum FilterError<'a> { - AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet }, + AttributeNotFilterable { attribute: &'a str, filterable_fields: BTreeSet<&'a str> }, ParseGeoError(BadGeoError), TooDeep, } @@ -233,10 +236,15 @@ impl<'a> Filter<'a> { let filterable_fields = index.filterable_fields(rtxn)?; for fid in self.condition.fids(MAX_FILTER_DEPTH) { let attribute = fid.value(); - if !crate::is_faceted(attribute, &filterable_fields) { + if !is_field_filterable(attribute, &filterable_fields) { + let fields_ids_map = index.fields_ids_map(rtxn)?; return Err(fid.as_external_error(FilterError::AttributeNotFilterable { attribute, - filterable_fields, + filterable_fields: filtered_matching_field_names( + &filterable_fields, + &fields_ids_map, + &|features| features.is_filterable(), + ), }))?; } } @@ -249,6 +257,7 @@ impl<'a> Filter<'a> { field_id: FieldId, universe: Option<&RoaringBitmap>, operator: &Condition<'a>, + features: &FilterableAttributesFeatures, ) -> Result { let numbers_db = index.facet_id_f64_docids; let strings_db = index.facet_id_string_docids; @@ -258,6 +267,17 @@ impl<'a> Filter<'a> { // field id and the level. let (left, right) = match operator { + // return an error if the filter is not allowed for this field + Condition::GreaterThan(_) + | Condition::GreaterThanOrEqual(_) + | Condition::LowerThan(_) + | Condition::LowerThanOrEqual(_) + | Condition::Between { .. } + if !features.is_filterable_order() => + { + /// TODO produce an dedicated error for this + todo!("filtering on non-ordered fields is not supported, return an error") + } Condition::GreaterThan(val) => { (Excluded(val.parse_finite_float()?), Included(f64::MAX)) } @@ -307,7 +327,8 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; + let docids = + Self::evaluate_operator(rtxn, index, field_id, None, &operator, features)?; let all_ids = index.documents_ids(rtxn)?; return Ok(all_ids - docids); } @@ -409,7 +430,7 @@ impl<'a> Filter<'a> { &self, rtxn: &heed::RoTxn<'_>, index: &Index, - filterable_fields: &HashSet, + filterable_fields: &[FilterableAttributesSettings], universe: Option<&RoaringBitmap>, ) -> Result { if universe.map_or(false, |u| u.is_empty()) { @@ -434,36 +455,56 @@ impl<'a> Filter<'a> { } } FilterCondition::In { fid, els } => { - if crate::is_faceted(fid.value(), filterable_fields) { - let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - els.iter() - .map(|el| Condition::Equal(el.clone())) - .map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op)) - .union() - } else { - Ok(RoaringBitmap::new()) + match matching_features(fid.value(), filterable_fields) { + Some(features) if features.is_filterable() => { + let field_ids_map = index.fields_ids_map(rtxn)?; + if let Some(fid) = field_ids_map.id(fid.value()) { + els.iter() + .map(|el| Condition::Equal(el.clone())) + .map(|op| { + Self::evaluate_operator( + rtxn, index, fid, universe, &op, &features, + ) + }) + .union() + } else { + Ok(RoaringBitmap::new()) + } + } + _ => { + let field_ids_map = index.fields_ids_map(rtxn)?; + Err(fid.as_external_error(FilterError::AttributeNotFilterable { + attribute: fid.value(), + filterable_fields: filtered_matching_field_names( + filterable_fields, + &field_ids_map, + &|features| features.is_filterable(), + ), + }))? } - } else { - Err(fid.as_external_error(FilterError::AttributeNotFilterable { - attribute: fid.value(), - filterable_fields: filterable_fields.clone(), - }))? } } FilterCondition::Condition { fid, op } => { - if crate::is_faceted(fid.value(), filterable_fields) { - let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, universe, op) - } else { - Ok(RoaringBitmap::new()) + match matching_features(fid.value(), filterable_fields) { + Some(features) if features.is_filterable() => { + let field_ids_map = index.fields_ids_map(rtxn)?; + if let Some(fid) = field_ids_map.id(fid.value()) { + Self::evaluate_operator(rtxn, index, fid, universe, op, &features) + } else { + Ok(RoaringBitmap::new()) + } + } + _ => { + let field_ids_map = index.fields_ids_map(rtxn)?; + Err(fid.as_external_error(FilterError::AttributeNotFilterable { + attribute: fid.value(), + filterable_fields: filtered_matching_field_names( + filterable_fields, + &field_ids_map, + &|features| features.is_filterable(), + ), + }))? } - } else { - Err(fid.as_external_error(FilterError::AttributeNotFilterable { - attribute: fid.value(), - filterable_fields: filterable_fields.clone(), - }))? } } FilterCondition::Or(subfilters) => subfilters @@ -502,7 +543,7 @@ impl<'a> Filter<'a> { } } FilterCondition::GeoLowerThan { point, radius } => { - if filterable_fields.contains(RESERVED_GEO_FIELD_NAME) { + if index.is_geo_filtering_activated(rtxn)? { let base_point: [f64; 2] = [point[0].parse_finite_float()?, point[1].parse_finite_float()?]; if !(-90.0..=90.0).contains(&base_point[0]) { @@ -530,14 +571,19 @@ impl<'a> Filter<'a> { Ok(result) } else { + let field_ids_map = index.fields_ids_map(rtxn)?; Err(point[0].as_external_error(FilterError::AttributeNotFilterable { attribute: RESERVED_GEO_FIELD_NAME, - filterable_fields: filterable_fields.clone(), + filterable_fields: filtered_matching_field_names( + filterable_fields, + &field_ids_map, + &|features| features.is_filterable(), + ), }))? } } FilterCondition::GeoBoundingBox { top_right_point, bottom_left_point } => { - if filterable_fields.contains(RESERVED_GEO_FIELD_NAME) { + if index.is_geo_filtering_activated(rtxn)? { let top_right: [f64; 2] = [ top_right_point[0].parse_finite_float()?, top_right_point[1].parse_finite_float()?, @@ -662,10 +708,15 @@ impl<'a> Filter<'a> { Ok(selected_lat & selected_lng) } else { + let field_ids_map = index.fields_ids_map(rtxn)?; Err(top_right_point[0].as_external_error( FilterError::AttributeNotFilterable { attribute: RESERVED_GEO_FIELD_NAME, - filterable_fields: filterable_fields.clone(), + filterable_fields: filtered_matching_field_names( + filterable_fields, + &field_ids_map, + &|features| features.is_filterable(), + ), }, ))? } @@ -687,12 +738,11 @@ mod tests { use big_s::S; use either::Either; - use maplit::hashset; use roaring::RoaringBitmap; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::index::tests::TempIndex; - use crate::Filter; + use crate::{Filter, FilterableAttributesSettings}; #[test] fn empty_db() { @@ -700,7 +750,9 @@ mod tests { //Set the filterable fields to be the channel. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("PrIcE") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "PrIcE".to_string(), + )]); }) .unwrap(); @@ -804,7 +856,9 @@ mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("title")]); - settings.set_filterable_fields(hashset! { S("title") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "title".to_string(), + )]); }) .unwrap(); @@ -870,7 +924,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S("monitor_diagonal"))); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "monitor_diagonal".to_string(), + )]); }) .unwrap(); @@ -901,7 +957,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME) }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field(S( + RESERVED_GEO_FIELD_NAME, + ))]); }) .unwrap(); @@ -948,7 +1006,10 @@ mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S(RESERVED_GEO_FIELD_NAME), S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME), S("price") }); + settings.set_filterable_fields(vec![ + FilterableAttributesSettings::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesSettings::Field("price".to_string()), + ]); }) .unwrap(); @@ -998,7 +1059,10 @@ mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S(RESERVED_GEO_FIELD_NAME), S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME), S("price") }); + settings.set_filterable_fields(vec![ + FilterableAttributesSettings::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesSettings::Field("price".to_string()), + ]); }) .unwrap(); @@ -1108,7 +1172,9 @@ mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S("price") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "price".to_string(), + )]); }) .unwrap(); index @@ -1164,7 +1230,11 @@ mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("id"), S("one"), S("two") }); + settings.set_filterable_fields(vec![ + FilterableAttributesSettings::Field("id".to_string()), + FilterableAttributesSettings::Field("one".to_string()), + FilterableAttributesSettings::Field("two".to_string()), + ]); }) .unwrap(); diff --git a/crates/milli/src/search/facet/search.rs b/crates/milli/src/search/facet/search.rs index cdba7ee16..4a0e58b33 100644 --- a/crates/milli/src/search/facet/search.rs +++ b/crates/milli/src/search/facet/search.rs @@ -10,6 +10,7 @@ use roaring::RoaringBitmap; use tracing::error; use crate::error::UserError; +use crate::filterable_fields::{is_field_filterable, matching_field_names}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::search::build_dfa; use crate::{DocumentId, FieldId, OrderBy, Result, Search}; @@ -74,9 +75,11 @@ impl<'a> SearchForFacetValues<'a> { let rtxn = self.search_query.rtxn; let filterable_fields = index.filterable_fields(rtxn)?; - if !filterable_fields.contains(&self.facet) { + if !is_field_filterable(&self.facet, &filterable_fields) { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let matching_field_names = matching_field_names(&filterable_fields, &fields_ids_map); let (valid_fields, hidden_fields) = - index.remove_hidden_fields(rtxn, filterable_fields)?; + index.remove_hidden_fields(rtxn, matching_field_names)?; return Err(UserError::InvalidFacetSearchFacetName { field: self.facet.clone(), diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index d5b05f515..80a3ccc2c 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -9,6 +9,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult}; +use crate::filterable_fields::{is_field_filterable, matching_field_names}; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::vector::Embedder; use crate::{ @@ -188,9 +189,15 @@ impl<'a> Search<'a> { if let Some(distinct) = &self.distinct { let filterable_fields = ctx.index.filterable_fields(ctx.txn)?; - if !crate::is_faceted(distinct, &filterable_fields) { + // check if the distinct field is in the filterable fields + if !is_field_filterable(distinct, &filterable_fields) { + // if not, remove the hidden fields from the filterable fields to generate the error message + let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?; + let matching_field_names = + matching_field_names(&filterable_fields, &fields_ids_map); let (valid_fields, hidden_fields) = - ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?; + ctx.index.remove_hidden_fields(ctx.txn, matching_field_names)?; + // and return the error return Err(Error::UserError(UserError::InvalidDistinctAttribute { field: distinct.clone(), valid_fields, diff --git a/crates/milli/src/search/new/tests/cutoff.rs b/crates/milli/src/search/new/tests/cutoff.rs index 63b67f2e7..99cb8f1d3 100644 --- a/crates/milli/src/search/new/tests/cutoff.rs +++ b/crates/milli/src/search/new/tests/cutoff.rs @@ -5,13 +5,11 @@ use std::time::Duration; -use big_s::S; -use maplit::hashset; use meili_snap::snapshot; use crate::index::tests::TempIndex; use crate::score_details::{ScoreDetails, ScoringStrategy}; -use crate::{Criterion, Filter, Search, TimeBudget}; +use crate::{Criterion, Filter, FilterableAttributesSettings, Search, TimeBudget}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -20,7 +18,7 @@ fn create_index() -> TempIndex { .update_settings(|s| { s.set_primary_key("id".to_owned()); s.set_searchable_fields(vec!["text".to_owned()]); - s.set_filterable_fields(hashset! { S("id") }); + s.set_filterable_fields(vec![FilterableAttributesSettings::Field("id".to_owned())]); s.set_criteria(vec![Criterion::Words, Criterion::Typo]); }) .unwrap(); diff --git a/crates/milli/src/search/new/tests/distinct.rs b/crates/milli/src/search/new/tests/distinct.rs index dd27bfc8a..492f3961d 100644 --- a/crates/milli/src/search/new/tests/distinct.rs +++ b/crates/milli/src/search/new/tests/distinct.rs @@ -19,7 +19,10 @@ use maplit::hashset; use super::collect_field_values; use crate::index::tests::TempIndex; -use crate::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + AscDesc, Criterion, FilterableAttributesSettings, Index, Member, Search, SearchResult, + TermsMatchingStrategy, +}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -236,7 +239,7 @@ fn test_distinct_placeholder_no_ranking_rules() { // Set the letter as filterable and unset the distinct attribute. index .update_settings(|s| { - s.set_filterable_fields(hashset! { S("letter") }); + s.set_filterable_fields(vec![FilterableAttributesSettings::Field("letter".to_owned())]); s.reset_distinct_field(); }) .unwrap(); diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index fc15b5f12..cbed3b79f 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -9,7 +9,7 @@ use crate::progress::Progress; use crate::update::new::indexer; use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use crate::vector::EmbeddingConfigs; -use crate::{db_snap, Criterion, Index}; +use crate::{db_snap, Criterion, FilterableAttributesSettings, Index}; pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); use crate::constants::RESERVED_GEO_FIELD_NAME; @@ -25,14 +25,14 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.to_vec()); - builder.set_filterable_fields(hashset! { - S("tag"), - S("asc_desc_rank"), - S(RESERVED_GEO_FIELD_NAME), - S("opt1"), - S("opt1.opt2"), - S("tag_in") - }); + builder.set_filterable_fields(vec![ + FilterableAttributesSettings::Field(S("tag")), + FilterableAttributesSettings::Field(S("asc_desc_rank")), + FilterableAttributesSettings::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesSettings::Field(S("opt1")), + FilterableAttributesSettings::Field(S("opt1.opt2")), + FilterableAttributesSettings::Field(S("tag_in")), + ]); builder.set_sortable_fields(hashset! { S("tag"), S("asc_desc_rank"), diff --git a/crates/milli/src/update/facet/bulk.rs b/crates/milli/src/update/facet/bulk.rs index 1ab8740ed..dfbdeea77 100644 --- a/crates/milli/src/update/facet/bulk.rs +++ b/crates/milli/src/update/facet/bulk.rs @@ -374,7 +374,7 @@ mod tests { use crate::heed_codec::StrRefCodec; use crate::index::tests::TempIndex; use crate::update::facet::test_helpers::{ordered_string, FacetIndex}; - use crate::{db_snap, milli_snap}; + use crate::{db_snap, milli_snap, FilterableAttributesSettings}; #[test] fn insert() { @@ -474,7 +474,9 @@ mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("id") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "id".to_string(), + )]); }) .unwrap(); diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index f2c5fe2b9..df66961e2 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -760,18 +760,19 @@ mod tests { use bumpalo::Bump; use fst::IntoStreamer; use heed::RwTxn; - use maplit::hashset; + use maplit::{btreeset, hashset}; use super::*; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::documents::mmap_from_objects; + use crate::filterable_fields::filtered_matching_field_names; use crate::index::tests::TempIndex; use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; - use crate::{db_snap, Filter, Search, UserError}; + use crate::{db_snap, Filter, FilterableAttributesSettings, Search, UserError}; #[test] fn simple_document_replacement() { @@ -1001,7 +1002,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); } @@ -1013,7 +1016,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); @@ -1229,15 +1234,24 @@ mod tests { let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; settings.set_searchable_fields(searchable_fields); - let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); + let faceted_fields = vec![ + FilterableAttributesSettings::Field("title".to_string()), + FilterableAttributesSettings::Field("nested.object".to_string()), + FilterableAttributesSettings::Field("nested.machin".to_string()), + ]; settings.set_filterable_fields(faceted_fields); }) .unwrap(); let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin"))); + let filterable_fields = index.filterable_fields(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); + assert_eq!(facets, btreeset!("title", "nested.object", "nested.machin")); // testing the simple query search let mut search = crate::Search::new(&rtxn, &index); @@ -1433,7 +1447,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(String::from("dog"))); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "dog".to_string(), + )]); }) .unwrap(); @@ -1452,9 +1468,14 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let hidden = index.faceted_fields(&rtxn).unwrap(); + let filterable_fields = index.filterable_fields(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); - assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain"))); + assert_eq!(facets, btreeset!("dog", "dog.race", "dog.race.bernese mountain")); for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { let mut search = crate::Search::new(&rtxn, &index); @@ -1475,9 +1496,14 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); + let filterable_fields = index.filterable_fields(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); - assert_eq!(facets, hashset!()); + assert_eq!(facets, btreeset!()); // update the settings to test the sortable index @@ -1501,9 +1527,14 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); + let filterable_fields = index.filterable_fields(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); - assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain"))); + assert_eq!(facets, btreeset!("dog.race", "dog.race.bernese mountain")); let mut search = crate::Search::new(&rtxn, &index); search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S( @@ -1712,8 +1743,13 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); + let filterable_fields = index.filterable_fields(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); + assert_eq!(facets, btreeset!("colour", "colour.green", "colour.green.blue")); let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); @@ -1733,7 +1769,7 @@ mod tests { assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![7]); }; - let faceted_fields = hashset!(S("colour")); + let faceted_fields = vec![FilterableAttributesSettings::Field("colour".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1818,8 +1854,13 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); + let filterable_fields = index.filterable_fields(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); + assert_eq!(facets, btreeset!("colour", "colour.green", "colour.green.blue")); let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); @@ -1839,7 +1880,7 @@ mod tests { assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![3]); }; - let faceted_fields = hashset!(S("colour")); + let faceted_fields = vec![FilterableAttributesSettings::Field("colour".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1882,8 +1923,13 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue"))); + let filterable_fields = index.filterable_fields(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); + assert_eq!(facets, btreeset!("tags", "tags.green", "tags.green.blue")); let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap(); let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap(); @@ -1902,7 +1948,7 @@ mod tests { assert_eq!(bitmap_tags_blue.into_iter().collect::>(), vec![12]); }; - let faceted_fields = hashset!(S("tags")); + let faceted_fields = vec![FilterableAttributesSettings::Field("tags".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -2086,7 +2132,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("title") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "title".to_string(), + )]); }) .unwrap(); @@ -2941,7 +2989,10 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label"), S("label2") }); + settings.set_filterable_fields(vec![ + FilterableAttributesSettings::Field("label".to_string()), + FilterableAttributesSettings::Field("label2".to_string()), + ]); }) .unwrap(); wtxn.commit().unwrap(); @@ -3120,7 +3171,9 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("id")); - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); settings.set_sortable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); }) .unwrap(); diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 1644b2254..647be1c12 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -4,10 +4,10 @@ use heed::RoTxn; use super::document::{ Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, }; -use super::extract::perm_json_p; use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; +use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::vector::EmbeddingConfigs; use crate::{DocumentId, Index, Result}; @@ -173,7 +173,7 @@ impl<'doc> Update<'doc> { /// Otherwise `false`. pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( &self, - fields: Option<&[&str]>, + selector: &mut impl FnMut(&str) -> PatternMatch, rtxn: &'t RoTxn, index: &'t Index, mapper: &'t Mapper, @@ -185,7 +185,7 @@ impl<'doc> Update<'doc> { for entry in self.updated().iter_top_level_fields() { let (key, updated_value) = entry?; - if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + if selector(key) == PatternMatch::NoMatch { continue; } @@ -229,7 +229,7 @@ impl<'doc> Update<'doc> { for entry in current.iter_top_level_fields() { let (key, _) = entry?; - if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + if selector(key) == PatternMatch::NoMatch { continue; } current_selected_field_count += 1; diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 7e0484e39..2c74238de 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -5,12 +5,12 @@ use std::ops::DerefMut as _; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use hashbrown::HashMap; -use heed::RoTxn; use serde_json::Value; use super::super::cache::BalancedCaches; use super::facet_document::extract_document_facets; -use super::FacetKind; +use super::{match_faceted_field, FacetKind}; +use crate::fields_ids_map::metadata::Metadata; use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; @@ -23,13 +23,17 @@ use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; -use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; +use crate::{ + DocumentId, FieldId, FilterableAttributesFeatures, FilterableAttributesSettings, Result, + MAX_FACET_VALUE_LENGTH, +}; pub struct FacetedExtractorData<'a, 'b> { - attributes_to_extract: &'a [&'a str], sender: &'a FieldIdDocidFacetSender<'a, 'b>, grenad_parameters: &'a GrenadParameters, buckets: usize, + filterable_attributes: Vec, + sortable_fields: HashSet, } impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> { @@ -52,7 +56,8 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> let change = change?; FacetedDocidsExtractor::extract_document_change( context, - self.attributes_to_extract, + &self.filterable_attributes, + &self.sortable_fields, change, self.sender, )? @@ -66,11 +71,12 @@ pub struct FacetedDocidsExtractor; impl FacetedDocidsExtractor { fn extract_document_change( context: &DocumentChangeContext>, - attributes_to_extract: &[&str], + filterable_attributes: &[FilterableAttributesSettings], + sortable_fields: &HashSet, document_change: DocumentChange, sender: &FieldIdDocidFacetSender, ) -> Result<()> { - let index = &context.index; + let index = context.index; let rtxn = &context.rtxn; let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let mut cached_sorter = context.data.borrow_mut_or_yield(); @@ -78,11 +84,13 @@ impl FacetedDocidsExtractor { let docid = document_change.docid(); let res = match document_change { DocumentChange::Deletion(inner) => extract_document_facets( - attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + &mut |fid, meta, depth, value| { + let features = meta.filterable_attributes_features(filterable_attributes); Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -91,6 +99,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_del, docid, fid, + meta, + features, depth, value, ) @@ -98,7 +108,9 @@ impl FacetedDocidsExtractor { ), DocumentChange::Update(inner) => { if !inner.has_changed_for_fields( - Some(attributes_to_extract), + &mut |field_name| { + match_faceted_field(field_name, filterable_attributes, sortable_fields) + }, rtxn, index, context.db_fields_ids_map, @@ -107,11 +119,13 @@ impl FacetedDocidsExtractor { } extract_document_facets( - attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + &mut |fid, meta, depth, value| { + let features = meta.filterable_attributes_features(filterable_attributes); Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -120,6 +134,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_del, docid, fid, + meta, + features, depth, value, ) @@ -127,11 +143,13 @@ impl FacetedDocidsExtractor { )?; extract_document_facets( - attributes_to_extract, inner.merged(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + &mut |fid, meta, depth, value| { + let features = meta.filterable_attributes_features(filterable_attributes); Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -140,6 +158,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_add, docid, fid, + meta, + features, depth, value, ) @@ -147,11 +167,13 @@ impl FacetedDocidsExtractor { ) } DocumentChange::Insertion(inner) => extract_document_facets( - attributes_to_extract, inner.inserted(), inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + &mut |fid, meta, depth, value| { + let features = meta.filterable_attributes_features(filterable_attributes); Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -160,6 +182,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_add, docid, fid, + meta, + features, depth, value, ) @@ -180,9 +204,16 @@ impl FacetedDocidsExtractor { facet_fn: impl Fn(&mut DelAddFacetValue<'doc>, FieldId, BVec<'doc, u8>, FacetKind), docid: DocumentId, fid: FieldId, + meta: Metadata, + features: FilterableAttributesFeatures, depth: perm_json_p::Depth, value: &Value, ) -> Result<()> { + // if the field is not filterable, facet searchable or sortable, do nothing + if !(features.is_filterable() || features.is_facet_searchable() || meta.is_sortable()) { + return Ok(()); + } + let mut buffer = BVec::new_in(doc_alloc); // Exists // key: fid @@ -246,7 +277,9 @@ impl FacetedDocidsExtractor { } // Null // key: fid - Value::Null if depth == perm_json_p::Depth::OnBaseKey => { + Value::Null + if depth == perm_json_p::Depth::OnBaseKey && features.is_filterable_null() => + { buffer.clear(); buffer.push(FacetKind::Null as u8); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -254,19 +287,29 @@ impl FacetedDocidsExtractor { } // Empty // key: fid - Value::Array(a) if a.is_empty() && depth == perm_json_p::Depth::OnBaseKey => { + Value::Array(a) + if a.is_empty() + && depth == perm_json_p::Depth::OnBaseKey + && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } - Value::String(_) if depth == perm_json_p::Depth::OnBaseKey => { + Value::String(_) + if depth == perm_json_p::Depth::OnBaseKey && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } - Value::Object(o) if o.is_empty() && depth == perm_json_p::Depth::OnBaseKey => { + Value::Object(o) + if o.is_empty() + && depth == perm_json_p::Depth::OnBaseKey + && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -276,10 +319,6 @@ impl FacetedDocidsExtractor { _ => Ok(()), } } - - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } } struct DelAddFacetValue<'doc> { @@ -385,9 +424,8 @@ impl FacetedDocidsExtractor { { let index = indexing_context.index; let rtxn = index.read_txn()?; - let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; - let attributes_to_extract: Vec<_> = - attributes_to_extract.iter().map(|s| s.as_ref()).collect(); + let filterable_attributes = index.filterable_fields(&rtxn)?; + let sortable_fields = index.sortable_fields(&rtxn)?; let datastore = ThreadLocal::new(); { @@ -396,10 +434,11 @@ impl FacetedDocidsExtractor { let _entered = span.enter(); let extractor = FacetedExtractorData { - attributes_to_extract: &attributes_to_extract, grenad_parameters: indexing_context.grenad_parameters, buckets: rayon::current_num_threads(), sender, + filterable_attributes, + sortable_fields, }; extract( document_changes, diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index 8d582d103..235804204 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,46 +1,70 @@ +use std::collections::HashSet; + use serde_json::Value; -use crate::constants::RESERVED_GEO_FIELD_NAME; +use crate::attribute_patterns::PatternMatch; +use crate::fields_ids_map::metadata::Metadata; use crate::update::new::document::Document; use crate::update::new::extract::geo::extract_geo_coordinates; use crate::update::new::extract::perm_json_p; -use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; +use crate::{ + FieldId, FilterableAttributesSettings, GlobalFieldsIdsMap, InternalError, Result, UserError, +}; + +use super::match_faceted_field; pub fn extract_document_facets<'doc>( - attributes_to_extract: &[&str], document: impl Document<'doc>, external_document_id: &str, field_id_map: &mut GlobalFieldsIdsMap, - facet_fn: &mut impl FnMut(FieldId, perm_json_p::Depth, &Value) -> Result<()>, + filterable_attributes: &[FilterableAttributesSettings], + sortable_fields: &HashSet, + facet_fn: &mut impl FnMut(FieldId, Metadata, perm_json_p::Depth, &Value) -> Result<()>, ) -> Result<()> { + // return the match result for the given field name. + let match_field = |field_name: &str| -> PatternMatch { + match_faceted_field(field_name, filterable_attributes, sortable_fields) + }; + + // extract the field if it is faceted (facet searchable, filterable, sortable) + let mut extract_field = |name: &str, depth: perm_json_p::Depth, value: &Value| -> Result<()> { + match field_id_map.id_with_metadata_or_insert(name) { + Some((field_id, meta)) => { + facet_fn(field_id, meta, depth, value)?; + + Ok(()) + } + None => Err(UserError::AttributeLimitReached.into()), + } + }; + for res in document.iter_top_level_fields() { let (field_name, value) = res?; + let selection = match_field(field_name); - let mut tokenize_field = - |name: &str, depth: perm_json_p::Depth, value: &Value| match field_id_map - .id_or_insert(name) - { - Some(field_id) => facet_fn(field_id, depth, value), - None => Err(UserError::AttributeLimitReached.into()), - }; + // extract the field if it matches a pattern and if it is faceted (facet searchable, filterable, sortable) + let mut match_and_extract = |name: &str, depth: perm_json_p::Depth, value: &Value| { + let selection = match_field(name); + if selection == PatternMatch::Match { + extract_field(name, depth, value)?; + } - // if the current field is searchable or contains a searchable attribute - let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]); - if selection != perm_json_p::Selection::Skip { + Ok(selection) + }; + + if selection != PatternMatch::NoMatch { // parse json. match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? { Value::Object(object) => { perm_json_p::seek_leaf_values_in_object( &object, - Some(attributes_to_extract), - &[], // skip no attributes field_name, perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, + &mut match_and_extract, )?; - if selection == perm_json_p::Selection::Select { - tokenize_field( + if selection == PatternMatch::Match { + extract_field( field_name, perm_json_p::Depth::OnBaseKey, &Value::Object(object), @@ -50,36 +74,34 @@ pub fn extract_document_facets<'doc>( Value::Array(array) => { perm_json_p::seek_leaf_values_in_array( &array, - Some(attributes_to_extract), - &[], // skip no attributes field_name, perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, + &mut match_and_extract, )?; - if selection == perm_json_p::Selection::Select { - tokenize_field( + if selection == PatternMatch::Match { + extract_field( field_name, perm_json_p::Depth::OnBaseKey, &Value::Array(array), )?; } } - value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, + value => extract_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, } } } - if attributes_to_extract.contains(&RESERVED_GEO_FIELD_NAME) { + if filterable_attributes.iter().any(|rule| rule.has_geo()) { if let Some(geo_value) = document.geo_field()? { if let Some([lat, lng]) = extract_geo_coordinates(external_document_id, geo_value)? { - let (lat_fid, lng_fid) = field_id_map - .id_or_insert("_geo.lat") - .zip(field_id_map.id_or_insert("_geo.lng")) + let ((lat_fid, lat_meta), (lng_fid, lng_meta)) = field_id_map + .id_with_metadata_or_insert("_geo.lat") + .zip(field_id_map.id_with_metadata_or_insert("_geo.lng")) .ok_or(UserError::AttributeLimitReached)?; - facet_fn(lat_fid, perm_json_p::Depth::OnBaseKey, &lat.into())?; - facet_fn(lng_fid, perm_json_p::Depth::OnBaseKey, &lng.into())?; + facet_fn(lat_fid, lat_meta, perm_json_p::Depth::OnBaseKey, &lat.into())?; + facet_fn(lng_fid, lng_meta, perm_json_p::Depth::OnBaseKey, &lng.into())?; } } } diff --git a/crates/milli/src/update/new/extract/faceted/mod.rs b/crates/milli/src/update/new/extract/faceted/mod.rs index 0c012d739..a92a3da08 100644 --- a/crates/milli/src/update/new/extract/faceted/mod.rs +++ b/crates/milli/src/update/new/extract/faceted/mod.rs @@ -1,8 +1,16 @@ mod extract_facets; mod facet_document; +use std::collections::HashSet; + pub use extract_facets::FacetedDocidsExtractor; +use crate::{ + attribute_patterns::{match_field_legacy, PatternMatch}, + filterable_fields::match_pattern_by_features, + FilterableAttributesSettings, +}; + #[repr(u8)] #[derive(Debug, Clone, Copy)] pub enum FacetKind { @@ -31,3 +39,30 @@ impl FacetKind { (FacetKind::from(key[0]), &key[1..]) } } + +pub fn match_faceted_field( + field_name: &str, + filterable_fields: &[FilterableAttributesSettings], + sortable_fields: &HashSet, +) -> PatternMatch { + // Check if the field matches any filterable or facet searchable field + let mut selection = match_pattern_by_features(field_name, &filterable_fields, &|features| { + features.is_facet_searchable() || features.is_filterable() + }); + + // If the field matches the pattern, return Match + if selection == PatternMatch::Match { + return selection; + } + + // Otherwise, check if the field matches any sortable field + for pattern in sortable_fields { + match match_field_legacy(&pattern, field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), + } + } + + selection +} diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index aa0a3d333..412ec1f2b 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -35,7 +35,7 @@ pub trait DocidsExtractor { pub mod perm_json_p { use serde_json::{Map, Value}; - use crate::Result; + use crate::{attribute_patterns::PatternMatch, Result}; const SPLIT_SYMBOL: char = '.'; /// Returns `true` if the `selector` match the `key`. @@ -68,11 +68,9 @@ pub mod perm_json_p { pub fn seek_leaf_values_in_object( value: &Map, - selectors: Option<&[&str]>, - skip_selectors: &[&str], base_key: &str, base_depth: Depth, - seeker: &mut impl FnMut(&str, Depth, &Value) -> Result<()>, + seeker: &mut impl FnMut(&str, Depth, &Value) -> Result, ) -> Result<()> { if value.is_empty() { seeker(base_key, base_depth, &Value::Object(Map::with_capacity(0)))?; @@ -85,40 +83,16 @@ pub mod perm_json_p { format!("{}{}{}", base_key, SPLIT_SYMBOL, key) }; - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side - let selection = select_field(&base_key, selectors, skip_selectors); - if selection != Selection::Skip { + let selection = seeker(&base_key, Depth::OnBaseKey, value)?; + if selection != PatternMatch::NoMatch { match value { Value::Object(object) => { - if selection == Selection::Select { - seeker(&base_key, Depth::OnBaseKey, value)?; - } - - seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ) + seek_leaf_values_in_object(object, &base_key, Depth::OnBaseKey, seeker) } Value::Array(array) => { - if selection == Selection::Select { - seeker(&base_key, Depth::OnBaseKey, value)?; - } - - seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ) + seek_leaf_values_in_array(array, &base_key, Depth::OnBaseKey, seeker) } - value => seeker(&base_key, Depth::OnBaseKey, value), + _ => Ok(()), }?; } } @@ -128,11 +102,9 @@ pub mod perm_json_p { pub fn seek_leaf_values_in_array( values: &[Value], - selectors: Option<&[&str]>, - skip_selectors: &[&str], base_key: &str, base_depth: Depth, - seeker: &mut impl FnMut(&str, Depth, &Value) -> Result<()>, + seeker: &mut impl FnMut(&str, Depth, &Value) -> Result, ) -> Result<()> { if values.is_empty() { seeker(base_key, base_depth, &Value::Array(vec![]))?; @@ -140,61 +112,16 @@ pub mod perm_json_p { for value in values { match value { - Value::Object(object) => seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - base_key, - Depth::InsideArray, - seeker, - ), - Value::Array(array) => seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - base_key, - Depth::InsideArray, - seeker, - ), - value => seeker(base_key, Depth::InsideArray, value), + Value::Object(object) => { + seek_leaf_values_in_object(object, base_key, Depth::InsideArray, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, base_key, Depth::InsideArray, seeker) + } + value => seeker(base_key, Depth::InsideArray, value).map(|_| ()), }?; } Ok(()) } - - pub fn select_field( - field_name: &str, - selectors: Option<&[&str]>, - skip_selectors: &[&str], - ) -> Selection { - if skip_selectors.iter().any(|skip_selector| { - contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector) - }) { - Selection::Skip - } else if let Some(selectors) = selectors { - let mut selection = Selection::Skip; - for selector in selectors { - if contained_in(field_name, selector) { - selection = Selection::Select; - break; - } else if contained_in(selector, field_name) { - selection = Selection::Parent; - } - } - selection - } else { - Selection::Select - } - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq)] - pub enum Selection { - /// The field is a parent of the of a nested field that must be selected - Parent, - /// The field must be selected - Select, - /// The field must be skipped - Skip, - } } diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index d1ff6096d..84de420f7 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -9,6 +9,7 @@ use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn}; use super::fst_merger_builder::FstMergerBuilder; use super::KvReaderDelAdd; +use crate::attribute_patterns::PatternMatch; use crate::heed_codec::facet::FacetGroupKey; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::{create_sorter, MergeDeladdBtreesetString}; @@ -92,7 +93,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { let locales = self .localized_attributes_rules .iter() - .find(|rule| rule.match_str(field_name)) + .find(|rule| rule.match_str(field_name) == PatternMatch::Match) .map(|rule| rule.locales.clone()); e.insert(locales); diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 85259c2d0..c780f3711 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -31,7 +31,10 @@ use crate::vector::settings::{ WriteBackToDocuments, }; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; -use crate::{FieldId, FieldsIdsMap, Index, LocalizedAttributesRule, LocalizedFieldIds, Result}; +use crate::{ + FieldId, FieldsIdsMap, FilterableAttributesSettings, Index, LocalizedAttributesRule, + LocalizedFieldIds, Result, +}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -155,7 +158,7 @@ pub struct Settings<'a, 't, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, - filterable_fields: Setting>, + filterable_fields: Setting>, sortable_fields: Setting>, criteria: Setting>, stop_words: Setting>, @@ -241,8 +244,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.filterable_fields = Setting::Reset; } - pub fn set_filterable_fields(&mut self, names: HashSet) { - self.filterable_fields = Setting::Set(names); + pub fn set_filterable_fields(&mut self, rules: Vec) { + self.filterable_fields = Setting::Set(rules); } pub fn set_sortable_fields(&mut self, names: HashSet) { @@ -760,11 +763,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { - let mut new_facets = HashSet::new(); - for name in fields { - new_facets.insert(name.clone()); - } - self.index.put_filterable_fields(self.wtxn, &new_facets)?; + self.index.put_filterable_fields(self.wtxn, fields)?; } Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; @@ -1895,7 +1894,7 @@ pub fn validate_embedding_settings( mod tests { use big_s::S; use heed::types::Bytes; - use maplit::{btreemap, btreeset, hashset}; + use maplit::{btreemap, btreeset}; use meili_snap::snapshot; use super::*; @@ -2105,7 +2104,9 @@ mod tests { // Set the filterable fields to be the age. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("age") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "age".to_string(), + )]); }) .unwrap(); @@ -2121,7 +2122,7 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("age") }); + assert_eq!(fields_ids, vec![FilterableAttributesSettings::Field("age".to_string(),)]); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. let fidmap = index.fields_ids_map(&rtxn).unwrap(); @@ -2163,14 +2164,23 @@ mod tests { // Set the filterable fields to be the age and the name. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("age"), S("name") }); + settings.set_filterable_fields(vec![ + FilterableAttributesSettings::Field("age".to_string()), + FilterableAttributesSettings::Field("name".to_string()), + ]); }) .unwrap(); // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("age"), S("name") }); + assert_eq!( + fields_ids, + vec![ + FilterableAttributesSettings::Field("age".to_string()), + FilterableAttributesSettings::Field("name".to_string()), + ] + ); let rtxn = index.read_txn().unwrap(); // Only count the field_id 2 and level 0 facet values. @@ -2195,14 +2205,16 @@ mod tests { // Remove the age from the filterable fields. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("name") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "name".to_string(), + )]); }) .unwrap(); // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("name") }); + assert_eq!(fields_ids, vec![FilterableAttributesSettings::Field("name".to_string())]); let rtxn = index.read_txn().unwrap(); // Only count the field_id 2 and level 0 facet values. @@ -2532,7 +2544,10 @@ mod tests { index .update_settings(|settings| { settings.set_displayed_fields(vec!["hello".to_string()]); - settings.set_filterable_fields(hashset! { S("age"), S("toto") }); + settings.set_filterable_fields(vec![ + FilterableAttributesSettings::Field("age".to_string()), + FilterableAttributesSettings::Field("toto".to_string()), + ]); settings.set_criteria(vec![Criterion::Asc(S("toto"))]); }) .unwrap(); @@ -2649,7 +2664,9 @@ mod tests { // Set the genres setting index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("genres") }); + settings.set_filterable_fields(vec![FilterableAttributesSettings::Field( + "genres".to_string(), + )]); }) .unwrap(); diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index ced81409d..505431d45 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -1,13 +1,12 @@ use big_s::S; use bumpalo::Bump; use heed::EnvOpenOptions; -use maplit::hashset; use milli::documents::mmap_from_objects; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; -use milli::{FacetDistribution, Index, Object, OrderBy}; +use milli::{FacetDistribution, FilterableAttributesSettings, Index, Object, OrderBy}; use serde_json::{from_value, json}; #[test] @@ -21,10 +20,10 @@ fn test_facet_distribution_with_no_facet_values() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_filterable_fields(hashset! { - S("genres"), - S("tags"), - }); + builder.set_filterable_fields(vec![ + FilterableAttributesSettings::Field(S("genres")), + FilterableAttributesSettings::Field(S("tags")), + ]); builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 30690969b..d3c35d622 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -11,7 +11,10 @@ use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; -use milli::{AscDesc, Criterion, DocumentId, Index, Member, TermsMatchingStrategy}; +use milli::{ + AscDesc, Criterion, DocumentId, FilterableAttributesSettings, Index, Member, + TermsMatchingStrategy, +}; use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; @@ -42,14 +45,14 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.to_vec()); - builder.set_filterable_fields(hashset! { - S("tag"), - S("asc_desc_rank"), - S("_geo"), - S("opt1"), - S("opt1.opt2"), - S("tag_in") - }); + builder.set_filterable_fields(vec![ + FilterableAttributesSettings::Field(S("tag")), + FilterableAttributesSettings::Field(S("asc_desc_rank")), + FilterableAttributesSettings::Field(S("_geo")), + FilterableAttributesSettings::Field(S("opt1")), + FilterableAttributesSettings::Field(S("opt1.opt2")), + FilterableAttributesSettings::Field(S("tag_in")), + ]); builder.set_sortable_fields(hashset! { S("tag"), S("asc_desc_rank"),