From eaf3be1702e758297cfc583766e2229e08fd1c4e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 16 Jan 2025 10:47:40 +0100 Subject: [PATCH] WIP --- crates/meilisearch-types/src/settings.rs | 8 +- crates/milli/src/attribute_patterns.rs | 58 ++++++++++++++ crates/milli/src/fields_ids_map/metadata.rs | 31 ++++++-- crates/milli/src/filterable_fields.rs | 75 +++++++++++++++++++ crates/milli/src/index.rs | 58 +++++++------- crates/milli/src/lib.rs | 6 ++ .../milli/src/localized_attributes_rules.rs | 45 ++--------- .../src/update/index_documents/enrich.rs | 7 +- .../milli/src/update/new/extract/geo/mod.rs | 5 +- 9 files changed, 203 insertions(+), 90 deletions(-) create mode 100644 crates/milli/src/attribute_patterns.rs create mode 100644 crates/milli/src/filterable_fields.rs diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 658d7eec4..edd596fef 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -11,7 +11,9 @@ use fst::IntoStreamer; use milli::index::{IndexEmbeddingConfig, PrefixSearch}; use milli::proximity::ProximityPrecision; use milli::update::Setting; -use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; +use milli::{ + Criterion, CriterionError, FilterableAttributesSettings, Index, DEFAULT_VALUES_PER_FACET, +}; use serde::{Deserialize, Serialize, Serializer}; use utoipa::ToSchema; @@ -202,8 +204,8 @@ pub struct Settings { /// Attributes to use for faceting and filtering. See [Filtering and Faceted Search](https://www.meilisearch.com/docs/learn/filtering_and_sorting/search_with_facet_filters). #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] - #[schema(value_type = Option>, example = json!(["release_date", "genre"]))] - pub filterable_attributes: Setting>, + #[schema(value_type = Option>, example = json!(["release_date", "genre"]))] + pub filterable_attributes: Setting>, /// Attributes to use when sorting search results. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] diff --git a/crates/milli/src/attribute_patterns.rs b/crates/milli/src/attribute_patterns.rs new file mode 100644 index 000000000..836ea4465 --- /dev/null +++ b/crates/milli/src/attribute_patterns.rs @@ -0,0 +1,58 @@ +use deserr::Deserr; +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Deserr, ToSchema)] +#[repr(transparent)] +#[serde(transparent)] +pub struct AttributePatterns { + #[schema(value_type = Vec)] + pub patterns: Vec, +} + +impl From> for AttributePatterns { + fn from(patterns: Vec) -> Self { + Self { patterns } + } +} + +impl AttributePatterns { + pub fn match_str(&self, str: &str) -> bool { + self.patterns.iter().any(|pattern| match_pattern(pattern, str)) + } +} + +fn match_pattern(pattern: &str, str: &str) -> bool { + if pattern == "*" { + true + } else if pattern.starts_with('*') && pattern.ends_with('*') { + str.contains(&pattern[1..pattern.len() - 1]) + } else if let Some(pattern) = pattern.strip_prefix('*') { + str.ends_with(pattern) + } else if let Some(pattern) = pattern.strip_suffix('*') { + str.starts_with(pattern) + } else { + pattern == str + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_match_pattern() { + assert!(match_pattern("*", "test")); + assert!(match_pattern("test*", "test")); + assert!(match_pattern("test*", "testa")); + assert!(match_pattern("*test", "test")); + assert!(match_pattern("*test", "atest")); + assert!(match_pattern("*test*", "test")); + assert!(match_pattern("*test*", "atesta")); + assert!(match_pattern("*test*", "atest")); + assert!(match_pattern("*test*", "testa")); + assert!(!match_pattern("test*test", "test")); + assert!(!match_pattern("*test", "testa")); + assert!(!match_pattern("test*", "atest")); + } +} diff --git a/crates/milli/src/fields_ids_map/metadata.rs b/crates/milli/src/fields_ids_map/metadata.rs index 65a1111fa..3bc7f9ca1 100644 --- a/crates/milli/src/fields_ids_map/metadata.rs +++ b/crates/milli/src/fields_ids_map/metadata.rs @@ -5,14 +5,14 @@ use charabia::Language; use heed::RoTxn; use super::FieldsIdsMap; -use crate::{FieldId, Index, LocalizedAttributesRule, Result}; +use crate::{FieldId, FilterableAttributesSettings, Index, LocalizedAttributesRule, Result}; #[derive(Debug, Clone, Copy)] pub struct Metadata { pub searchable: bool, - pub filterable: bool, pub sortable: bool, localized_attributes_rule_id: Option, + filterable_attributes_rule_id: Option, } #[derive(Debug, Clone)] @@ -111,7 +111,7 @@ impl Metadata { #[derive(Debug, Clone)] pub struct MetadataBuilder { searchable_attributes: Vec, - filterable_attributes: HashSet, + filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, } @@ -134,7 +134,7 @@ impl MetadataBuilder { pub fn new( searchable_attributes: Vec, - filterable_attributes: HashSet, + filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, ) -> Self { @@ -152,8 +152,6 @@ impl MetadataBuilder { .iter() .any(|attribute| attribute == "*" || attribute == field); - let filterable = self.filterable_attributes.contains(field); - let sortable = self.sortable_attributes.contains(field); let localized_attributes_rule_id = self @@ -164,7 +162,24 @@ impl MetadataBuilder { // saturating_add(1): make `id` `NonZero` .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - Metadata { searchable, filterable, sortable, localized_attributes_rule_id } + let filterable_attributes_rule_id = self + .filterable_attributes + .iter() + .position(|attribute| match attribute { + FilterableAttributesSettings::Field(field_name) => field_name == field, + FilterableAttributesSettings::Pattern(patterns) => { + patterns.patterns.match_str(field) + } + }) + // saturating_add(1): make `id` `NonZero` + .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); + + Metadata { + searchable, + sortable, + localized_attributes_rule_id, + filterable_attributes_rule_id, + } } pub fn searchable_attributes(&self) -> &[String] { @@ -175,7 +190,7 @@ impl MetadataBuilder { &self.sortable_attributes } - pub fn filterable_attributes(&self) -> &HashSet { + pub fn filterable_attributes(&self) -> &[FilterableAttributesSettings] { &self.filterable_attributes } diff --git a/crates/milli/src/filterable_fields.rs b/crates/milli/src/filterable_fields.rs new file mode 100644 index 000000000..b381cd924 --- /dev/null +++ b/crates/milli/src/filterable_fields.rs @@ -0,0 +1,75 @@ +use deserr::{DeserializeError, Deserr, ValuePointerRef}; +use serde::{Deserialize, Serialize}; +use std::collections::HashSet; +use utoipa::ToSchema; + +use crate::{ + constants::RESERVED_GEO_FIELD_NAME, is_faceted_by, AttributePatterns, FieldId, FieldsIdsMap, +}; + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, ToSchema)] +#[serde(untagged)] +pub enum FilterableAttributesSettings { + Field(String), + Pattern(FilterableAttributesPatterns), +} + +impl FilterableAttributesSettings { + pub fn match_str(&self, field: &str) -> bool { + match self { + FilterableAttributesSettings::Field(field_name) => is_faceted_by(field, field_name), + FilterableAttributesSettings::Pattern(patterns) => patterns.patterns.match_str(field), + } + } + + pub fn has_geo(&self) -> bool { + /// TODO: This is a temporary solution to check if the geo field is activated. + matches!(self, FilterableAttributesSettings::Field(field_name) if field_name == RESERVED_GEO_FIELD_NAME) + } +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub struct FilterableAttributesPatterns { + pub patterns: AttributePatterns, + pub features: Option, +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub struct FilterableAttributesFeatures { + facet_search: Option, + filter: Option, +} + +impl Deserr for FilterableAttributesSettings { + fn deserialize_from_value( + value: deserr::Value, + location: ValuePointerRef, + ) -> Result { + if value.kind() == deserr::ValueKind::Map { + Ok(Self::Pattern(FilterableAttributesPatterns::deserialize_from_value( + value, location, + )?)) + } else { + Ok(Self::Field(String::deserialize_from_value(value, location)?)) + } + } +} + +pub fn matching_field_ids( + filterable_attributes: &[FilterableAttributesSettings], + fields_ids_map: &FieldsIdsMap, +) -> HashSet { + let mut result = HashSet::new(); + for (field_id, field_name) in fields_ids_map.iter() { + for filterable_attribute in filterable_attributes { + if filterable_attribute.match_str(field_name) { + result.insert(field_id); + } + } + } + result +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 9829df2ee..f7f591539 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -10,10 +10,11 @@ use roaring::RoaringBitmap; use rstar::RTree; use serde::{Deserialize, Serialize}; -use crate::constants::RESERVED_VECTORS_FIELD_NAME; +use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; +use crate::filterable_fields::matching_field_ids; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, OrderedF64Codec, @@ -25,8 +26,9 @@ use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, - FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, ObkvCodec, Result, RoaringBitmapCodec, - RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, + FieldidsWeightsMap, FilterableAttributesSettings, GeoPoint, LocalizedAttributesRule, ObkvCodec, + Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, + BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -787,7 +789,7 @@ impl Index { pub(crate) fn put_filterable_fields( &self, wtxn: &mut RwTxn<'_>, - fields: &HashSet, + fields: &Vec, ) -> heed::Result<()> { self.main.remap_types::>().put( wtxn, @@ -802,7 +804,10 @@ impl Index { } /// Returns the filterable fields names. - pub fn filterable_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + pub fn filterable_fields( + &self, + rtxn: &RoTxn<'_>, + ) -> heed::Result> { Ok(self .main .remap_types::>() @@ -815,14 +820,9 @@ impl Index { let fields = self.filterable_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } + let matching_field_ids = matching_field_ids(&fields, &fields_ids_map); - Ok(fields_ids) + Ok(matching_field_ids) } /* sortable fields */ @@ -876,6 +876,13 @@ impl Index { ) } + /// Returns true if the geo feature is activated. + pub fn is_geo_activated(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_filter = self.filterable_fields(rtxn)?.iter().any(|field| field.has_geo()); + let geo_sortable = self.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); + Ok(geo_filter || geo_sortable) + } + /// Returns the faceted fields names. pub fn faceted_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result> { Ok(self @@ -906,7 +913,8 @@ impl Index { /// /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn<'_>) -> Result> { - let filterable_fields = self.filterable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + let filterable_fields = self.filterable_fields_ids(rtxn)?; let sortable_fields = self.sortable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; let asc_desc_fields = @@ -915,7 +923,14 @@ impl Index { _otherwise => None, }); - let mut faceted_fields = filterable_fields; + let mut faceted_fields: HashSet<_> = filterable_fields + .into_iter() + .filter_map(|field_id| { + let field_name = fields_ids_map.name(field_id); + debug_assert!(field_name.is_some(), "field name not found for {field_id}"); + field_name.map(|field| field.to_string()) + }) + .collect(); faceted_fields.extend(sortable_fields); faceted_fields.extend(asc_desc_fields); if let Some(field) = distinct_field { @@ -925,21 +940,6 @@ impl Index { Ok(faceted_fields) } - /// Identical to `user_defined_faceted_fields`, but returns ids instead. - pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields = self.user_defined_faceted_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } - - Ok(fields_ids) - } - /* faceted documents ids */ /// Retrieve all the documents which contain this field id set as null diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index ea88d2b78..dc9bc4911 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -9,11 +9,13 @@ pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; pub mod documents; mod asc_desc; +mod attribute_patterns; mod criterion; mod error; mod external_documents_ids; pub mod facet; mod fields_ids_map; +mod filterable_fields; pub mod heed_codec; pub mod index; mod localized_attributes_rules; @@ -51,6 +53,7 @@ pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbor pub use {charabia as tokenizer, heed, rhai}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; +pub use self::attribute_patterns::AttributePatterns; pub use self::criterion::{default_criteria, Criterion, CriterionError}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, @@ -58,6 +61,9 @@ pub use self::error::{ pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fieldids_weights_map::FieldidsWeightsMap; pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap}; +pub use self::filterable_fields::{ + FilterableAttributesFeatures, FilterableAttributesPatterns, FilterableAttributesSettings, +}; pub use self::heed_codec::{ BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, diff --git a/crates/milli/src/localized_attributes_rules.rs b/crates/milli/src/localized_attributes_rules.rs index 2b9bf099c..16dd3fd07 100644 --- a/crates/milli/src/localized_attributes_rules.rs +++ b/crates/milli/src/localized_attributes_rules.rs @@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize}; use utoipa::ToSchema; use crate::fields_ids_map::FieldsIdsMap; -use crate::FieldId; +use crate::{AttributePatterns, FieldId}; /// A rule that defines which locales are supported for a given attribute. /// @@ -17,18 +17,18 @@ use crate::FieldId; /// The pattern `*attribute_name*` matches any attribute name that contains `attribute_name`. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] pub struct LocalizedAttributesRule { - pub attribute_patterns: Vec, + pub attribute_patterns: AttributePatterns, #[schema(value_type = Vec)] pub locales: Vec, } impl LocalizedAttributesRule { pub fn new(attribute_patterns: Vec, locales: Vec) -> Self { - Self { attribute_patterns, locales } + Self { attribute_patterns: AttributePatterns::from(attribute_patterns), locales } } pub fn match_str(&self, str: &str) -> bool { - self.attribute_patterns.iter().any(|pattern| match_pattern(pattern.as_str(), str)) + self.attribute_patterns.match_str(str) } pub fn locales(&self) -> &[Language] { @@ -36,20 +36,6 @@ impl LocalizedAttributesRule { } } -fn match_pattern(pattern: &str, str: &str) -> bool { - if pattern == "*" { - true - } else if pattern.starts_with('*') && pattern.ends_with('*') { - str.contains(&pattern[1..pattern.len() - 1]) - } else if let Some(pattern) = pattern.strip_prefix('*') { - str.ends_with(pattern) - } else if let Some(pattern) = pattern.strip_suffix('*') { - str.starts_with(pattern) - } else { - pattern == str - } -} - #[derive(Debug, Clone, PartialEq, Eq)] pub struct LocalizedFieldIds { field_id_to_locales: HashMap>, @@ -65,7 +51,7 @@ impl LocalizedFieldIds { if let Some(rules) = rules { let fields = fields_ids.filter_map(|field_id| { - fields_ids_map.name(field_id).map(|field_name| (field_id, field_name)) + fields_ids_map.name(field_id).map(|field_name: &str| (field_id, field_name)) }); for (field_id, field_name) in fields { @@ -108,24 +94,3 @@ impl LocalizedFieldIds { locales } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_match_pattern() { - assert!(match_pattern("*", "test")); - assert!(match_pattern("test*", "test")); - assert!(match_pattern("test*", "testa")); - assert!(match_pattern("*test", "test")); - assert!(match_pattern("*test", "atest")); - assert!(match_pattern("*test*", "test")); - assert!(match_pattern("*test*", "atesta")); - assert!(match_pattern("*test*", "atest")); - assert!(match_pattern("*test*", "testa")); - assert!(!match_pattern("test*test", "test")); - assert!(!match_pattern("*test", "testa")); - assert!(!match_pattern("test*", "atest")); - } -} diff --git a/crates/milli/src/update/index_documents/enrich.rs b/crates/milli/src/update/index_documents/enrich.rs index c35701961..b59b27ac3 100644 --- a/crates/milli/src/update/index_documents/enrich.rs +++ b/crates/milli/src/update/index_documents/enrich.rs @@ -95,12 +95,7 @@ pub fn enrich_documents_batch( // If the settings specifies that a _geo field must be used therefore we must check the // validity of it in all the documents of this batch and this is when we return `Some`. let geo_field_id = match documents_batch_index.id(RESERVED_GEO_FIELD_NAME) { - Some(geo_field_id) - if index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME) - || index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME) => - { - Some(geo_field_id) - } + Some(geo_field_id) if index.is_geo_activated(rtxn)? => Some(geo_field_id), _otherwise => None, }; diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 42da7766e..0d64e4b23 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -9,7 +9,6 @@ use heed::RoTxn; use serde_json::value::RawValue; use serde_json::Value; -use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::GeoError; use crate::update::new::document::Document; use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; @@ -29,9 +28,7 @@ impl GeoExtractor { index: &Index, grenad_parameters: GrenadParameters, ) -> Result> { - let is_sortable = index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); - let is_filterable = index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); - if is_sortable || is_filterable { + if index.is_geo_activated(rtxn)? { Ok(Some(GeoExtractor { grenad_parameters })) } else { Ok(None)