diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 4bd5315ff..9938fca26 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -12,7 +12,7 @@ use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; -use milli::Index; +use milli::{FilterableAttributesRule, Index}; use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; use roaring::RoaringBitmap; @@ -57,7 +57,8 @@ fn setup_settings<'t>( let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); builder.set_searchable_fields(searchable_fields); - let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + let filterable_fields = + filterable_fields.iter().map(|s| FilterableAttributesRule::Field(s.to_string())).collect(); builder.set_filterable_fields(filterable_fields); let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); diff --git a/crates/benchmarks/benches/search_geo.rs b/crates/benchmarks/benches/search_geo.rs index 72503ce57..d76929f99 100644 --- a/crates/benchmarks/benches/search_geo.rs +++ b/crates/benchmarks/benches/search_geo.rs @@ -2,7 +2,7 @@ mod datasets_paths; mod utils; use criterion::{criterion_group, criterion_main}; -use milli::update::Settings; +use milli::{update::Settings, FilterableAttributesRule}; use utils::Conf; #[cfg(not(windows))] @@ -21,8 +21,10 @@ fn base_conf(builder: &mut Settings) { ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect(); builder.set_searchable_fields(searchable_fields); - let filterable_fields = - ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + let filterable_fields = ["_geo", "population", "elevation"] + .iter() + .map(|s| FilterableAttributesRule::Field(s.to_string())) + .collect(); builder.set_filterable_fields(filterable_fields); let sortable_fields = diff --git a/crates/benchmarks/benches/search_songs.rs b/crates/benchmarks/benches/search_songs.rs index bef014a0e..680a675ef 100644 --- a/crates/benchmarks/benches/search_songs.rs +++ b/crates/benchmarks/benches/search_songs.rs @@ -2,7 +2,7 @@ mod datasets_paths; mod utils; use criterion::{criterion_group, criterion_main}; -use milli::update::Settings; +use milli::{update::Settings, FilterableAttributesRule}; use utils::Conf; #[cfg(not(windows))] @@ -22,7 +22,7 @@ fn base_conf(builder: &mut Settings) { let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"] .iter() - .map(|s| s.to_string()) + .map(|s| FilterableAttributesRule::Field(s.to_string())) .collect(); builder.set_filterable_fields(faceted_fields); } diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index e7fd22333..4e2d6ac2f 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -233,8 +233,8 @@ pub(crate) mod test { use meilisearch_types::features::{Network, Remote, RuntimeTogglableFeatures}; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::{Action, Key}; - use meilisearch_types::milli; use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::{self, FilterableAttributesRule}; use meilisearch_types::settings::{Checked, FacetingSettings, Settings}; use meilisearch_types::task_view::DetailsView; use meilisearch_types::tasks::{Details, Kind, Status}; @@ -279,7 +279,10 @@ pub(crate) mod test { let settings = Settings { displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(), searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(), - filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }), + filterable_attributes: Setting::Set(vec![ + FilterableAttributesRule::Field(S("race")), + FilterableAttributesRule::Field(S("age")), + ]), sortable_attributes: Setting::Set(btreeset! { S("age") }), ranking_rules: Setting::NotSet, stop_words: Setting::NotSet, diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index 2dd4ed761..6b63e7c6b 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -322,7 +322,16 @@ impl From> for v6::Settings { v6::Settings { displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(), searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(), - filterable_attributes: settings.filterable_attributes.into(), + filterable_attributes: match settings.filterable_attributes { + v5::settings::Setting::Set(filterable_attributes) => v6::Setting::Set( + filterable_attributes + .into_iter() + .map(v6::FilterableAttributesRule::Field) + .collect(), + ), + v5::settings::Setting::Reset => v6::Setting::Reset, + v5::settings::Setting::NotSet => v6::Setting::NotSet, + }, sortable_attributes: settings.sortable_attributes.into(), ranking_rules: { match settings.ranking_rules { diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index d9ceec114..0b4ba5bdd 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -46,6 +46,8 @@ pub type ResponseError = meilisearch_types::error::ResponseError; pub type Code = meilisearch_types::error::Code; pub type RankingRuleView = meilisearch_types::settings::RankingRuleView; +pub type FilterableAttributesRule = meilisearch_types::milli::FilterableAttributesRule; + pub struct V6Reader { dump: TempDir, instance_uid: Option, diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 04b6dc266..0fc007bf1 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -30,6 +30,25 @@ pub enum Condition<'a> { StartsWith { keyword: Token<'a>, word: Token<'a> }, } +impl Condition<'_> { + pub fn operator(&self) -> &str { + match self { + Condition::GreaterThan(_) => ">", + Condition::GreaterThanOrEqual(_) => ">=", + Condition::Equal(_) => "=", + Condition::NotEqual(_) => "!=", + Condition::Null => "IS NULL", + Condition::Empty => "IS EMPTY", + Condition::Exists => "EXISTS", + Condition::LowerThan(_) => "<", + Condition::LowerThanOrEqual(_) => "<=", + Condition::Between { .. } => "TO", + Condition::Contains { .. } => "CONTAINS", + Condition::StartsWith { .. } => "STARTS WITH", + } + } +} + /// condition = value ("==" | ">" ...) value pub fn parse_condition(input: Span) -> IResult { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap index 1b9018726..ebacb5415 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap index 5bbc89c44..0fc0d7fb5 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap index 7149d5f97..d06a4e78a 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap @@ -1,17 +1,16 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_document_ids: 1, deleted_documents: Some(1) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} 3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} -4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Attribute `id` is not filterable. Available filterable attributes are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }} +4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Attribute `id` is not filterable. Available filterable attribute patterns are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }} 5 {uid: 5, batch_uid: 2, status: succeeded, details: { original_filter: "catto EXISTS", deleted_documents: Some(1) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("catto EXISTS") }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap index b13a63738..8b010498f 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_document_ids: 1, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} 3 {uid: 3, status: enqueued, details: { original_filter: true, deleted_documents: None }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap index 9e10d3052..0ba3ef598 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_failure.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/test_failure.rs b/crates/index-scheduler/src/scheduler/test_failure.rs index 5cdcb248b..191910d38 100644 --- a/crates/index-scheduler/src/scheduler/test_failure.rs +++ b/crates/index-scheduler/src/scheduler/test_failure.rs @@ -1,11 +1,11 @@ use std::time::Instant; use big_s::S; -use maplit::btreeset; use meili_snap::snapshot; use meilisearch_types::milli::obkv_to_json; use meilisearch_types::milli::update::IndexDocumentsMethod::*; use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::FilterableAttributesRule; use meilisearch_types::tasks::{Kind, KindWithContent}; use crate::insta_snapshot::snapshot_index_scheduler; @@ -127,7 +127,8 @@ fn fail_in_process_batch_for_document_deletion() { use meilisearch_types::settings::{Settings, Unchecked}; let mut new_settings: Box> = Box::default(); - new_settings.filterable_attributes = Setting::Set(btreeset!(S("catto"))); + new_settings.filterable_attributes = + Setting::Set(vec![FilterableAttributesRule::Field(S("catto"))]); index_scheduler .register( diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 5a0451b6c..c7f375eff 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -414,6 +414,7 @@ impl ErrorCode for milli::Error { UserError::AttributeLimitReached => Code::MaxFieldsLimitExceeded, UserError::InvalidFilter(_) => Code::InvalidSearchFilter, UserError::InvalidFilterExpression(..) => Code::InvalidSearchFilter, + UserError::FilterOperatorNotAllowed { .. } => Code::InvalidSearchFilter, UserError::MissingDocumentId { .. } => Code::MissingDocumentId, UserError::InvalidDocumentId { .. } | UserError::TooManyDocumentIds { .. } => { Code::InvalidDocumentId diff --git a/crates/meilisearch-types/src/locales.rs b/crates/meilisearch-types/src/locales.rs index 945c38cc3..b3fb90493 100644 --- a/crates/meilisearch-types/src/locales.rs +++ b/crates/meilisearch-types/src/locales.rs @@ -1,5 +1,5 @@ use deserr::Deserr; -use milli::LocalizedAttributesRule; +use milli::{AttributePatterns, LocalizedAttributesRule}; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; @@ -7,7 +7,7 @@ use utoipa::ToSchema; #[deserr(rename_all = camelCase)] #[serde(rename_all = "camelCase")] pub struct LocalizedAttributesRuleView { - pub attribute_patterns: Vec, + pub attribute_patterns: AttributePatterns, pub locales: Vec, } diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index e501d7359..7b5807d06 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -11,7 +11,7 @@ use fst::IntoStreamer; use milli::index::{IndexEmbeddingConfig, PrefixSearch}; use milli::proximity::ProximityPrecision; use milli::update::Setting; -use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; +use milli::{Criterion, CriterionError, FilterableAttributesRule, Index, DEFAULT_VALUES_PER_FACET}; use serde::{Deserialize, Serialize, Serializer}; use utoipa::ToSchema; @@ -202,8 +202,8 @@ pub struct Settings { /// Attributes to use for faceting and filtering. See [Filtering and Faceted Search](https://www.meilisearch.com/docs/learn/filtering_and_sorting/search_with_facet_filters). #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] - #[schema(value_type = Option>, example = json!(["release_date", "genre"]))] - pub filterable_attributes: Setting>, + #[schema(value_type = Option>, example = json!(["release_date", "genre"]))] + pub filterable_attributes: Setting>, /// Attributes to use when sorting search results. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] @@ -791,7 +791,7 @@ pub fn settings( .user_defined_searchable_fields(rtxn)? .map(|fields| fields.into_iter().map(String::from).collect()); - let filterable_attributes = index.filterable_fields(rtxn)?.into_iter().collect(); + let filterable_attributes = index.filterable_attributes_rules(rtxn)?.into_iter().collect(); let sortable_attributes = index.sortable_fields(rtxn)?.into_iter().collect(); diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index bfd0e1090..92b018c8c 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -291,7 +291,7 @@ make_setting_routes!( { route: "/filterable-attributes", update_verb: put, - value_type: std::collections::BTreeSet, + value_type: Vec, err_type: meilisearch_types::deserr::DeserrJsonError< meilisearch_types::error::deserr_codes::InvalidSettingsFilterableAttributes, >, diff --git a/crates/meilisearch/src/routes/indexes/settings_analytics.rs b/crates/meilisearch/src/routes/indexes/settings_analytics.rs index 4944349a4..cb5983f02 100644 --- a/crates/meilisearch/src/routes/indexes/settings_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/settings_analytics.rs @@ -8,6 +8,7 @@ use std::collections::{BTreeMap, BTreeSet, HashSet}; use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::FilterableAttributesRule; use meilisearch_types::settings::{ FacetingSettings, PaginationSettings, PrefixSearchSettings, ProximityPrecisionView, RankingRuleView, SettingEmbeddingSettings, TypoSettings, @@ -89,6 +90,10 @@ impl Aggregate for SettingsAnalytics { filterable_attributes: FilterableAttributesAnalytics { total: new.filterable_attributes.total.or(self.filterable_attributes.total), has_geo: new.filterable_attributes.has_geo.or(self.filterable_attributes.has_geo), + has_patterns: new + .filterable_attributes + .has_patterns + .or(self.filterable_attributes.has_patterns), }, distinct_attribute: DistinctAttributeAnalytics { set: self.distinct_attribute.set | new.distinct_attribute.set, @@ -328,13 +333,19 @@ impl SortableAttributesAnalytics { pub struct FilterableAttributesAnalytics { pub total: Option, pub has_geo: Option, + pub has_patterns: Option, } impl FilterableAttributesAnalytics { - pub fn new(setting: Option<&BTreeSet>) -> Self { + pub fn new(setting: Option<&Vec>) -> Self { Self { total: setting.as_ref().map(|filter| filter.len()), - has_geo: setting.as_ref().map(|filter| filter.contains("_geo")), + has_geo: setting + .as_ref() + .map(|filter| filter.iter().any(FilterableAttributesRule::has_geo)), + has_patterns: setting.as_ref().map(|filter| { + filter.iter().any(|rule| matches!(rule, FilterableAttributesRule::Pattern(_))) + }), } } diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 02cb4130a..cc9aeb7d2 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -9,6 +9,10 @@ use meilisearch_types::batches::BatchStats; use meilisearch_types::error::{Code, ErrorType, ResponseError}; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::keys::CreateApiKey; +use meilisearch_types::milli::{ + AttributePatterns, FilterFeatures, FilterableAttributesFeatures, FilterableAttributesPatterns, + FilterableAttributesRule, +}; use meilisearch_types::settings::{ Checked, FacetingSettings, MinWordSizeTyposSetting, PaginationSettings, Settings, TypoSettings, Unchecked, @@ -88,7 +92,7 @@ pub mod tasks; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures)) )] pub struct MeilisearchApi; diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 565dbccf1..58a181d3b 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -20,7 +20,7 @@ use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::{ - FacetValueHit, InternalError, OrderBy, SearchForFacetValues, TimeBudget, + FacetValueHit, InternalError, OrderBy, PatternMatch, SearchForFacetValues, TimeBudget, }; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::{milli, Document}; @@ -1538,8 +1538,9 @@ pub fn perform_facet_search( // If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale. // If the user does not provide locales, we use the locales of the facet string. let localized_attributes = index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - let localized_attributes_locales = - localized_attributes.into_iter().find(|attr| attr.match_str(&facet_name)); + let localized_attributes_locales = localized_attributes + .into_iter() + .find(|attr| attr.match_str(&facet_name) == PatternMatch::Match); let locales = localized_attributes_locales.map(|attr| { attr.locales .into_iter() @@ -1885,7 +1886,7 @@ fn format_fields( let locales = locales.or_else(|| { localized_attributes .iter() - .find(|rule| rule.match_str(key)) + .find(|rule| rule.match_str(key) == PatternMatch::Match) .map(LocalizedAttributesRule::locales) }); diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index f78542db1..d1e81e0a7 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -125,6 +125,12 @@ impl Server { self.service.post("/indexes", body).await } + pub async fn delete_index(&self, uid: impl AsRef) -> (Value, StatusCode) { + let url = format!("/indexes/{}", urlencoding::encode(uid.as_ref())); + let (value, code) = self.service.delete(url).await; + (value, code) + } + pub fn index_with_encoder(&self, uid: impl AsRef, encoder: Encoder) -> Index<'_> { Index { uid: uid.as_ref().to_string(), diff --git a/crates/meilisearch/tests/documents/errors.rs b/crates/meilisearch/tests/documents/errors.rs index 7b2ca8b5e..c02c1f000 100644 --- a/crates/meilisearch/tests/documents/errors.rs +++ b/crates/meilisearch/tests/documents/errors.rs @@ -636,7 +636,7 @@ async fn delete_document_by_filter() { "originalFilter": "\"catto = jorts\"" }, "error": { - "message": "Index `SHARED_DOCUMENTS`: Attribute `catto` is not filterable. Available filterable attributes are: `id`, `title`.\n1:6 catto = jorts", + "message": "Index `SHARED_DOCUMENTS`: Attribute `catto` is not filterable. Available filterable attribute patterns are: `id`, `title`.\n1:6 catto = jorts", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" @@ -738,7 +738,7 @@ async fn fetch_document_by_filter() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Attribute `doggo` is not filterable. Available filterable attributes are: `color`.\n1:6 doggo = bernese", + "message": "Attribute `doggo` is not filterable. Available filterable attribute patterns are: `color`.\n1:6 doggo = bernese", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" diff --git a/crates/meilisearch/tests/search/errors.rs b/crates/meilisearch/tests/search/errors.rs index 9dea42b12..c4cba7504 100644 --- a/crates/meilisearch/tests/search/errors.rs +++ b/crates/meilisearch/tests/search/errors.rs @@ -1,8 +1,10 @@ use meili_snap::*; -use crate::common::{shared_does_not_exists_index, Server}; +use crate::common::{shared_does_not_exists_index, Server, DOCUMENTS, NESTED_DOCUMENTS}; use crate::json; +use super::test_settings_documents_indexing_swapping_and_search; + #[actix_rt::test] async fn search_unexisting_index() { let index = shared_does_not_exists_index().await; @@ -430,7 +432,7 @@ async fn search_non_filterable_facets() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute is `title`.", + "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute pattern is `title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -441,7 +443,7 @@ async fn search_non_filterable_facets() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute is `title`.", + "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute pattern is `title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -461,7 +463,7 @@ async fn search_non_filterable_facets_multiple_filterable() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attributes are `genres, title`.", + "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute patterns are `genres, title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -472,7 +474,7 @@ async fn search_non_filterable_facets_multiple_filterable() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attributes are `genres, title`.", + "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute patterns are `genres, title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -522,7 +524,7 @@ async fn search_non_filterable_facets_multiple_facets() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attributes `doggo, neko` are not filterable. The available filterable attributes are `genres, title`.", + "message": "Invalid facet distribution, attributes `doggo, neko` are not filterable. The available filterable attribute patterns are `genres, title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -533,7 +535,7 @@ async fn search_non_filterable_facets_multiple_facets() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attributes `doggo, neko` are not filterable. The available filterable attributes are `genres, title`.", + "message": "Invalid facet distribution, attributes `doggo, neko` are not filterable. The available filterable attribute patterns are `genres, title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -636,14 +638,11 @@ async fn search_bad_matching_strategy() { #[actix_rt::test] async fn filter_invalid_syntax_object() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - index - .search(json!({"filter": "title & Glass"}), |response, code| { + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "title & Glass"}), + |response, code| { snapshot!(response, @r###" { "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", @@ -653,20 +652,18 @@ async fn filter_invalid_syntax_object() { } "###); snapshot!(code, @"400 Bad Request"); - }) - .await; + }, + ) + .await; } #[actix_rt::test] async fn filter_invalid_syntax_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - index - .search(json!({"filter": ["title & Glass"]}), |response, code| { + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["title & Glass"]}), + |response, code| { snapshot!(response, @r###" { "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", @@ -676,206 +673,327 @@ async fn filter_invalid_syntax_array() { } "###); snapshot!(code, @"400 Bad Request"); - }) - .await; + }, + ) + .await; } #[actix_rt::test] async fn filter_invalid_syntax_string() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - let expected_response = json!({ - "message": "Found unexpected characters at the end of the filter: `XOR title = Glass`. You probably forgot an `OR` or an `AND` rule.\n15:32 title = Glass XOR title = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "title = Glass XOR title = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "title = Glass XOR title = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "Found unexpected characters at the end of the filter: `XOR title = Glass`. You probably forgot an `OR` or an `AND` rule.\n15:32 title = Glass XOR title = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_invalid_attribute_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - let expected_response = json!({ - "message": format!("Index `{}`: Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", index.uid), - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": ["many = Glass"]}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["many = Glass"]}), + |response, code| { + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `many` is not filterable. Available filterable attribute patterns are: `title`.\n1:5 many = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_invalid_attribute_string() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - let expected_response = json!({ - "message": format!("Index `{}`: Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", index.uid), - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "many = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "many = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `many` is not filterable. Available filterable attribute patterns are: `title`.\n1:5 many = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_geo_attribute_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - let expected_response = json!({ - "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": ["_geo = Glass"]}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["_geo = Glass"]}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_geo_attribute_string() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - let expected_response = json!({ - "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "_geo = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "_geo = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_attribute_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - let expected_response = json!({ - "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": ["_geoDistance = Glass"]}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["_geoDistance = Glass"]}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_attribute_string() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - let expected_response = json!({ - "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "_geoDistance = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "_geoDistance = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_geo_point_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); - - let expected_response = json!({ - "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": ["_geoPoint = Glass"]}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["_geoPoint = Glass"]}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_geo_point_string() { - let server = Server::new_shared(); - let index = server.unique_index(); + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "_geoPoint = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; +} - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await.succeeded(); +#[actix_rt::test] +async fn search_with_pattern_filter_settings_errors() { + // Check if the Equality filter works with patterns + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": false, "comparison": true} + } + }]}), + &json!({ + "filter": "cattos = pésti" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `=` is not allowed for the attribute `cattos`.\n - Note: allowed operators: OR, AND, NOT, <, >, <=, >=, TO, IS EMPTY, IS NULL, EXISTS.\n - Note: field `cattos` matched rule #0 in `filterableAttributes`", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; - let expected_response = json!({ - "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "_geoPoint = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": false, "comparison": true} + } + }]}), + &json!({ + "filter": "cattos IN [pésti, simba]" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `=` is not allowed for the attribute `cattos`.\n - Note: allowed operators: OR, AND, NOT, <, >, <=, >=, TO, IS EMPTY, IS NULL, EXISTS.\n - Note: field `cattos` matched rule #0 in `filterableAttributes`", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, +) +.await; + + // Check if the Comparison filter works with patterns + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["cattos","doggos.age"]}]}), + &json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `>` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]}), + &json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `>` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]}), + &json!({ + "filter": "doggos.age 2 TO 4" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `TO` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; } #[actix_rt::test] @@ -1018,109 +1136,115 @@ async fn sort_unset_ranking_rule() { #[actix_rt::test] async fn search_on_unknown_field() { - let server = Server::new_shared(); - let index = server.unique_index(); - let (response, _code) = - index.update_settings_searchable_attributes(json!(["id", "title"])).await; - index.wait_task(response.uid()).await.succeeded(); - - let expected_response = json!({ - "message": format!("Index `{}`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", index.uid), - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - }); - index - .search( - json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}), - |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }, - ) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"searchableAttributes": ["id", "title"]}), + &json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + } + "###); + }, + ) + .await; } #[actix_rt::test] async fn search_on_unknown_field_plus_joker() { - let server = Server::new_shared(); - let index = server.unique_index(); - let (response, _code) = - index.update_settings_searchable_attributes(json!(["id", "title"])).await; - index.wait_task(response.uid()).await.succeeded(); + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"searchableAttributes": ["id", "title"]}), + &json!({"q": "Captain Marvel", "attributesToSearchOn": ["*", "unknown"]}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + } + "###); + }, + ) + .await; - let expected_response = json!({ - "message": format!("Index `{}`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", index.uid), - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - }); - index - .search( - json!({"q": "Captain Marvel", "attributesToSearchOn": ["*", "unknown"]}), - |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }, - ) - .await; - - index - .search( - json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown", "*"]}), - |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }, - ) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"searchableAttributes": ["id", "title"]}), + &json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown", "*"]}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + } + "###); + }, + ) + .await; } #[actix_rt::test] async fn distinct_at_search_time() { - let server = Server::new_shared(); - let index = server.unique_index(); + let server = Server::new().await; + let index = server.index("test"); let (task, _) = index.create(None).await; index.wait_task(task.uid()).await.succeeded(); + let (response, _code) = + index.add_documents(json!([{"id": 1, "color": "Doggo", "machin": "Action"}]), None).await; + index.wait_task(response.uid()).await.succeeded(); - let expected_response = json!({ - "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", index.uid), - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - assert_eq!(response, expected_response); - assert_eq!(code, 400); + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await; index.wait_task(task.uid()).await.succeeded(); - let expected_response = json!({ - "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", index.uid), - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - assert_eq!(response, expected_response); - assert_eq!(code, 400); + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes patterns are: `color, machin`.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await; index.wait_task(task.uid()).await.succeeded(); - let expected_response = json!({ - "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.", index.uid), - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - assert_eq!(response, expected_response); - assert_eq!(code, 400); + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes patterns are: `color, <..hidden-attributes>`.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await; diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 7e46c5d15..45b7a381a 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -1,7 +1,9 @@ use meili_snap::snapshot; +use meilisearch::Opt; use once_cell::sync::Lazy; +use tempfile::TempDir; -use crate::common::{Server, Value}; +use crate::common::{default_settings, Server, Value, NESTED_DOCUMENTS}; use crate::json; static DOCUMENTS: Lazy = Lazy::new(|| { @@ -34,6 +36,62 @@ static DOCUMENTS: Lazy = Lazy::new(|| { ]) }); +async fn test_settings_documents_indexing_swapping_and_facet_search( + documents: &Value, + settings: &Value, + query: &Value, + test: impl Fn(Value, actix_http::StatusCode) + std::panic::UnwindSafe + Clone, +) { + let temp = TempDir::new().unwrap(); + let server = Server::new_with_options(Opt { ..default_settings(temp.path()) }).await.unwrap(); + + eprintln!("Documents -> Settings -> test"); + let index = server.index("test"); + + let (task, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (task, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (response, code) = index.facet_search(query.clone()).await; + insta::allow_duplicates! { + test(response, code); + } + + let (task, code) = server.delete_index("test").await; + assert_eq!(code, 202, "{}", task); + let response = server.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + eprintln!("Settings -> Documents -> test"); + let index = server.index("test"); + + let (task, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (task, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (response, code) = index.facet_search(query.clone()).await; + insta::allow_duplicates! { + test(response, code); + } + + let (task, code) = server.delete_index("test").await; + assert_eq!(code, 202, "{}", task); + let response = server.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); +} + #[actix_rt::test] async fn simple_facet_search() { let server = Server::new().await; @@ -436,3 +494,124 @@ async fn deactivate_facet_search_add_documents_and_reset_facet_search() { assert_eq!(code, 200, "{}", response); assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 2); } + +#[actix_rt::test] +async fn facet_search_with_filterable_attributes_rules() { + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["genres"]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["facetHits"], @r###"[{"value":"Action","count":3},{"value":"Adventure","count":2}]"###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["genres"], "features": {"facetSearch": true, "filter": {"equality": false, "comparison": false}}}]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["facetHits"], @r###"[{"value":"Action","count":3},{"value":"Adventure","count":2}]"###); + }, + ).await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": ["doggos.name"]}), + &json!({"facetName": "doggos.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["facetHits"], @r###"[{"value":"bobby","count":1},{"value":"buddy","count":1}]"###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["doggos.name"], "features": {"facetSearch": true, "filter": {"equality": false, "comparison": false}}}]}), + &json!({"facetName": "doggos.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["facetHits"], @r###"[{"value":"bobby","count":1},{"value":"buddy","count":1}]"###); + }, + ).await; +} + +#[actix_rt::test] +async fn facet_search_with_filterable_attributes_rules_errors() { + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["genres"]}), + &json!({"facetName": "invalid", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `invalid` is not facet-searchable. Available facet-searchable attributes patterns are: `genres`. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["genres"]}]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `genres` is not facet-searchable. This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["genres"], "features": {"facetSearch": false, "filter": {"equality": true, "comparison": true}}}]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `genres` is not facet-searchable. This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ).await; + + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["genres"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `genres` is not facet-searchable. This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ).await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["doggos.name"]}]}), + &json!({"facetName": "invalid.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `invalid.name` is not facet-searchable. This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["doggos.name"], "features": {"facetSearch": false, "filter": {"equality": true, "comparison": true}}}]}), + &json!({"facetName": "doggos.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `doggos.name` is not facet-searchable. This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ).await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["doggos.name"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}]}), + &json!({"facetName": "doggos.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `doggos.name` is not facet-searchable. This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ).await; +} diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs new file mode 100644 index 000000000..619160a3b --- /dev/null +++ b/crates/meilisearch/tests/search/filters.rs @@ -0,0 +1,758 @@ +use meili_snap::{json_string, snapshot}; +use meilisearch::Opt; +use tempfile::TempDir; + +use super::test_settings_documents_indexing_swapping_and_search; +use crate::{ + common::{default_settings, shared_index_with_documents, Server, DOCUMENTS, NESTED_DOCUMENTS}, + json, +}; + +#[actix_rt::test] +async fn search_with_filter_string_notation() { + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + let (task, code) = index.add_documents(documents, None).await; + meili_snap::snapshot!(code, @"202 Accepted"); + let res = index.wait_task(task.uid()).await; + meili_snap::snapshot!(res["status"], @r###""succeeded""###); + + index + .search( + json!({ + "filter": "title = Gläss" + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + }, + ) + .await; + + let index = server.index("nested"); + + let (_, code) = + index.update_settings(json!({"filterableAttributes": ["cattos", "doggos.age"]})).await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = NESTED_DOCUMENTS.clone(); + let (task, code) = index.add_documents(documents, None).await; + meili_snap::snapshot!(code, @"202 Accepted"); + let res = index.wait_task(task.uid()).await; + meili_snap::snapshot!(res["status"], @r###""succeeded""###); + + index + .search( + json!({ + "filter": "cattos = pésti" + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + assert_eq!(response["hits"][0]["id"], json!(852)); + }, + ) + .await; + + index + .search( + json!({ + "filter": "doggos.age > 5" + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 2); + assert_eq!(response["hits"][0]["id"], json!(654)); + assert_eq!(response["hits"][1]["id"], json!(951)); + }, + ) + .await; +} + +#[actix_rt::test] +async fn search_with_filter_array_notation() { + let index = shared_index_with_documents().await; + let (response, code) = index + .search_post(json!({ + "filter": ["title = Gläss"] + })) + .await; + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + + let (response, code) = index + .search_post(json!({ + "filter": [["title = Gläss", "title = \"Shazam!\"", "title = \"Escape Room\""]] + })) + .await; + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 3); +} + +#[actix_rt::test] +async fn search_with_contains_filter() { + let temp = TempDir::new().unwrap(); + let server = Server::new_with_options(Opt { + experimental_contains_filter: true, + ..default_settings(temp.path()) + }) + .await + .unwrap(); + let index = server.index("movies"); + + index.update_settings(json!({"filterableAttributes": ["title"]})).await; + + let documents = DOCUMENTS.clone(); + let (request, _code) = index.add_documents(documents, None).await; + index.wait_task(request.uid()).await.succeeded(); + + let (response, code) = index + .search_post(json!({ + "filter": "title CONTAINS cap" + })) + .await; + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 2); +} + +#[actix_rt::test] +async fn search_with_pattern_filter_settings() { + // Check if the Equality filter works with patterns + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["cattos","doggos.age"]}]}), + &json!({ + "filter": "cattos = pésti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + } + ] + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]}), + &json!({ + "filter": "cattos = pésti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + } + ] + "###); + }, + ) + .await; + + // Check if the Comparison filter works with patterns + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": false, "comparison": true} + } + }]}), + &json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + }, + { + "id": 654, + "father": "pierre", + "mother": "sabine", + "doggos": [ + { + "name": "gros bill", + "age": 8 + } + ], + "cattos": [ + "simba", + "pestiféré" + ] + }, + { + "id": 951, + "father": "jean-baptiste", + "mother": "sophie", + "doggos": [ + { + "name": "turbo", + "age": 5 + }, + { + "name": "fast", + "age": 6 + } + ], + "cattos": [ + "moumoute", + "gomez" + ] + } + ] + "###); + }, + ) + .await; +} + +#[actix_rt::test] +async fn search_with_pattern_filter_settings_scenario_1() { + let temp = TempDir::new().unwrap(); + let server = Server::new_with_options(Opt { ..default_settings(temp.path()) }).await.unwrap(); + + eprintln!("Documents -> Settings -> test"); + let index = server.index("test"); + + let (task, code) = index.add_documents(NESTED_DOCUMENTS.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + let (task, code) = index + .update_settings(json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]})) + .await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // Check if the Equality filter works + index + .search( + json!({ + "filter": "cattos = pésti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + } + ] + "###); + }, + ) + .await; + + // Check if the Comparison filter returns an error + index + .search( + json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `>` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + // Update the settings activate comparison filter + let (task, code) = index + .update_settings(json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": true} + } + }]})) + .await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // Check if the Equality filter works + index + .search( + json!({ + "filter": "cattos = pésti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + } + ] + "###); + }, + ) + .await; + + // Check if the Comparison filter works + index + .search( + json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + }, + { + "id": 654, + "father": "pierre", + "mother": "sabine", + "doggos": [ + { + "name": "gros bill", + "age": 8 + } + ], + "cattos": [ + "simba", + "pestiféré" + ] + }, + { + "id": 951, + "father": "jean-baptiste", + "mother": "sophie", + "doggos": [ + { + "name": "turbo", + "age": 5 + }, + { + "name": "fast", + "age": 6 + } + ], + "cattos": [ + "moumoute", + "gomez" + ] + } + ] + "###); + }, + ) + .await; + + // Update the settings deactivate equality filter + let (task, code) = index + .update_settings(json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": false, "comparison": true} + } + }]})) + .await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // Check if the Equality filter returns an error + index + .search( + json!({ + "filter": "cattos = pésti" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `=` is not allowed for the attribute `cattos`.\n - Note: allowed operators: OR, AND, NOT, <, >, <=, >=, TO, IS EMPTY, IS NULL, EXISTS.\n - Note: field `cattos` matched rule #0 in `filterableAttributes`", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + // Check if the Comparison filter works + index + .search( + json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + }, + { + "id": 654, + "father": "pierre", + "mother": "sabine", + "doggos": [ + { + "name": "gros bill", + "age": 8 + } + ], + "cattos": [ + "simba", + "pestiféré" + ] + }, + { + "id": 951, + "father": "jean-baptiste", + "mother": "sophie", + "doggos": [ + { + "name": "turbo", + "age": 5 + }, + { + "name": "fast", + "age": 6 + } + ], + "cattos": [ + "moumoute", + "gomez" + ] + } + ] + "###); + }, + ) + .await; + + // rollback the settings + let (task, code) = index + .update_settings(json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]})) + .await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // Check if the Equality filter works + index + .search( + json!({ + "filter": "cattos = pésti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + } + ] + "###); + }, + ) + .await; + + // Check if the Comparison filter returns an error + index + .search( + json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `>` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; +} + +#[actix_rt::test] +async fn test_filterable_attributes_priority() { + // Test that the filterable attributes priority is respected + + // check if doggos.name is filterable + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [ + // deactivated filter + {"attributePatterns": ["doggos.a*"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}, + // activated filter + {"attributePatterns": ["doggos.*"]}, + ]}), + &json!({ + "filter": "doggos.name = bobby" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + } + ] + "###); + }, + ) + .await; + + // check if doggos.name is filterable 2 + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [ + // deactivated filter + {"attributePatterns": ["doggos"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}, + // activated filter + {"attributePatterns": ["doggos.*"]}, + ]}), + &json!({ + "filter": "doggos.name = bobby" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti" + } + ] + "###); + }, + ) + .await; + + // check if doggos.age is not filterable + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [ + // deactivated filter + {"attributePatterns": ["doggos.a*"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}, + // activated filter + {"attributePatterns": ["doggos.*"]}, + ]}), + &json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Attribute `doggos.age` is not filterable. Available filterable attribute patterns are: `doggos.*`.\n1:11 doggos.age > 2", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + // check if doggos is not filterable + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [ + // deactivated filter + {"attributePatterns": ["doggos"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}, + // activated filter + {"attributePatterns": ["doggos.*"]}, + ]}), + &json!({ + "filter": "doggos EXISTS" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Attribute `doggos` is not filterable. Available filterable attribute patterns are: `doggos.*`.\n1:7 doggos EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; +} diff --git a/crates/meilisearch/tests/search/geo.rs b/crates/meilisearch/tests/search/geo.rs index b0cc8b6ca..a314ca241 100644 --- a/crates/meilisearch/tests/search/geo.rs +++ b/crates/meilisearch/tests/search/geo.rs @@ -1,9 +1,12 @@ use meili_snap::{json_string, snapshot}; +use meilisearch_types::milli::constants::RESERVED_GEO_FIELD_NAME; use once_cell::sync::Lazy; use crate::common::{Server, Value}; use crate::json; +use super::test_settings_documents_indexing_swapping_and_search; + static DOCUMENTS: Lazy = Lazy::new(|| { json!([ { @@ -184,3 +187,184 @@ async fn bug_4640() { ) .await; } + +#[actix_rt::test] +async fn geo_asc_with_words() { + let documents = json!([ + { "id": 0, "doggo": "jean", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 0 } }, + { "id": 1, "doggo": "intel", RESERVED_GEO_FIELD_NAME: { "lat": 88, "lng": 0 } }, + { "id": 2, "doggo": "jean bob", RESERVED_GEO_FIELD_NAME: { "lat": -89, "lng": 0 } }, + { "id": 3, "doggo": "jean michel", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 178 } }, + { "id": 4, "doggo": "bob marley", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": -179 } }, + ]); + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({"searchableAttributes": ["id", "doggo"], "rankingRules": ["words", "geo:asc"]}), + &json!({"q": "jean"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "id": 0, + "doggo": "jean", + "_geo": { + "lat": 0, + "lng": 0 + } + }, + { + "id": 2, + "doggo": "jean bob", + "_geo": { + "lat": -89, + "lng": 0 + } + }, + { + "id": 3, + "doggo": "jean michel", + "_geo": { + "lat": 0, + "lng": 178 + } + } + ], + "query": "jean", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3 + } + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({"searchableAttributes": ["id", "doggo"], "rankingRules": ["words", "geo:asc"]}), + &json!({"q": "bob"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "id": 2, + "doggo": "jean bob", + "_geo": { + "lat": -89, + "lng": 0 + } + }, + { + "id": 4, + "doggo": "bob marley", + "_geo": { + "lat": 0, + "lng": -179 + } + } + ], + "query": "bob", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({"searchableAttributes": ["id", "doggo"], "rankingRules": ["words", "geo:asc"]}), + &json!({"q": "intel"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "id": 1, + "doggo": "intel", + "_geo": { + "lat": 88, + "lng": 0 + } + } + ], + "query": "intel", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + }, + ) + .await; +} + +#[actix_rt::test] +async fn geo_sort_with_words() { + let documents = json!([ + { "id": 0, "doggo": "jean", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 0 } }, + { "id": 1, "doggo": "intel", RESERVED_GEO_FIELD_NAME: { "lat": 88, "lng": 0 } }, + { "id": 2, "doggo": "jean bob", RESERVED_GEO_FIELD_NAME: { "lat": -89, "lng": 0 } }, + { "id": 3, "doggo": "jean michel", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 178 } }, + { "id": 4, "doggo": "bob marley", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": -179 } }, + ]); + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({"searchableAttributes": ["id", "doggo"], "rankingRules": ["words", "sort"], "sortableAttributes": [RESERVED_GEO_FIELD_NAME]}), + &json!({"q": "jean", "sort": ["_geoPoint(0.0, 0.0):asc"]}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "id": 0, + "doggo": "jean", + "_geo": { + "lat": 0, + "lng": 0 + }, + "_geoDistance": 0 + }, + { + "id": 2, + "doggo": "jean bob", + "_geo": { + "lat": -89, + "lng": 0 + }, + "_geoDistance": 9896348 + }, + { + "id": 3, + "doggo": "jean michel", + "_geo": { + "lat": 0, + "lng": 178 + }, + "_geoDistance": 19792697 + } + ], + "query": "jean", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3 + } + "###); + }, + ) + .await; +} diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index a5fa94eea..d7a09b58e 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -4,6 +4,7 @@ mod distinct; mod errors; mod facet_search; +mod filters; mod formatted; mod geo; mod hybrid; @@ -21,10 +22,58 @@ use tempfile::TempDir; use crate::common::{ default_settings, shared_index_with_documents, shared_index_with_nested_documents, Server, - DOCUMENTS, FRUITS_DOCUMENTS, NESTED_DOCUMENTS, SCORE_DOCUMENTS, VECTOR_DOCUMENTS, + Value, DOCUMENTS, FRUITS_DOCUMENTS, NESTED_DOCUMENTS, SCORE_DOCUMENTS, VECTOR_DOCUMENTS, }; use crate::json; +async fn test_settings_documents_indexing_swapping_and_search( + documents: &Value, + settings: &Value, + query: &Value, + test: impl Fn(Value, actix_http::StatusCode) + std::panic::UnwindSafe + Clone, +) { + let temp = TempDir::new().unwrap(); + let server = Server::new_with_options(Opt { ..default_settings(temp.path()) }).await.unwrap(); + + eprintln!("Documents -> Settings -> test"); + let index = server.index("test"); + + let (task, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (task, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + index.search(query.clone(), test.clone()).await; + let (task, code) = server.delete_index("test").await; + assert_eq!(code, 202, "{}", task); + let response = server.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + eprintln!("Settings -> Documents -> test"); + let index = server.index("test"); + + let (task, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (task, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + index.search(query.clone(), test.clone()).await; + let (task, code) = server.delete_index("test").await; + assert_eq!(code, 202, "{}", task); + let response = server.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); +} + #[actix_rt::test] async fn simple_placeholder_search() { let index = shared_index_with_documents().await; @@ -355,118 +404,6 @@ async fn search_multiple_params() { .await; } -#[actix_rt::test] -async fn search_with_filter_string_notation() { - let server = Server::new().await; - let index = server.index("test"); - - let (_, code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - meili_snap::snapshot!(code, @"202 Accepted"); - - let documents = DOCUMENTS.clone(); - let (task, code) = index.add_documents(documents, None).await; - meili_snap::snapshot!(code, @"202 Accepted"); - let res = index.wait_task(task.uid()).await; - meili_snap::snapshot!(res["status"], @r###""succeeded""###); - - index - .search( - json!({ - "filter": "title = Gläss" - }), - |response, code| { - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 1); - }, - ) - .await; - - let index = server.index("nested"); - - let (_, code) = - index.update_settings(json!({"filterableAttributes": ["cattos", "doggos.age"]})).await; - meili_snap::snapshot!(code, @"202 Accepted"); - - let documents = NESTED_DOCUMENTS.clone(); - let (task, code) = index.add_documents(documents, None).await; - meili_snap::snapshot!(code, @"202 Accepted"); - let res = index.wait_task(task.uid()).await; - meili_snap::snapshot!(res["status"], @r###""succeeded""###); - - index - .search( - json!({ - "filter": "cattos = pésti" - }), - |response, code| { - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 1); - assert_eq!(response["hits"][0]["id"], json!(852)); - }, - ) - .await; - - index - .search( - json!({ - "filter": "doggos.age > 5" - }), - |response, code| { - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 2); - assert_eq!(response["hits"][0]["id"], json!(654)); - assert_eq!(response["hits"][1]["id"], json!(951)); - }, - ) - .await; -} - -#[actix_rt::test] -async fn search_with_filter_array_notation() { - let index = shared_index_with_documents().await; - let (response, code) = index - .search_post(json!({ - "filter": ["title = Gläss"] - })) - .await; - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 1); - - let (response, code) = index - .search_post(json!({ - "filter": [["title = Gläss", "title = \"Shazam!\"", "title = \"Escape Room\""]] - })) - .await; - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 3); -} - -#[actix_rt::test] -async fn search_with_contains_filter() { - let temp = TempDir::new().unwrap(); - let server = Server::new_with_options(Opt { - experimental_contains_filter: true, - ..default_settings(temp.path()) - }) - .await - .unwrap(); - let index = server.index("movies"); - - index.update_settings(json!({"filterableAttributes": ["title"]})).await; - - let documents = DOCUMENTS.clone(); - let (request, _code) = index.add_documents(documents, None).await; - index.wait_task(request.uid()).await.succeeded(); - - let (response, code) = index - .search_post(json!({ - "filter": "title CONTAINS cap" - })) - .await; - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 2); -} - #[actix_rt::test] async fn search_with_sort_on_numbers() { let index = shared_index_with_documents().await; @@ -589,7 +526,7 @@ async fn search_facet_distribution() { |response, code| { assert_eq!(code, 200, "{}", response); let dist = response["facetDistribution"].as_object().unwrap(); - assert_eq!(dist.len(), 1); + assert_eq!(dist.len(), 1, "{:?}", dist); assert_eq!( dist["doggos.name"], json!({ "bobby": 1, "buddy": 1, "gros bill": 1, "turbo": 1, "fast": 1}) @@ -606,7 +543,7 @@ async fn search_facet_distribution() { |response, code| { assert_eq!(code, 200, "{}", response); let dist = response["facetDistribution"].as_object().unwrap(); - assert_eq!(dist.len(), 3); + assert_eq!(dist.len(), 3, "{:?}", dist); assert_eq!( dist["doggos.name"], json!({ "bobby": 1, "buddy": 1, "gros bill": 1, "turbo": 1, "fast": 1}) @@ -1559,6 +1496,293 @@ async fn change_attributes_settings() { .await; } +#[actix_rt::test] +async fn test_nested_fields() { + let documents = json!([ + { + "id": 0, + "title": "The zeroth document", + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule", + }, + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field", + }, + { + "prout": "truc", + "machin": "lol", + }, + ], + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied", + }, + ]); + + let settings = json!({ + "searchableAttributes": ["title", "nested.object", "nested.machin"], + "filterableAttributes": ["title", "nested.object", "nested.machin"] + }); + + // Test empty search returns all documents + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "document"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "title": "The zeroth document" + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied" + } + ] + "###); + }, + ) + .await; + + // Test searching specific documents + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "zeroth"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "title": "The zeroth document" + } + ] + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "first"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + } + ] + "###); + }, + ) + .await; + + // Test searching nested fields + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "field"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + } + ] + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "array"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + // nested is not searchable + snapshot!(json_string!(response["hits"]), @"[]"); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "lied"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + // nested is not searchable + snapshot!(json_string!(response["hits"]), @"[]"); + }, + ) + .await; + + // Test filtering on nested fields + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"filter": "nested.object = field"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + } + ] + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"filter": "nested.machin = bidule"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + } + ] + "###); + }, + ) + .await; + + // Test filtering on non-filterable nested field fails + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"filter": "nested = array"}), + |response, code| { + assert_eq!(code, 400, "{}", response); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Attribute `nested` is not filterable. Available filterable attribute patterns are: `nested.machin`, `nested.object`, `title`.\n1:7 nested = array", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + // Test filtering on non-filterable nested field fails + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"filter": r#"nested = "I lied""#}), + |response, code| { + assert_eq!(code, 400, "{}", response); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Attribute `nested` is not filterable. Available filterable attribute patterns are: `nested.machin`, `nested.object`, `title`.\n1:7 nested = \"I lied\"", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; +} + /// Modifying facets with different casing should work correctly #[actix_rt::test] async fn change_facet_casing() { diff --git a/crates/meilisearch/tests/search/multi/mod.rs b/crates/meilisearch/tests/search/multi/mod.rs index 2a95a5dd2..df8b2f1eb 100644 --- a/crates/meilisearch/tests/search/multi/mod.rs +++ b/crates/meilisearch/tests/search/multi/mod.rs @@ -3647,7 +3647,7 @@ async fn federation_non_faceted_for_an_index() { snapshot!(code, @"400 Bad Request"); insta::assert_json_snapshot!(response, { ".processingTimeMs" => "[time]" }, @r###" { - "message": "Inside `.federation.facetsByIndex.fruits-no-name`: Invalid facet distribution, attribute `name` is not filterable. The available filterable attributes are `BOOST, id`.\n - Note: index `fruits-no-name` used in `.queries[1]`", + "message": "Inside `.federation.facetsByIndex.fruits-no-name`: Invalid facet distribution, attribute `name` is not filterable. The available filterable attribute patterns are `BOOST, id`.\n - Note: index `fruits-no-name` used in `.queries[1]`", "code": "invalid_multi_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_multi_search_facets" @@ -3669,7 +3669,7 @@ async fn federation_non_faceted_for_an_index() { snapshot!(code, @"400 Bad Request"); insta::assert_json_snapshot!(response, { ".processingTimeMs" => "[time]" }, @r###" { - "message": "Inside `.federation.facetsByIndex.fruits-no-name`: Invalid facet distribution, attribute `name` is not filterable. The available filterable attributes are `BOOST, id`.\n - Note: index `fruits-no-name` is not used in queries", + "message": "Inside `.federation.facetsByIndex.fruits-no-name`: Invalid facet distribution, attribute `name` is not filterable. The available filterable attribute patterns are `BOOST, id`.\n - Note: index `fruits-no-name` is not used in queries", "code": "invalid_multi_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_multi_search_facets" diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index 2a7d713f2..fbb97f999 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ b/crates/meilisearch/tests/settings/get_settings.rs @@ -1,3 +1,5 @@ +use meili_snap::{json_string, snapshot}; + use crate::common::Server; use crate::json; @@ -510,3 +512,127 @@ async fn set_and_reset_distinct_attribute_with_dedicated_route() { assert_eq!(response, json!(null)); } + +#[actix_rt::test] +async fn granular_filterable_attributes() { + let server = Server::new().await; + let index = server.index("test"); + index.create(None).await; + + let (response, code) = + index.update_settings(json!({ "filterableAttributes": [ + { "attributePatterns": ["name"], "features": { "facetSearch": true, "filter": {"equality": true, "comparison": false} } }, + { "attributePatterns": ["age"], "features": { "facetSearch": false, "filter": {"equality": true, "comparison": true} } }, + { "attributePatterns": ["id"] }, + { "attributePatterns": ["default-filterable-features-null"], "features": { "facetSearch": true } }, + { "attributePatterns": ["default-filterable-features-equality"], "features": { "facetSearch": true, "filter": {"comparison": true} } }, + { "attributePatterns": ["default-filterable-features-comparison"], "features": { "facetSearch": true, "filter": {"equality": true} } }, + { "attributePatterns": ["default-filterable-features-empty"], "features": { "facetSearch": true, "filter": {} } }, + { "attributePatterns": ["default-facet-search"], "features": { "filter": {"equality": true, "comparison": true} } }, + ] })).await; + assert_eq!(code, 202); + index.wait_task(response.uid()).await.succeeded(); + + let (response, code) = index.settings().await; + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["filterableAttributes"]), @r###" + [ + { + "attributePatterns": [ + "name" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "age" + ], + "features": { + "facetSearch": false, + "filter": { + "equality": true, + "comparison": true + } + } + }, + { + "attributePatterns": [ + "id" + ], + "features": { + "facetSearch": false, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "default-filterable-features-null" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "default-filterable-features-equality" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": true + } + } + }, + { + "attributePatterns": [ + "default-filterable-features-comparison" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "default-filterable-features-empty" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "default-facet-search" + ], + "features": { + "facetSearch": false, + "filter": { + "equality": true, + "comparison": true + } + } + } + ] + "###); +} diff --git a/crates/meilisearch/tests/similar/errors.rs b/crates/meilisearch/tests/similar/errors.rs index 75bd6e46b..5c4ac1f38 100644 --- a/crates/meilisearch/tests/similar/errors.rs +++ b/crates/meilisearch/tests/similar/errors.rs @@ -452,18 +452,19 @@ async fn filter_invalid_attribute_array() { snapshot!(code, @"202 Accepted"); index.wait_task(value.uid()).await.succeeded(); - let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", - "code": "invalid_similar_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_similar_filter" - }); index .similar( json!({"id": 287947, "filter": ["many = Glass"], "embedder": "manual"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `many` is not filterable. Available filterable attribute patterns are: `title`.\n1:5 many = Glass", + "code": "invalid_similar_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_filter" + } + "###); }, ) .await; @@ -492,18 +493,19 @@ async fn filter_invalid_attribute_string() { snapshot!(code, @"202 Accepted"); index.wait_task(value.uid()).await.succeeded(); - let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", - "code": "invalid_similar_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_similar_filter" - }); index .similar( json!({"id": 287947, "filter": "many = Glass", "embedder": "manual"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `many` is not filterable. Available filterable attribute patterns are: `title`.\n1:5 many = Glass", + "code": "invalid_similar_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_filter" + } + "###); }, ) .await; diff --git a/crates/milli/src/attribute_patterns.rs b/crates/milli/src/attribute_patterns.rs new file mode 100644 index 000000000..00caa2a6d --- /dev/null +++ b/crates/milli/src/attribute_patterns.rs @@ -0,0 +1,152 @@ +use deserr::Deserr; +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +use crate::is_faceted_by; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[repr(transparent)] +#[serde(transparent)] +pub struct AttributePatterns { + #[schema(example = json!(["title", "overview_*", "release_date"]))] + pub patterns: Vec, +} + +impl Deserr for AttributePatterns { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + Vec::::deserialize_from_value(value, location).map(|patterns| Self { patterns }) + } +} + +impl From> for AttributePatterns { + fn from(patterns: Vec) -> Self { + Self { patterns } + } +} + +impl AttributePatterns { + /// Match a string against the attribute patterns using the match_pattern function. + pub fn match_str(&self, str: &str) -> PatternMatch { + let mut pattern_match = PatternMatch::NoMatch; + for pattern in &self.patterns { + match match_pattern(pattern, str) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => pattern_match = PatternMatch::Parent, + PatternMatch::NoMatch => {} + } + } + pattern_match + } +} + +/// Match a string against a pattern. +/// +/// The pattern can be a wildcard, a prefix, a suffix or an exact match. +/// +/// # Arguments +/// +/// * `pattern` - The pattern to match against. +/// * `str` - The string to match against the pattern. +fn match_pattern(pattern: &str, str: &str) -> PatternMatch { + // If the pattern is a wildcard, return Match + if pattern == "*" { + return PatternMatch::Match; + } else if pattern.starts_with('*') && pattern.ends_with('*') { + // If the pattern starts and ends with a wildcard, return Match if the string contains the pattern without the wildcards + if str.contains(&pattern[1..pattern.len() - 1]) { + return PatternMatch::Match; + } + } else if let Some(pattern) = pattern.strip_prefix('*') { + // If the pattern starts with a wildcard, return Match if the string ends with the pattern without the wildcard + if str.ends_with(pattern) { + return PatternMatch::Match; + } + } else if let Some(pattern) = pattern.strip_suffix('*') { + // If the pattern ends with a wildcard, return Match if the string starts with the pattern without the wildcard + if str.starts_with(pattern) { + return PatternMatch::Match; + } + } else if pattern == str { + // If the pattern is exactly the string, return Match + return PatternMatch::Match; + } + + // If the field is a parent field of the pattern, return Parent + if is_faceted_by(pattern, str) { + PatternMatch::Parent + } else { + PatternMatch::NoMatch + } +} + +/// Match a field against a pattern using the legacy behavior. +/// +/// A field matches a pattern if it is a parent of the pattern or if it is the pattern itself. +/// This behavior is used to match the sortable attributes, the searchable attributes and the filterable attributes rules `Field`. +/// +/// # Arguments +/// +/// * `pattern` - The pattern to match against. +/// * `field` - The field to match against the pattern. +pub fn match_field_legacy(pattern: &str, field: &str) -> PatternMatch { + if is_faceted_by(field, pattern) { + // If the field matches the pattern or is a nested field of the pattern, return Match (legacy behavior) + PatternMatch::Match + } else if is_faceted_by(pattern, field) { + // If the field is a parent field of the pattern, return Parent + PatternMatch::Parent + } else { + // If the field does not match the pattern and is not a parent of a nested field that matches the pattern, return NoMatch + PatternMatch::NoMatch + } +} + +/// Match a field against a distinct field. +pub fn match_distinct_field(distinct_field: Option<&str>, field: &str) -> PatternMatch { + if let Some(distinct_field) = distinct_field { + if field == distinct_field { + // If the field matches exactly the distinct field, return Match + return PatternMatch::Match; + } else if is_faceted_by(distinct_field, field) { + // If the field is a parent field of the distinct field, return Parent + return PatternMatch::Parent; + } + } + // If the field does not match the distinct field and is not a parent of a nested field that matches the distinct field, return NoMatch + PatternMatch::NoMatch +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PatternMatch { + /// The field is a parent of a nested field that matches the pattern + /// For example, the field is `toto`, and the pattern is `toto.titi` + Parent, + /// The field matches the pattern + Match, + /// The field does not match the pattern + NoMatch, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_match_pattern() { + assert_eq!(match_pattern("*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("test*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("test*", "testa"), PatternMatch::Match); + assert_eq!(match_pattern("*test", "test"), PatternMatch::Match); + assert_eq!(match_pattern("*test", "atest"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "atesta"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "atest"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "testa"), PatternMatch::Match); + assert_eq!(match_pattern("test*test", "test"), PatternMatch::NoMatch); + assert_eq!(match_pattern("*test", "testa"), PatternMatch::NoMatch); + assert_eq!(match_pattern("test*", "atest"), PatternMatch::NoMatch); + } +} diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index c977362d6..f0972de75 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -122,10 +122,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco and can not be more than 511 bytes.", .document_id.to_string() )] InvalidDocumentId { document_id: Value }, - #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))] + #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_patterns))] InvalidFacetsDistribution { invalid_facets_name: BTreeSet, - valid_facets_name: BTreeSet, + valid_patterns: BTreeSet, }, #[error(transparent)] InvalidGeoField(#[from] GeoError), @@ -139,6 +139,13 @@ and can not be more than 511 bytes.", .document_id.to_string() InvalidFilter(String), #[error("Invalid type for filter subexpression: expected: {}, found: {}.", .0.join(", "), .1)] InvalidFilterExpression(&'static [&'static str], Value), + #[error("Filter operator `{operator}` is not allowed for the attribute `{field}`.\n - Note: allowed operators: {}.\n - Note: field `{field}` {} in `filterableAttributes`", allowed_operators.join(", "), format!("matched rule #{rule_index}"))] + FilterOperatorNotAllowed { + field: String, + allowed_operators: Vec, + operator: String, + rule_index: usize, + }, #[error("Attribute `{}` is not sortable. {}", .field, match .valid_fields.is_empty() { @@ -152,28 +159,32 @@ and can not be more than 511 bytes.", .document_id.to_string() InvalidSortableAttribute { field: String, valid_fields: BTreeSet, hidden_fields: bool }, #[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. {}", .field, - match .valid_fields.is_empty() { + match .valid_patterns.is_empty() { true => "This index does not have configured filterable attributes.".to_string(), - false => format!("Available filterable attributes are: `{}{}`.", - valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "), + false => format!("Available filterable attributes patterns are: `{}{}`.", + valid_patterns.iter().map(AsRef::as_ref).collect::>().join(", "), .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""), ), } )] - InvalidDistinctAttribute { field: String, valid_fields: BTreeSet, hidden_fields: bool }, + InvalidDistinctAttribute { + field: String, + valid_patterns: BTreeSet, + hidden_fields: bool, + }, #[error("Attribute `{}` is not facet-searchable. {}", .field, - match .valid_fields.is_empty() { + match .valid_patterns.is_empty() { true => "This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.".to_string(), - false => format!("Available facet-searchable attributes are: `{}{}`. To make it facet-searchable add it to the `filterableAttributes` index settings.", - valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "), + false => format!("Available facet-searchable attributes patterns are: `{}{}`. To make it facet-searchable add it to the `filterableAttributes` index settings.", + valid_patterns.iter().map(AsRef::as_ref).collect::>().join(", "), .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""), ), } )] InvalidFacetSearchFacetName { field: String, - valid_fields: BTreeSet, + valid_patterns: BTreeSet, hidden_fields: bool, }, #[error("Attribute `{}` is not searchable. Available searchable attributes are: `{}{}`.", @@ -380,9 +391,9 @@ pub enum GeoError { fn format_invalid_filter_distribution( invalid_facets_name: &BTreeSet, - valid_facets_name: &BTreeSet, + valid_patterns: &BTreeSet, ) -> String { - if valid_facets_name.is_empty() { + if valid_patterns.is_empty() { return "this index does not have configured filterable attributes.".into(); } @@ -404,17 +415,17 @@ fn format_invalid_filter_distribution( .unwrap(), }; - match valid_facets_name.len() { + match valid_patterns.len() { 1 => write!( result, - " The available filterable attribute is `{}`.", - valid_facets_name.first().unwrap() + " The available filterable attribute pattern is `{}`.", + valid_patterns.first().unwrap() ) .unwrap(), _ => write!( result, - " The available filterable attributes are `{}`.", - valid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", ") + " The available filterable attribute patterns are `{}`.", + valid_patterns.iter().map(AsRef::as_ref).collect::>().join(", ") ) .unwrap(), } diff --git a/crates/milli/src/fieldids_weights_map.rs b/crates/milli/src/fieldids_weights_map.rs index 57c99f77f..0c57ba109 100644 --- a/crates/milli/src/fieldids_weights_map.rs +++ b/crates/milli/src/fieldids_weights_map.rs @@ -43,11 +43,6 @@ impl FieldidsWeightsMap { self.map.get(&fid).copied() } - /// Returns highest weight contained in the map if any. - pub fn max_weight(&self) -> Option { - self.map.values().copied().max() - } - /// Return an iterator visiting all field ids in arbitrary order. pub fn ids(&self) -> impl Iterator + '_ { self.map.keys().copied() diff --git a/crates/milli/src/fields_ids_map/global.rs b/crates/milli/src/fields_ids_map/global.rs index 2ffc45eb7..235d509e9 100644 --- a/crates/milli/src/fields_ids_map/global.rs +++ b/crates/milli/src/fields_ids_map/global.rs @@ -105,6 +105,18 @@ impl<'indexing> GlobalFieldsIdsMap<'indexing> { self.local.name(id) } + + /// Get the metadata of a field based on its id. + pub fn metadata(&mut self, id: FieldId) -> Option { + if self.local.metadata(id).is_none() { + let global = self.global.read().unwrap(); + + let (name, metadata) = global.name_with_metadata(id)?; + self.local.insert(name, id, metadata); + } + + self.local.metadata(id) + } } impl<'indexing> MutFieldIdMapper for GlobalFieldsIdsMap<'indexing> { diff --git a/crates/milli/src/fields_ids_map/metadata.rs b/crates/milli/src/fields_ids_map/metadata.rs index 65a1111fa..89b0a446b 100644 --- a/crates/milli/src/fields_ids_map/metadata.rs +++ b/crates/milli/src/fields_ids_map/metadata.rs @@ -5,14 +5,29 @@ use charabia::Language; use heed::RoTxn; use super::FieldsIdsMap; -use crate::{FieldId, Index, LocalizedAttributesRule, Result}; +use crate::attribute_patterns::{match_field_legacy, PatternMatch}; +use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; +use crate::{ + is_faceted_by, FieldId, FilterableAttributesFeatures, FilterableAttributesRule, Index, + LocalizedAttributesRule, Result, Weight, +}; #[derive(Debug, Clone, Copy)] pub struct Metadata { - pub searchable: bool, - pub filterable: bool, + /// The weight as defined in the FieldidsWeightsMap of the searchable attribute if it is searchable. + pub searchable: Option, + /// The field is part of the sortable attributes. pub sortable: bool, - localized_attributes_rule_id: Option, + /// The field is defined as the distinct attribute. + pub distinct: bool, + /// The field has been defined as asc/desc in the ranking rules. + pub asc_desc: bool, + /// The field is a geo field (`_geo`, `_geo.lat`, `_geo.lng`). + pub geo: bool, + /// The id of the localized attributes rule if the field is localized. + pub localized_attributes_rule_id: Option, + /// The id of the filterable attributes rule if the field is filterable. + pub filterable_attributes_rule_id: Option, } #[derive(Debug, Clone)] @@ -106,76 +121,227 @@ impl Metadata { let rule = rules.get((localized_attributes_rule_id - 1) as usize).unwrap(); Some(rule.locales()) } + + pub fn filterable_attributes<'rules>( + &self, + rules: &'rules [FilterableAttributesRule], + ) -> Option<&'rules FilterableAttributesRule> { + self.filterable_attributes_with_rule_index(rules).map(|(_, rule)| rule) + } + + pub fn filterable_attributes_with_rule_index<'rules>( + &self, + rules: &'rules [FilterableAttributesRule], + ) -> Option<(usize, &'rules FilterableAttributesRule)> { + let filterable_attributes_rule_id = self.filterable_attributes_rule_id?.get(); + let rule_id = (filterable_attributes_rule_id - 1) as usize; + let rule = rules.get(rule_id).unwrap(); + Some((rule_id, rule)) + } + + pub fn filterable_attributes_features( + &self, + rules: &[FilterableAttributesRule], + ) -> FilterableAttributesFeatures { + let (_, features) = self.filterable_attributes_features_with_rule_index(rules); + features + } + + pub fn filterable_attributes_features_with_rule_index( + &self, + rules: &[FilterableAttributesRule], + ) -> (Option, FilterableAttributesFeatures) { + self.filterable_attributes_with_rule_index(rules) + .map(|(rule_index, rule)| (Some(rule_index), rule.features())) + // if there is no filterable attributes rule, return no features + .unwrap_or_else(|| (None, FilterableAttributesFeatures::no_features())) + } + + pub fn is_sortable(&self) -> bool { + self.sortable + } + + pub fn is_searchable(&self) -> bool { + self.searchable.is_some() + } + + pub fn searchable_weight(&self) -> Option { + self.searchable + } + + pub fn is_distinct(&self) -> bool { + self.distinct + } + + pub fn is_asc_desc(&self) -> bool { + self.asc_desc + } + + pub fn is_geo(&self) -> bool { + self.geo + } + + /// Returns `true` if the field is part of the facet databases. (sortable, distinct, asc_desc, filterable or facet searchable) + pub fn is_faceted(&self, rules: &[FilterableAttributesRule]) -> bool { + if self.is_distinct() || self.is_sortable() || self.is_asc_desc() { + return true; + } + + let features = self.filterable_attributes_features(rules); + if features.is_filterable() || features.is_facet_searchable() { + return true; + } + + false + } + + pub fn require_facet_level_database(&self, rules: &[FilterableAttributesRule]) -> bool { + let features = self.filterable_attributes_features(rules); + + self.is_sortable() || self.is_asc_desc() || features.is_filterable_comparison() + } } #[derive(Debug, Clone)] pub struct MetadataBuilder { - searchable_attributes: Vec, - filterable_attributes: HashSet, + searchable_attributes: Option>, + filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, + distinct_attribute: Option, + asc_desc_attributes: HashSet, } impl MetadataBuilder { pub fn from_index(index: &Index, rtxn: &RoTxn) -> Result { - let searchable_attributes = - index.searchable_fields(rtxn)?.into_iter().map(|s| s.to_string()).collect(); - let filterable_attributes = index.filterable_fields(rtxn)?; + let searchable_attributes = index + .user_defined_searchable_fields(rtxn)? + .map(|fields| fields.into_iter().map(|s| s.to_string()).collect()); + let filterable_attributes = index.filterable_attributes_rules(rtxn)?; let sortable_attributes = index.sortable_fields(rtxn)?; let localized_attributes = index.localized_attributes_rules(rtxn)?; + let distinct_attribute = index.distinct_field(rtxn)?.map(|s| s.to_string()); + let asc_desc_attributes = index.asc_desc_fields(rtxn)?; - Ok(Self { + Ok(Self::new( searchable_attributes, filterable_attributes, sortable_attributes, localized_attributes, - }) + distinct_attribute, + asc_desc_attributes, + )) } + /// Build a new `MetadataBuilder` from the given parameters. + /// + /// This is used for testing, prefer using `MetadataBuilder::from_index` instead. pub fn new( - searchable_attributes: Vec, - filterable_attributes: HashSet, + searchable_attributes: Option>, + filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, + distinct_attribute: Option, + asc_desc_attributes: HashSet, ) -> Self { + let searchable_attributes = match searchable_attributes { + Some(fields) if fields.iter().any(|f| f == "*") => None, + Some(fields) => Some(fields), + None => None, + }; + Self { searchable_attributes, filterable_attributes, sortable_attributes, localized_attributes, + distinct_attribute, + asc_desc_attributes, } } pub fn metadata_for_field(&self, field: &str) -> Metadata { - let searchable = self - .searchable_attributes + if is_faceted_by(field, RESERVED_VECTORS_FIELD_NAME) { + // Vectors fields are not searchable, filterable, distinct or asc_desc + return Metadata { + searchable: None, + sortable: false, + distinct: false, + asc_desc: false, + geo: false, + localized_attributes_rule_id: None, + filterable_attributes_rule_id: None, + }; + } + + // A field is sortable if it is faceted by a sortable attribute + let sortable = self + .sortable_attributes .iter() - .any(|attribute| attribute == "*" || attribute == field); + .any(|pattern| match_field_legacy(pattern, field) == PatternMatch::Match); - let filterable = self.filterable_attributes.contains(field); + let filterable_attributes_rule_id = self + .filterable_attributes + .iter() + .position(|attribute| attribute.match_str(field) == PatternMatch::Match) + // saturating_add(1): make `id` `NonZero` + .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - let sortable = self.sortable_attributes.contains(field); + if match_field_legacy(RESERVED_GEO_FIELD_NAME, field) == PatternMatch::Match { + // Geo fields are not searchable, distinct or asc_desc + return Metadata { + searchable: None, + sortable, + distinct: false, + asc_desc: false, + geo: true, + localized_attributes_rule_id: None, + filterable_attributes_rule_id, + }; + } + + let searchable = match &self.searchable_attributes { + // A field is searchable if it is faceted by a searchable attribute + Some(attributes) => attributes + .iter() + .enumerate() + .find(|(_i, pattern)| is_faceted_by(field, pattern)) + .map(|(i, _)| i as u16), + None => Some(0), + }; + + let distinct = + self.distinct_attribute.as_ref().is_some_and(|distinct_field| field == distinct_field); + let asc_desc = self.asc_desc_attributes.contains(field); let localized_attributes_rule_id = self .localized_attributes .iter() .flat_map(|v| v.iter()) - .position(|rule| rule.match_str(field)) + .position(|rule| rule.match_str(field) == PatternMatch::Match) // saturating_add(1): make `id` `NonZero` .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - Metadata { searchable, filterable, sortable, localized_attributes_rule_id } + Metadata { + searchable, + sortable, + distinct, + asc_desc, + geo: false, + localized_attributes_rule_id, + filterable_attributes_rule_id, + } } - pub fn searchable_attributes(&self) -> &[String] { - self.searchable_attributes.as_slice() + pub fn searchable_attributes(&self) -> Option<&[String]> { + self.searchable_attributes.as_deref() } pub fn sortable_attributes(&self) -> &HashSet { &self.sortable_attributes } - pub fn filterable_attributes(&self) -> &HashSet { + pub fn filterable_attributes(&self) -> &[FilterableAttributesRule] { &self.filterable_attributes } diff --git a/crates/milli/src/filterable_attributes_rules.rs b/crates/milli/src/filterable_attributes_rules.rs new file mode 100644 index 000000000..53af30fd6 --- /dev/null +++ b/crates/milli/src/filterable_attributes_rules.rs @@ -0,0 +1,368 @@ +use deserr::{DeserializeError, Deserr, ValuePointerRef}; +use serde::{Deserialize, Serialize}; +use std::collections::{BTreeSet, HashSet}; +use utoipa::ToSchema; + +use crate::{ + attribute_patterns::{match_distinct_field, match_field_legacy, PatternMatch}, + constants::RESERVED_GEO_FIELD_NAME, + AttributePatterns, +}; + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, ToSchema)] +#[serde(untagged)] +pub enum FilterableAttributesRule { + Field(String), + Pattern(FilterableAttributesPatterns), +} + +impl FilterableAttributesRule { + /// Match a field against the filterable attributes rule. + pub fn match_str(&self, field: &str) -> PatternMatch { + match self { + // If the rule is a field, match the field against the pattern using the legacy behavior + FilterableAttributesRule::Field(pattern) => match_field_legacy(pattern, field), + // If the rule is a pattern, match the field against the pattern using the new behavior + FilterableAttributesRule::Pattern(patterns) => patterns.match_str(field), + } + } + + /// Check if the rule is a geo field. + /// + /// prefer using `index.is_geo_enabled`, `index.is_geo_filtering_enabled` or `index.is_geo_sorting_enabled` + /// to check if the geo feature is enabled. + pub fn has_geo(&self) -> bool { + matches!(self, FilterableAttributesRule::Field(field_name) if field_name == RESERVED_GEO_FIELD_NAME) + } + + /// Get the features of the rule. + pub fn features(&self) -> FilterableAttributesFeatures { + match self { + // If the rule is a field, return the legacy default features + FilterableAttributesRule::Field(_) => FilterableAttributesFeatures::legacy_default(), + // If the rule is a pattern, return the features of the pattern + FilterableAttributesRule::Pattern(patterns) => patterns.features(), + } + } +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub struct FilterableAttributesPatterns { + pub attribute_patterns: AttributePatterns, + #[serde(default)] + #[deserr(default)] + pub features: FilterableAttributesFeatures, +} + +impl FilterableAttributesPatterns { + pub fn match_str(&self, field: &str) -> PatternMatch { + self.attribute_patterns.match_str(field) + } + + pub fn features(&self) -> FilterableAttributesFeatures { + self.features + } +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +#[derive(Default)] +pub struct FilterableAttributesFeatures { + #[serde(default)] + #[deserr(default)] + facet_search: bool, + #[serde(default)] + #[deserr(default)] + filter: FilterFeatures, +} + +impl FilterableAttributesFeatures { + /// Create a new `FilterableAttributesFeatures` with the legacy default features. + /// + /// This is the default behavior for `FilterableAttributesRule::Field`. + /// This will set the facet search to true and activate all the filter operators. + pub fn legacy_default() -> Self { + Self { facet_search: true, filter: FilterFeatures::legacy_default() } + } + + /// Create a new `FilterableAttributesFeatures` with no features. + pub fn no_features() -> Self { + Self { facet_search: false, filter: FilterFeatures::no_features() } + } + + pub fn is_filterable(&self) -> bool { + self.filter.is_filterable() + } + + /// Check if `IS EMPTY` is allowed + pub fn is_filterable_empty(&self) -> bool { + self.filter.is_filterable_empty() + } + + /// Check if `=` and `IN` are allowed + pub fn is_filterable_equality(&self) -> bool { + self.filter.is_filterable_equality() + } + + /// Check if `IS NULL` is allowed + pub fn is_filterable_null(&self) -> bool { + self.filter.is_filterable_null() + } + + /// Check if `IS EXISTS` is allowed + pub fn is_filterable_exists(&self) -> bool { + self.filter.is_filterable_exists() + } + + /// Check if `<`, `>`, `<=`, `>=` or `TO` are allowed + pub fn is_filterable_comparison(&self) -> bool { + self.filter.is_filterable_comparison() + } + + /// Check if the facet search is allowed + pub fn is_facet_searchable(&self) -> bool { + self.facet_search + } + + pub fn allowed_filter_operators(&self) -> Vec { + self.filter.allowed_operators() + } +} + +impl Deserr for FilterableAttributesRule { + fn deserialize_from_value( + value: deserr::Value, + location: ValuePointerRef, + ) -> Result { + if value.kind() == deserr::ValueKind::Map { + Ok(Self::Pattern(FilterableAttributesPatterns::deserialize_from_value( + value, location, + )?)) + } else { + Ok(Self::Field(String::deserialize_from_value(value, location)?)) + } + } +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub struct FilterFeatures { + #[serde(default = "default_true")] + #[deserr(default = true)] + equality: bool, + #[serde(default)] + #[deserr(default)] + comparison: bool, +} + +fn default_true() -> bool { + true +} + +impl FilterFeatures { + /// Get the allowed operators for the filter. + pub fn allowed_operators(&self) -> Vec { + if !self.is_filterable() { + return vec![]; + } + + let mut operators = vec!["OR", "AND", "NOT"]; + if self.is_filterable_equality() { + operators.extend_from_slice(&["=", "!=", "IN"]); + } + if self.is_filterable_comparison() { + operators.extend_from_slice(&["<", ">", "<=", ">=", "TO"]); + } + if self.is_filterable_empty() { + operators.push("IS EMPTY"); + } + if self.is_filterable_null() { + operators.push("IS NULL"); + } + if self.is_filterable_exists() { + operators.push("EXISTS"); + } + + operators.into_iter().map(String::from).collect() + } + + pub fn is_filterable(&self) -> bool { + self.equality || self.comparison + } + + pub fn is_filterable_equality(&self) -> bool { + self.equality + } + + /// Check if `<`, `>`, `<=`, `>=` or `TO` are allowed + pub fn is_filterable_comparison(&self) -> bool { + self.comparison + } + + /// Check if `IS EMPTY` is allowed + pub fn is_filterable_empty(&self) -> bool { + self.is_filterable() + } + + /// Check if `IS EXISTS` is allowed + pub fn is_filterable_exists(&self) -> bool { + self.is_filterable() + } + + /// Check if `IS NULL` is allowed + pub fn is_filterable_null(&self) -> bool { + self.is_filterable() + } + + /// Create a new `FilterFeatures` with the legacy default features. + /// + /// This is the default behavior for `FilterableAttributesRule::Field`. + /// This will set the equality and comparison to true. + pub fn legacy_default() -> Self { + Self { equality: true, comparison: true } + } + + /// Create a new `FilterFeatures` with no features. + pub fn no_features() -> Self { + Self { equality: false, comparison: false } + } +} + +impl Default for FilterFeatures { + fn default() -> Self { + Self { equality: true, comparison: false } + } +} + +/// Match a field against a set of filterable attributes rules. +/// +/// This function will return the set of patterns that match the given filter. +/// +/// # Arguments +/// +/// * `filterable_attributes` - The set of filterable attributes rules to match against. +/// * `filter` - The filter function to apply to the filterable attributes rules. +pub fn filtered_matching_patterns<'patterns>( + filterable_attributes: &'patterns [FilterableAttributesRule], + filter: &impl Fn(FilterableAttributesFeatures) -> bool, +) -> BTreeSet<&'patterns str> { + let mut result = BTreeSet::new(); + + for rule in filterable_attributes { + if filter(rule.features()) { + match rule { + FilterableAttributesRule::Field(field) => { + result.insert(field.as_str()); + } + FilterableAttributesRule::Pattern(patterns) => { + patterns.attribute_patterns.patterns.iter().for_each(|pattern| { + result.insert(pattern); + }); + } + } + } + } + + result +} + +/// Match a field against a set of filterable attributes rules. +/// +/// This function will return the features that match the given field name. +/// +/// # Arguments +/// +/// * `field_name` - The field name to match against. +/// * `filterable_attributes` - The set of filterable attributes rules to match against. +/// +/// # Returns +/// +/// * `Some((rule_index, features))` - The features of the matching rule and the index of the rule in the `filterable_attributes` array. +/// * `None` - No matching rule was found. +pub fn matching_features( + field_name: &str, + filterable_attributes: &[FilterableAttributesRule], +) -> Option<(usize, FilterableAttributesFeatures)> { + for (id, filterable_attribute) in filterable_attributes.iter().enumerate() { + if filterable_attribute.match_str(field_name) == PatternMatch::Match { + return Some((id, filterable_attribute.features())); + } + } + None +} + +/// Match a field against a set of filterable, facet searchable fields, distinct field, sortable fields, and asc_desc fields. +pub fn match_faceted_field( + field_name: &str, + filterable_fields: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, +) -> PatternMatch { + // Check if the field matches any filterable or facet searchable field + let mut selection = match_pattern_by_features(field_name, filterable_fields, &|features| { + features.is_facet_searchable() || features.is_filterable() + }); + + // If the field matches the pattern, return Match + if selection == PatternMatch::Match { + return selection; + } + + match match_distinct_field(distinct_field.as_deref(), field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), + } + + // Otherwise, check if the field matches any sortable/asc_desc field + for pattern in sortable_fields.iter().chain(asc_desc_fields.iter()) { + match match_field_legacy(pattern, field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), + } + } + + selection +} + +fn match_pattern_by_features( + field_name: &str, + filterable_attributes: &[FilterableAttributesRule], + filter: &impl Fn(FilterableAttributesFeatures) -> bool, +) -> PatternMatch { + let mut selection = PatternMatch::NoMatch; + + // `can_match` becomes false if the field name matches (PatternMatch::Match) any pattern that is not facet searchable or filterable, + // this ensures that the field doesn't match a pattern with a lower priority, however it can still match a pattern for a nested field as a parent (PatternMatch::Parent). + // See the test `search::filters::test_filterable_attributes_priority` for more details. + let mut can_match = true; + + // Check if the field name matches any pattern that is facet searchable or filterable + for pattern in filterable_attributes { + match pattern.match_str(field_name) { + PatternMatch::Match => { + let features = pattern.features(); + if filter(features) && can_match { + return PatternMatch::Match; + } else { + can_match = false; + } + } + PatternMatch::Parent => { + let features = pattern.features(); + if filter(features) { + selection = PatternMatch::Parent; + } + } + PatternMatch::NoMatch => (), + } + } + + selection +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index c748324ae..798cf3073 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1,6 +1,5 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; -use std::convert::TryInto; use std::fs::File; use std::path::Path; @@ -10,10 +9,11 @@ use roaring::RoaringBitmap; use rstar::RTree; use serde::{Deserialize, Serialize}; -use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME}; +use crate::constants::{self, RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, @@ -27,8 +27,9 @@ use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, - FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, ObkvCodec, Result, RoaringBitmapCodec, - RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, + FieldidsWeightsMap, FilterableAttributesRule, GeoPoint, LocalizedAttributesRule, ObkvCodec, + Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, + BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -513,6 +514,16 @@ impl Index { .unwrap_or_default()) } + /// Returns the fields ids map with metadata. + /// + /// This structure is not yet stored in the index, and is generated on the fly. + pub fn fields_ids_map_with_metadata(&self, rtxn: &RoTxn<'_>) -> Result { + Ok(FieldIdMapWithMetadata::new( + self.fields_ids_map(rtxn)?, + MetadataBuilder::from_index(self, rtxn)?, + )) + } + /* fieldids weights map */ // This maps the fields ids to their weights. // Their weights is defined by the ordering of the searchable attributes. @@ -548,6 +559,17 @@ impl Index { self.main.remap_key_type::().delete(wtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY) } + pub fn max_searchable_attribute_weight(&self, rtxn: &RoTxn<'_>) -> Result> { + let user_defined_searchable_fields = self.user_defined_searchable_fields(rtxn)?; + if let Some(user_defined_searchable_fields) = user_defined_searchable_fields { + if !user_defined_searchable_fields.contains(&"*") { + return Ok(Some(user_defined_searchable_fields.len().saturating_sub(1) as Weight)); + } + } + + Ok(None) + } + pub fn searchable_fields_and_weights<'a>( &self, rtxn: &'a RoTxn<'a>, @@ -738,8 +760,7 @@ impl Index { &self, wtxn: &mut RwTxn<'_>, user_fields: &[&str], - non_searchable_fields_ids: &[FieldId], - fields_ids_map: &FieldsIdsMap, + fields_ids_map: &FieldIdMapWithMetadata, ) -> Result<()> { // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; @@ -747,29 +768,17 @@ impl Index { let mut weights = FieldidsWeightsMap::default(); // Now we generate the real searchable fields: - // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. - // 2. Iterate over the user defined searchable fields. - // 3. If a user defined field is a subset of a field defined in the fields_ids_map - // (ie doggo.name is a subset of doggo) right after doggo and with the same weight. let mut real_fields = Vec::new(); - - for (id, field_from_map) in fields_ids_map.iter() { - for (weight, user_field) in user_fields.iter().enumerate() { - if crate::is_faceted_by(field_from_map, user_field) - && !real_fields.contains(&field_from_map) - && !non_searchable_fields_ids.contains(&id) - { - real_fields.push(field_from_map); - - let weight: u16 = - weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; - weights.insert(id, weight); - } + for (id, field_from_map, metadata) in fields_ids_map.iter() { + if let Some(weight) = metadata.searchable_weight() { + real_fields.push(field_from_map); + weights.insert(id, weight); } } self.put_searchable_fields(wtxn, &real_fields)?; self.put_fieldids_weights_map(wtxn, &weights)?; + Ok(()) } @@ -876,26 +885,32 @@ impl Index { /* filterable fields */ - /// Writes the filterable fields names in the database. - pub(crate) fn put_filterable_fields( + /// Writes the filterable attributes rules in the database. + pub(crate) fn put_filterable_attributes_rules( &self, wtxn: &mut RwTxn<'_>, - fields: &HashSet, + fields: &[FilterableAttributesRule], ) -> heed::Result<()> { self.main.remap_types::>().put( wtxn, main_key::FILTERABLE_FIELDS_KEY, - fields, + &fields, ) } - /// Deletes the filterable fields ids in the database. - pub(crate) fn delete_filterable_fields(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { + /// Deletes the filterable attributes rules in the database. + pub(crate) fn delete_filterable_attributes_rules( + &self, + wtxn: &mut RwTxn<'_>, + ) -> heed::Result { self.main.remap_key_type::().delete(wtxn, main_key::FILTERABLE_FIELDS_KEY) } - /// Returns the filterable fields names. - pub fn filterable_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + /// Returns the filterable attributes rules. + pub fn filterable_attributes_rules( + &self, + rtxn: &RoTxn<'_>, + ) -> heed::Result> { Ok(self .main .remap_types::>() @@ -903,21 +918,6 @@ impl Index { .unwrap_or_default()) } - /// Identical to `filterable_fields`, but returns ids instead. - pub fn filterable_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields = self.filterable_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } - - Ok(fields_ids) - } - /* sortable fields */ /// Writes the sortable fields names in the database. @@ -954,83 +954,37 @@ impl Index { Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) } - /* faceted fields */ - - /// Writes the faceted fields in the database. - pub(crate) fn put_faceted_fields( - &self, - wtxn: &mut RwTxn<'_>, - fields: &HashSet, - ) -> heed::Result<()> { - self.main.remap_types::>().put( - wtxn, - main_key::HIDDEN_FACETED_FIELDS_KEY, - fields, - ) + /// Returns true if the geo feature is enabled. + pub fn is_geo_enabled(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_filter = self.is_geo_filtering_enabled(rtxn)?; + let geo_sortable = self.is_geo_sorting_enabled(rtxn)?; + Ok(geo_filter || geo_sortable) } - /// Returns the faceted fields names. - pub fn faceted_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result> { - Ok(self - .main - .remap_types::>() - .get(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? - .unwrap_or_default()) + /// Returns true if the geo sorting feature is enabled. + pub fn is_geo_sorting_enabled(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_sortable = self.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); + Ok(geo_sortable) } - /// Identical to `faceted_fields`, but returns ids instead. - pub fn faceted_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields = self.faceted_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } - - Ok(fields_ids) + /// Returns true if the geo filtering feature is enabled. + pub fn is_geo_filtering_enabled(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_filter = + self.filterable_attributes_rules(rtxn)?.iter().any(|field| field.has_geo()); + Ok(geo_filter) } - /* faceted documents ids */ - - /// Returns the user defined faceted fields names. - /// - /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. - pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn<'_>) -> Result> { - let filterable_fields = self.filterable_fields(rtxn)?; - let sortable_fields = self.sortable_fields(rtxn)?; - let distinct_field = self.distinct_field(rtxn)?; - let asc_desc_fields = - self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { + pub fn asc_desc_fields(&self, rtxn: &RoTxn<'_>) -> Result> { + let asc_desc_fields = self + .criteria(rtxn)? + .into_iter() + .filter_map(|criterion| match criterion { Criterion::Asc(field) | Criterion::Desc(field) => Some(field), _otherwise => None, - }); + }) + .collect(); - let mut faceted_fields = filterable_fields; - faceted_fields.extend(sortable_fields); - faceted_fields.extend(asc_desc_fields); - if let Some(field) = distinct_field { - faceted_fields.insert(field.to_owned()); - } - - Ok(faceted_fields) - } - - /// Identical to `user_defined_faceted_fields`, but returns ids instead. - pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields = self.user_defined_faceted_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } - - Ok(fields_ids) + Ok(asc_desc_fields) } /* faceted documents ids */ @@ -1833,7 +1787,7 @@ pub(crate) mod tests { use big_s::S; use bumpalo::Bump; use heed::{EnvOpenOptions, RwTxn}; - use maplit::{btreemap, hashset}; + use maplit::btreemap; use memmap2::Mmap; use tempfile::TempDir; @@ -1849,7 +1803,8 @@ pub(crate) mod tests { use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::vector::EmbeddingConfigs; use crate::{ - db_snap, obkv_to_json, Filter, Index, Search, SearchResult, ThreadPoolNoAbortBuilder, + db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult, + ThreadPoolNoAbortBuilder, }; pub(crate) struct TempIndex { @@ -2256,7 +2211,7 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); let real = index.searchable_fields(&rtxn).unwrap(); - assert_eq!(real, &["doggo", "name"]); + assert!(real.is_empty()); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); @@ -2284,7 +2239,9 @@ pub(crate) mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME) }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); index @@ -2392,7 +2349,9 @@ pub(crate) mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("doggo") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "doggo".to_string(), + )]); }) .unwrap(); index @@ -2429,15 +2388,14 @@ pub(crate) mod tests { #[test] fn replace_documents_external_ids_and_soft_deletion_check() { - use big_s::S; - use maplit::hashset; - let index = TempIndex::new(); index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("doggo") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "doggo".to_string(), + )]); }) .unwrap(); @@ -2970,8 +2928,9 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_string()); - settings - .set_filterable_fields(HashSet::from([RESERVED_GEO_FIELD_NAME.to_string()])); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); @@ -3005,8 +2964,9 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_string()); - settings - .set_filterable_fields(HashSet::from([RESERVED_GEO_FIELD_NAME.to_string()])); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); @@ -3039,7 +2999,9 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("name")]); - settings.set_filterable_fields(HashSet::from([S("age")])); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "age".to_string(), + )]); }) .unwrap(); @@ -3047,35 +3009,37 @@ pub(crate) mod tests { .add_documents(documents!({ "id": 1, "name": "Many", "age": 28, "realName": "Maxime" })) .unwrap(); db_snap!(index, fields_ids_map, @r###" - 0 name | - 1 id | + 0 id | + 1 name | 2 age | 3 realName | "###); db_snap!(index, searchable_fields, @r###"["name"]"###); db_snap!(index, fieldids_weights_map, @r###" fid weight - 0 0 | + 1 0 | "###); index .update_settings(|settings| { settings.set_searchable_fields(vec![S("name"), S("realName")]); - settings.set_filterable_fields(HashSet::from([S("age")])); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "age".to_string(), + )]); }) .unwrap(); // The order of the field id map shouldn't change db_snap!(index, fields_ids_map, @r###" - 0 name | - 1 id | + 0 id | + 1 name | 2 age | 3 realName | "###); db_snap!(index, searchable_fields, @r###"["name", "realName"]"###); db_snap!(index, fieldids_weights_map, @r###" fid weight - 0 0 | + 1 0 | 3 1 | "###); } @@ -3160,14 +3124,16 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]); - settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field("_vectors".to_string()), + FilterableAttributesRule::Field("_vectors.doggo".to_string()), + ]); }) .unwrap(); db_snap!(index, fields_ids_map, @r###" 0 id | 1 _vectors | - 2 _vectors.doggo | "###); db_snap!(index, searchable_fields, @"[]"); db_snap!(index, fieldids_weights_map, @r###" @@ -3200,7 +3166,6 @@ pub(crate) mod tests { db_snap!(index, fields_ids_map, @r###" 0 id | 1 _vectors | - 2 _vectors.doggo | "###); db_snap!(index, searchable_fields, @"[]"); db_snap!(index, fieldids_weights_map, @r###" diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 1d6d04fc7..85540c82e 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -9,12 +9,14 @@ pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; pub mod documents; mod asc_desc; +mod attribute_patterns; mod criterion; pub mod database_stats; mod error; mod external_documents_ids; pub mod facet; mod fields_ids_map; +mod filterable_attributes_rules; pub mod heed_codec; pub mod index; mod localized_attributes_rules; @@ -52,6 +54,8 @@ pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbor pub use {charabia as tokenizer, heed, rhai}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; +pub use self::attribute_patterns::AttributePatterns; +pub use self::attribute_patterns::PatternMatch; pub use self::criterion::{default_criteria, Criterion, CriterionError}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, @@ -59,6 +63,10 @@ pub use self::error::{ pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fieldids_weights_map::FieldidsWeightsMap; pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap}; +pub use self::filterable_attributes_rules::{ + FilterFeatures, FilterableAttributesFeatures, FilterableAttributesPatterns, + FilterableAttributesRule, +}; pub use self::heed_codec::{ BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, @@ -67,7 +75,6 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::localized_attributes_rules::LocalizedAttributesRule; -use self::localized_attributes_rules::LocalizedFieldIds; pub use self::search::facet::{FacetValueHit, SearchForFacetValues}; pub use self::search::similar::Similar; pub use self::search::{ diff --git a/crates/milli/src/localized_attributes_rules.rs b/crates/milli/src/localized_attributes_rules.rs index 2b9bf099c..81015c458 100644 --- a/crates/milli/src/localized_attributes_rules.rs +++ b/crates/milli/src/localized_attributes_rules.rs @@ -4,8 +4,9 @@ use charabia::Language; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; +use crate::attribute_patterns::PatternMatch; use crate::fields_ids_map::FieldsIdsMap; -use crate::FieldId; +use crate::{AttributePatterns, FieldId}; /// A rule that defines which locales are supported for a given attribute. /// @@ -17,18 +18,18 @@ use crate::FieldId; /// The pattern `*attribute_name*` matches any attribute name that contains `attribute_name`. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] pub struct LocalizedAttributesRule { - pub attribute_patterns: Vec, + pub attribute_patterns: AttributePatterns, #[schema(value_type = Vec)] pub locales: Vec, } impl LocalizedAttributesRule { pub fn new(attribute_patterns: Vec, locales: Vec) -> Self { - Self { attribute_patterns, locales } + Self { attribute_patterns: AttributePatterns::from(attribute_patterns), locales } } - pub fn match_str(&self, str: &str) -> bool { - self.attribute_patterns.iter().any(|pattern| match_pattern(pattern.as_str(), str)) + pub fn match_str(&self, str: &str) -> PatternMatch { + self.attribute_patterns.match_str(str) } pub fn locales(&self) -> &[Language] { @@ -36,20 +37,6 @@ impl LocalizedAttributesRule { } } -fn match_pattern(pattern: &str, str: &str) -> bool { - if pattern == "*" { - true - } else if pattern.starts_with('*') && pattern.ends_with('*') { - str.contains(&pattern[1..pattern.len() - 1]) - } else if let Some(pattern) = pattern.strip_prefix('*') { - str.ends_with(pattern) - } else if let Some(pattern) = pattern.strip_suffix('*') { - str.starts_with(pattern) - } else { - pattern == str - } -} - #[derive(Debug, Clone, PartialEq, Eq)] pub struct LocalizedFieldIds { field_id_to_locales: HashMap>, @@ -65,13 +52,13 @@ impl LocalizedFieldIds { if let Some(rules) = rules { let fields = fields_ids.filter_map(|field_id| { - fields_ids_map.name(field_id).map(|field_name| (field_id, field_name)) + fields_ids_map.name(field_id).map(|field_name: &str| (field_id, field_name)) }); for (field_id, field_name) in fields { let mut locales = Vec::new(); for rule in rules { - if rule.match_str(field_name) { + if rule.match_str(field_name) == PatternMatch::Match { locales.extend(rule.locales.iter()); // Take the first rule that matches break; @@ -89,10 +76,6 @@ impl LocalizedFieldIds { Self { field_id_to_locales } } - pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> { - self.field_id_to_locales.get(&fields_id).map(Vec::as_slice) - } - pub fn all_locales(&self) -> Vec { let mut locales = Vec::new(); for field_locales in self.field_id_to_locales.values() { @@ -108,24 +91,3 @@ impl LocalizedFieldIds { locales } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_match_pattern() { - assert!(match_pattern("*", "test")); - assert!(match_pattern("test*", "test")); - assert!(match_pattern("test*", "testa")); - assert!(match_pattern("*test", "test")); - assert!(match_pattern("*test", "atest")); - assert!(match_pattern("*test*", "test")); - assert!(match_pattern("*test*", "atesta")); - assert!(match_pattern("*test*", "atest")); - assert!(match_pattern("*test*", "testa")); - assert!(!match_pattern("test*test", "test")); - assert!(!match_pattern("*test", "testa")); - assert!(!match_pattern("test*", "atest")); - } -} diff --git a/crates/milli/src/prompt/fields.rs b/crates/milli/src/prompt/fields.rs index ab15c31b0..ffafffd63 100644 --- a/crates/milli/src/prompt/fields.rs +++ b/crates/milli/src/prompt/fields.rs @@ -7,14 +7,14 @@ use liquid::model::{ }; use liquid::{ObjectView, ValueView}; -use super::{FieldMetadata, FieldsIdsMapWithMetadata}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, Metadata}; use crate::GlobalFieldsIdsMap; #[derive(Debug, Clone, Copy)] pub struct FieldValue<'a, D: ObjectView> { name: &'a str, document: &'a D, - metadata: FieldMetadata, + metadata: Metadata, } impl<'a, D: ObjectView> ValueView for FieldValue<'a, D> { @@ -67,7 +67,10 @@ impl<'a, D: ObjectView> FieldValue<'a, D> { } pub fn is_searchable(&self) -> &bool { - &self.metadata.searchable + match self.metadata.is_searchable() { + true => &true, + false => &false, + } } pub fn is_empty(&self) -> bool { @@ -125,15 +128,11 @@ pub struct BorrowedFields<'a, 'map, D: ObjectView> { } impl<'a, D: ObjectView> OwnedFields<'a, D> { - pub fn new(document: &'a D, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { + pub fn new(document: &'a D, field_id_map: &'a FieldIdMapWithMetadata) -> Self { Self( std::iter::repeat(document) .zip(field_id_map.iter()) - .map(|(document, (fid, name))| FieldValue { - document, - name, - metadata: field_id_map.metadata(fid).unwrap_or_default(), - }) + .map(|(document, (_fid, name, metadata))| FieldValue { document, name, metadata }) .collect(), ) } @@ -187,7 +186,7 @@ impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { let fv = self.doc_alloc.alloc(FieldValue { name: self.doc_alloc.alloc_str(&k), document: self.document, - metadata: FieldMetadata { searchable: metadata.searchable }, + metadata, }); fv as _ })) @@ -207,7 +206,7 @@ impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { let fv = self.doc_alloc.alloc(FieldValue { name: self.doc_alloc.alloc_str(&key), document: self.document, - metadata: FieldMetadata { searchable: metadata.searchable }, + metadata, }); Some(fv as _) } diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index 3eb91611e..a5cb8de48 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -5,11 +5,9 @@ mod fields; mod template_checker; use std::cell::RefCell; -use std::collections::BTreeMap; use std::convert::TryFrom; use std::fmt::Debug; use std::num::NonZeroUsize; -use std::ops::Deref; use bumpalo::Bump; use document::ParseableDocument; @@ -18,8 +16,9 @@ use fields::{BorrowedFields, OwnedFields}; use self::context::Context; use self::document::Document; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::del_add::DelAdd; -use crate::{FieldId, FieldsIdsMap, GlobalFieldsIdsMap}; +use crate::GlobalFieldsIdsMap; pub struct Prompt { template: liquid::Template, @@ -145,9 +144,9 @@ impl Prompt { &self, document: &obkv::KvReaderU16, side: DelAdd, - field_id_map: &FieldsIdsMapWithMetadata, + field_id_map: &FieldIdMapWithMetadata, ) -> Result { - let document = Document::new(document, side, field_id_map); + let document = Document::new(document, side, field_id_map.as_fields_ids_map()); let fields = OwnedFields::new(&document, field_id_map); let context = Context::new(&document, &fields); @@ -172,40 +171,6 @@ fn truncate(s: &mut String, max_bytes: usize) { } } -pub struct FieldsIdsMapWithMetadata<'a> { - fields_ids_map: &'a FieldsIdsMap, - metadata: BTreeMap, -} - -impl<'a> FieldsIdsMapWithMetadata<'a> { - pub fn new(fields_ids_map: &'a FieldsIdsMap, searchable_fields_ids: &'_ [FieldId]) -> Self { - let mut metadata: BTreeMap = - fields_ids_map.ids().map(|id| (id, Default::default())).collect(); - for searchable_field_id in searchable_fields_ids { - let Some(metadata) = metadata.get_mut(searchable_field_id) else { continue }; - metadata.searchable = true; - } - Self { fields_ids_map, metadata } - } - - pub fn metadata(&self, field_id: FieldId) -> Option { - self.metadata.get(&field_id).copied() - } -} - -impl<'a> Deref for FieldsIdsMapWithMetadata<'a> { - type Target = FieldsIdsMap; - - fn deref(&self) -> &Self::Target { - self.fields_ids_map - } -} - -#[derive(Debug, Default, Clone, Copy)] -pub struct FieldMetadata { - pub searchable: bool, -} - #[cfg(test)] mod test { use super::Prompt; diff --git a/crates/milli/src/search/facet/facet_distribution.rs b/crates/milli/src/search/facet/facet_distribution.rs index ee0fad535..4b5c1158e 100644 --- a/crates/milli/src/search/facet/facet_distribution.rs +++ b/crates/milli/src/search/facet/facet_distribution.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::fmt::Display; use std::ops::ControlFlow; use std::{fmt, mem}; @@ -9,8 +9,9 @@ use indexmap::IndexMap; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; -use crate::error::UserError; +use crate::attribute_patterns::match_field_legacy; use crate::facet::FacetType; +use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec, }; @@ -18,7 +19,7 @@ use crate::heed_codec::{BytesRefCodec, StrRefCodec}; use crate::search::facet::facet_distribution_iter::{ count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution, }; -use crate::{FieldId, Index, Result}; +use crate::{Error, FieldId, FilterableAttributesRule, Index, PatternMatch, Result, UserError}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -287,37 +288,19 @@ impl<'a> FacetDistribution<'a> { } pub fn compute_stats(&self) -> Result> { - let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let filterable_fields = self.index.filterable_fields(self.rtxn)?; let candidates = if let Some(candidates) = self.candidates.clone() { candidates } else { return Ok(Default::default()); }; - let fields = match &self.facets { - Some(facets) => { - let invalid_fields: HashSet<_> = facets - .iter() - .map(|(name, _)| name) - .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) - .collect(); - if !invalid_fields.is_empty() { - return Err(UserError::InvalidFacetsDistribution { - invalid_facets_name: invalid_fields.into_iter().cloned().collect(), - valid_facets_name: filterable_fields.into_iter().collect(), - } - .into()); - } else { - facets.iter().map(|(name, _)| name).cloned().collect() - } - } - None => filterable_fields, - }; + let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let filterable_attributes_rules = self.index.filterable_attributes_rules(self.rtxn)?; + self.check_faceted_fields(&filterable_attributes_rules)?; let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { - if crate::is_faceted(name, &fields) { + if self.select_field(name, &filterable_attributes_rules) { let min_value = if let Some(min_value) = crate::search::facet::facet_min_value( self.index, self.rtxn, @@ -348,31 +331,12 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let filterable_fields = self.index.filterable_fields(self.rtxn)?; - - let fields = match self.facets { - Some(ref facets) => { - let invalid_fields: HashSet<_> = facets - .iter() - .map(|(name, _)| name) - .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) - .collect(); - if !invalid_fields.is_empty() { - return Err(UserError::InvalidFacetsDistribution { - invalid_facets_name: invalid_fields.into_iter().cloned().collect(), - valid_facets_name: filterable_fields.into_iter().collect(), - } - .into()); - } else { - facets.iter().map(|(name, _)| name).cloned().collect() - } - } - None => filterable_fields, - }; + let filterable_attributes_rules = self.index.filterable_attributes_rules(self.rtxn)?; + self.check_faceted_fields(&filterable_attributes_rules)?; let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { - if crate::is_faceted(name, &fields) { + if self.select_field(name, &filterable_attributes_rules) { let order_by = self .facets .as_ref() @@ -385,6 +349,62 @@ impl<'a> FacetDistribution<'a> { Ok(distribution) } + + /// Select a field if it is filterable and in the facets. + fn select_field( + &self, + name: &str, + filterable_attributes_rules: &[FilterableAttributesRule], + ) -> bool { + // If the field is not filterable, we don't want to compute the facet distribution. + if !matching_features(name, filterable_attributes_rules) + .map_or(false, |(_, features)| features.is_filterable()) + { + return false; + } + + match &self.facets { + Some(facets) => { + // The list of facets provided by the user is a legacy pattern ("dog.age" must be selected with "dog"). + facets.keys().any(|key| match_field_legacy(key, name) == PatternMatch::Match) + } + None => true, + } + } + + /// Check if the fields in the facets are valid filterable fields. + fn check_faceted_fields( + &self, + filterable_attributes_rules: &[FilterableAttributesRule], + ) -> Result<()> { + let mut invalid_facets = BTreeSet::new(); + if let Some(facets) = &self.facets { + for field in facets.keys() { + let is_valid_filterable_field = + matching_features(field, filterable_attributes_rules) + .map_or(false, |(_, features)| features.is_filterable()); + if !is_valid_filterable_field { + invalid_facets.insert(field.to_string()); + } + } + } + + if !invalid_facets.is_empty() { + let valid_patterns = + filtered_matching_patterns(filterable_attributes_rules, &|features| { + features.is_filterable() + }) + .into_iter() + .map(String::from) + .collect(); + return Err(Error::UserError(UserError::InvalidFacetsDistribution { + invalid_facets_name: invalid_facets, + valid_patterns, + })); + } + + Ok(()) + } } impl fmt::Debug for FacetDistribution<'_> { @@ -412,11 +432,10 @@ mod tests { use std::iter; use big_s::S; - use maplit::hashset; use crate::documents::mmap_from_objects; use crate::index::tests::TempIndex; - use crate::{milli_snap, FacetDistribution, OrderBy}; + use crate::{milli_snap, FacetDistribution, FilterableAttributesRule, OrderBy}; #[test] fn few_candidates_few_facet_values() { @@ -426,7 +445,9 @@ mod tests { let index = TempIndex::new(); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let documents = documents!([ @@ -497,7 +518,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"]; @@ -582,7 +605,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::>(); @@ -641,7 +666,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -692,7 +719,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -743,7 +772,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -794,7 +825,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 76f9ed6ff..eb370a757 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; @@ -12,12 +12,14 @@ use serde_json::Value; use super::facet_range_search; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::{Error, UserError}; +use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, OrderedF64Codec, }; use crate::index::db_name::FACET_ID_STRING_DOCIDS; use crate::{ - distance_between_two_points, lat_lng_to_xyz, FieldId, Index, InternalError, Result, + distance_between_two_points, lat_lng_to_xyz, FieldId, FieldsIdsMap, + FilterableAttributesFeatures, FilterableAttributesRule, Index, InternalError, Result, SerializationError, }; @@ -60,7 +62,7 @@ impl Display for BadGeoError { #[derive(Debug)] enum FilterError<'a> { - AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet }, + AttributeNotFilterable { attribute: &'a str, filterable_patterns: BTreeSet<&'a str> }, ParseGeoError(BadGeoError), TooDeep, } @@ -75,14 +77,14 @@ impl<'a> From for FilterError<'a> { impl<'a> Display for FilterError<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::AttributeNotFilterable { attribute, filterable_fields } => { + Self::AttributeNotFilterable { attribute, filterable_patterns } => { write!(f, "Attribute `{attribute}` is not filterable.")?; - if filterable_fields.is_empty() { + if filterable_patterns.is_empty() { write!(f, " This index does not have configured filterable attributes.") } else { - write!(f, " Available filterable attributes are: ")?; + write!(f, " Available filterable attribute patterns are: ")?; let mut filterables_list = - filterable_fields.iter().map(AsRef::as_ref).collect::>(); + filterable_patterns.iter().map(AsRef::as_ref).collect::>(); filterables_list.sort_unstable(); for (idx, filterable) in filterables_list.iter().enumerate() { write!(f, "`{filterable}`")?; @@ -230,17 +232,27 @@ impl<'a> Filter<'a> { impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn<'_>, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let filterable_fields = index.filterable_fields(rtxn)?; + let fields_ids_map = index.fields_ids_map(rtxn)?; + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; for fid in self.condition.fids(MAX_FILTER_DEPTH) { let attribute = fid.value(); - if !crate::is_faceted(attribute, &filterable_fields) { - return Err(fid.as_external_error(FilterError::AttributeNotFilterable { - attribute, - filterable_fields, - }))?; + if matching_features(attribute, &filterable_attributes_rules) + .map_or(false, |(_, features)| features.is_filterable()) + { + continue; } + + // If the field is not filterable, return an error + return Err(fid.as_external_error(FilterError::AttributeNotFilterable { + attribute, + filterable_patterns: filtered_matching_patterns( + &filterable_attributes_rules, + &|features| features.is_filterable(), + ), + }))?; } - self.inner_evaluate(rtxn, index, &filterable_fields, None) + + self.inner_evaluate(rtxn, index, &fields_ids_map, &filterable_attributes_rules, None) } fn evaluate_operator( @@ -249,6 +261,8 @@ impl<'a> Filter<'a> { field_id: FieldId, universe: Option<&RoaringBitmap>, operator: &Condition<'a>, + features: &FilterableAttributesFeatures, + rule_index: usize, ) -> Result { let numbers_db = index.facet_id_f64_docids; let strings_db = index.facet_id_string_docids; @@ -258,6 +272,38 @@ impl<'a> Filter<'a> { // field id and the level. let (left, right) = match operator { + // return an error if the filter is not allowed for this field + Condition::GreaterThan(_) + | Condition::GreaterThanOrEqual(_) + | Condition::LowerThan(_) + | Condition::LowerThanOrEqual(_) + | Condition::Between { .. } + if !features.is_filterable_comparison() => + { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } + Condition::Empty if !features.is_filterable_empty() => { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } + Condition::Null if !features.is_filterable_null() => { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } + Condition::Exists if !features.is_filterable_exists() => { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } + Condition::Equal(_) | Condition::NotEqual(_) if !features.is_filterable_equality() => { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } Condition::GreaterThan(val) => { (Excluded(val.parse_finite_float()?), Included(f64::MAX)) } @@ -307,7 +353,9 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; + let docids = Self::evaluate_operator( + rtxn, index, field_id, None, &operator, features, rule_index, + )?; let all_ids = index.documents_ids(rtxn)?; return Ok(all_ids - docids); } @@ -409,7 +457,8 @@ impl<'a> Filter<'a> { &self, rtxn: &heed::RoTxn<'_>, index: &Index, - filterable_fields: &HashSet, + field_ids_map: &FieldsIdsMap, + filterable_attribute_rules: &[FilterableAttributesRule], universe: Option<&RoaringBitmap>, ) -> Result { if universe.map_or(false, |u| u.is_empty()) { @@ -422,7 +471,8 @@ impl<'a> Filter<'a> { &(f.as_ref().clone()).into(), rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; match universe { @@ -434,42 +484,49 @@ impl<'a> Filter<'a> { } } FilterCondition::In { fid, els } => { - if crate::is_faceted(fid.value(), filterable_fields) { - let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - els.iter() - .map(|el| Condition::Equal(el.clone())) - .map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op)) - .union() - } else { - Ok(RoaringBitmap::new()) - } - } else { - Err(fid.as_external_error(FilterError::AttributeNotFilterable { - attribute: fid.value(), - filterable_fields: filterable_fields.clone(), - }))? - } + let Some(field_id) = field_ids_map.id(fid.value()) else { + return Ok(RoaringBitmap::new()); + }; + let Some((rule_index, features)) = + matching_features(fid.value(), filterable_attribute_rules) + else { + return Ok(RoaringBitmap::new()); + }; + + els.iter() + .map(|el| Condition::Equal(el.clone())) + .map(|op| { + Self::evaluate_operator( + rtxn, index, field_id, universe, &op, &features, rule_index, + ) + }) + .union() } FilterCondition::Condition { fid, op } => { - if crate::is_faceted(fid.value(), filterable_fields) { - let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, universe, op) - } else { - Ok(RoaringBitmap::new()) - } - } else { - Err(fid.as_external_error(FilterError::AttributeNotFilterable { - attribute: fid.value(), - filterable_fields: filterable_fields.clone(), - }))? - } + let Some(field_id) = field_ids_map.id(fid.value()) else { + return Ok(RoaringBitmap::new()); + }; + let Some((rule_index, features)) = + matching_features(fid.value(), filterable_attribute_rules) + else { + return Ok(RoaringBitmap::new()); + }; + + Self::evaluate_operator(rtxn, index, field_id, universe, op, &features, rule_index) } FilterCondition::Or(subfilters) => subfilters .iter() .cloned() - .map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe)) + .map(|f| { + Self::inner_evaluate( + &f.into(), + rtxn, + index, + field_ids_map, + filterable_attribute_rules, + universe, + ) + }) .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); @@ -478,7 +535,8 @@ impl<'a> Filter<'a> { &(first_subfilter.clone()).into(), rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; for f in subfilters_iter { @@ -492,7 +550,8 @@ impl<'a> Filter<'a> { &(f.clone()).into(), rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, Some(&bitmap), )?; } @@ -502,7 +561,7 @@ impl<'a> Filter<'a> { } } FilterCondition::GeoLowerThan { point, radius } => { - if filterable_fields.contains(RESERVED_GEO_FIELD_NAME) { + if index.is_geo_filtering_enabled(rtxn)? { let base_point: [f64; 2] = [point[0].parse_finite_float()?, point[1].parse_finite_float()?]; if !(-90.0..=90.0).contains(&base_point[0]) { @@ -532,12 +591,15 @@ impl<'a> Filter<'a> { } else { Err(point[0].as_external_error(FilterError::AttributeNotFilterable { attribute: RESERVED_GEO_FIELD_NAME, - filterable_fields: filterable_fields.clone(), + filterable_patterns: filtered_matching_patterns( + filterable_attribute_rules, + &|features| features.is_filterable(), + ), }))? } } FilterCondition::GeoBoundingBox { top_right_point, bottom_left_point } => { - if filterable_fields.contains(RESERVED_GEO_FIELD_NAME) { + if index.is_geo_filtering_enabled(rtxn)? { let top_right: [f64; 2] = [ top_right_point[0].parse_finite_float()?, top_right_point[1].parse_finite_float()?, @@ -592,7 +654,8 @@ impl<'a> Filter<'a> { let selected_lat = Filter { condition: condition_lat }.inner_evaluate( rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; @@ -625,7 +688,8 @@ impl<'a> Filter<'a> { let left = Filter { condition: condition_left }.inner_evaluate( rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; @@ -639,7 +703,8 @@ impl<'a> Filter<'a> { let right = Filter { condition: condition_right }.inner_evaluate( rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; @@ -655,7 +720,8 @@ impl<'a> Filter<'a> { Filter { condition: condition_lng }.inner_evaluate( rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )? }; @@ -665,7 +731,10 @@ impl<'a> Filter<'a> { Err(top_right_point[0].as_external_error( FilterError::AttributeNotFilterable { attribute: RESERVED_GEO_FIELD_NAME, - filterable_fields: filterable_fields.clone(), + filterable_patterns: filtered_matching_patterns( + filterable_attribute_rules, + &|features| features.is_filterable(), + ), }, ))? } @@ -674,6 +743,28 @@ impl<'a> Filter<'a> { } } +fn generate_filter_error( + rtxn: &heed::RoTxn<'_>, + index: &Index, + field_id: FieldId, + operator: &Condition<'_>, + features: &FilterableAttributesFeatures, + rule_index: usize, +) -> Error { + match index.fields_ids_map(rtxn) { + Ok(fields_ids_map) => { + let field = fields_ids_map.name(field_id).unwrap_or_default(); + Error::UserError(UserError::FilterOperatorNotAllowed { + field: field.to_string(), + allowed_operators: features.allowed_filter_operators(), + operator: operator.operator().to_string(), + rule_index, + }) + } + Err(e) => e.into(), + } +} + impl<'a> From> for Filter<'a> { fn from(fc: FilterCondition<'a>) -> Self { Self { condition: fc } @@ -687,12 +778,12 @@ mod tests { use big_s::S; use either::Either; - use maplit::hashset; + use meili_snap::snapshot; use roaring::RoaringBitmap; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::index::tests::TempIndex; - use crate::Filter; + use crate::{Filter, FilterableAttributesRule}; #[test] fn empty_db() { @@ -700,7 +791,9 @@ mod tests { //Set the filterable fields to be the channel. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("PrIcE") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "PrIcE".to_string(), + )]); }) .unwrap(); @@ -784,27 +877,32 @@ mod tests { let rtxn = index.read_txn().unwrap(); let filter = Filter::from_str("_geoRadius(42, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. This index does not have configured filterable attributes." - )); + snapshot!(error.to_string(), @r###" + Attribute `_geo` is not filterable. This index does not have configured filterable attributes. + 12:14 _geoRadius(42, 150, 10) + "###); let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. This index does not have configured filterable attributes." - )); + snapshot!(error.to_string(), @r###" + Attribute `_geo` is not filterable. This index does not have configured filterable attributes. + 18:20 _geoBoundingBox([42, 150], [30, 10]) + "###); let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `dog` is not filterable. This index does not have configured filterable attributes." - )); + snapshot!(error.to_string(), @r###" + Attribute `dog` is not filterable. This index does not have configured filterable attributes. + 1:4 dog = "bernese mountain" + "###); drop(rtxn); index .update_settings(|settings| { settings.set_searchable_fields(vec![S("title")]); - settings.set_filterable_fields(hashset! { S("title") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "title".to_string(), + )]); }) .unwrap(); @@ -812,39 +910,45 @@ mod tests { let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `_geo` is not filterable. Available filterable attribute patterns are: `title`. + 12:16 _geoRadius(-100, 150, 10) + "###); let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `_geo` is not filterable. Available filterable attribute patterns are: `title`. + 18:20 _geoBoundingBox([42, 150], [30, 10]) + "###); let filter = Filter::from_str("name = 12").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `name` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `name` is not filterable. Available filterable attribute patterns are: `title`. + 1:5 name = 12 + "###); let filter = Filter::from_str("title = \"test\" AND name = 12").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `name` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `name` is not filterable. Available filterable attribute patterns are: `title`. + 20:24 title = "test" AND name = 12 + "###); let filter = Filter::from_str("title = \"test\" AND name IN [12]").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `name` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `name` is not filterable. Available filterable attribute patterns are: `title`. + 20:24 title = "test" AND name IN [12] + "###); let filter = Filter::from_str("title = \"test\" AND name != 12").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `name` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `name` is not filterable. Available filterable attribute patterns are: `title`. + 20:24 title = "test" AND name != 12 + "###); } #[test] @@ -870,7 +974,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S("monitor_diagonal"))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "monitor_diagonal".to_string(), + )]); }) .unwrap(); @@ -901,7 +1007,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME) }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S( + RESERVED_GEO_FIELD_NAME, + ))]); }) .unwrap(); @@ -948,7 +1056,10 @@ mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S(RESERVED_GEO_FIELD_NAME), S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME), S("price") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesRule::Field("price".to_string()), + ]); }) .unwrap(); @@ -998,7 +1109,10 @@ mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S(RESERVED_GEO_FIELD_NAME), S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME), S("price") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesRule::Field("price".to_string()), + ]); }) .unwrap(); @@ -1108,7 +1222,9 @@ mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S("price") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "price".to_string(), + )]); }) .unwrap(); index @@ -1164,7 +1280,11 @@ mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("id"), S("one"), S("two") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field("id".to_string()), + FilterableAttributesRule::Field("one".to_string()), + FilterableAttributesRule::Field("two".to_string()), + ]); }) .unwrap(); diff --git a/crates/milli/src/search/facet/search.rs b/crates/milli/src/search/facet/search.rs index cdba7ee16..719028a24 100644 --- a/crates/milli/src/search/facet/search.rs +++ b/crates/milli/src/search/facet/search.rs @@ -10,6 +10,7 @@ use roaring::RoaringBitmap; use tracing::error; use crate::error::UserError; +use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::search::build_dfa; use crate::{DocumentId, FieldId, OrderBy, Result, Search}; @@ -73,25 +74,28 @@ impl<'a> SearchForFacetValues<'a> { let index = self.search_query.index; let rtxn = self.search_query.rtxn; - let filterable_fields = index.filterable_fields(rtxn)?; - if !filterable_fields.contains(&self.facet) { - let (valid_fields, hidden_fields) = - index.remove_hidden_fields(rtxn, filterable_fields)?; + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + if !matching_features(&self.facet, &filterable_attributes_rules) + .map_or(false, |(_, features)| features.is_facet_searchable()) + { + let matching_field_names = + filtered_matching_patterns(&filterable_attributes_rules, &|features| { + features.is_facet_searchable() + }); + let (valid_patterns, hidden_fields) = + index.remove_hidden_fields(rtxn, matching_field_names)?; return Err(UserError::InvalidFacetSearchFacetName { field: self.facet.clone(), - valid_fields, + valid_patterns, hidden_fields, } .into()); - } + }; let fields_ids_map = index.fields_ids_map(rtxn)?; - let fid = match fields_ids_map.id(&self.facet) { - Some(fid) => fid, - // we return an empty list of results when the attribute has been - // set as filterable but no document contains this field (yet). - None => return Ok(Vec::new()), + let Some(fid) = fields_ids_map.id(&self.facet) else { + return Ok(Vec::new()); }; let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? { diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index d5b05f515..694a872c4 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -9,6 +9,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult}; +use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::vector::Embedder; use crate::{ @@ -187,13 +188,22 @@ impl<'a> Search<'a> { } if let Some(distinct) = &self.distinct { - let filterable_fields = ctx.index.filterable_fields(ctx.txn)?; - if !crate::is_faceted(distinct, &filterable_fields) { - let (valid_fields, hidden_fields) = - ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?; + let filterable_fields = ctx.index.filterable_attributes_rules(ctx.txn)?; + // check if the distinct field is in the filterable fields + if !matching_features(distinct, &filterable_fields) + .map_or(false, |(_, features)| features.is_filterable()) + { + // if not, remove the hidden fields from the filterable fields to generate the error message + let matching_patterns = + filtered_matching_patterns(&filterable_fields, &|features| { + features.is_filterable() + }); + let (valid_patterns, hidden_fields) = + ctx.index.remove_hidden_fields(ctx.txn, matching_patterns)?; + // and return the error return Err(Error::UserError(UserError::InvalidDistinctAttribute { field: distinct.clone(), - valid_fields, + valid_patterns, hidden_fields, })); } diff --git a/crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs index 67775ddea..5f0c37cc3 100644 --- a/crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -57,6 +57,7 @@ impl RankingRuleGraphTrait for FidGraph { let term = to_term; let mut all_fields = FxHashSet::default(); + let mut current_max_weight = 0; for word in term.term_subset.all_single_words_except_prefix_db(ctx)? { let fields = ctx.get_db_word_fids(word.interned())?; all_fields.extend(fields); @@ -81,6 +82,9 @@ impl RankingRuleGraphTrait for FidGraph { let weight = weights_map .weight(fid) .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; + if weight > current_max_weight { + current_max_weight = weight; + } edges.push(( weight as u32 * term.term_ids.len() as u32, conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }), @@ -88,10 +92,10 @@ impl RankingRuleGraphTrait for FidGraph { } // always lookup the max_fid if we don't already and add an artificial condition for max scoring - let max_weight: Option = weights_map.max_weight(); + let max_weight = ctx.index.max_searchable_attribute_weight(ctx.txn)?; if let Some(max_weight) = max_weight { - if !all_fields.contains(&max_weight) { + if current_max_weight < max_weight { edges.push(( max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. conditions_interner.insert(FidCondition { diff --git a/crates/milli/src/search/new/tests/cutoff.rs b/crates/milli/src/search/new/tests/cutoff.rs index 63b67f2e7..f2dfb45d6 100644 --- a/crates/milli/src/search/new/tests/cutoff.rs +++ b/crates/milli/src/search/new/tests/cutoff.rs @@ -5,13 +5,11 @@ use std::time::Duration; -use big_s::S; -use maplit::hashset; use meili_snap::snapshot; use crate::index::tests::TempIndex; use crate::score_details::{ScoreDetails, ScoringStrategy}; -use crate::{Criterion, Filter, Search, TimeBudget}; +use crate::{Criterion, Filter, FilterableAttributesRule, Search, TimeBudget}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -20,7 +18,7 @@ fn create_index() -> TempIndex { .update_settings(|s| { s.set_primary_key("id".to_owned()); s.set_searchable_fields(vec!["text".to_owned()]); - s.set_filterable_fields(hashset! { S("id") }); + s.set_filterable_fields(vec![FilterableAttributesRule::Field("id".to_owned())]); s.set_criteria(vec![Criterion::Words, Criterion::Typo]); }) .unwrap(); diff --git a/crates/milli/src/search/new/tests/distinct.rs b/crates/milli/src/search/new/tests/distinct.rs index dd27bfc8a..d3c453957 100644 --- a/crates/milli/src/search/new/tests/distinct.rs +++ b/crates/milli/src/search/new/tests/distinct.rs @@ -19,7 +19,10 @@ use maplit::hashset; use super::collect_field_values; use crate::index::tests::TempIndex; -use crate::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + AscDesc, Criterion, FilterableAttributesRule, Index, Member, Search, SearchResult, + TermsMatchingStrategy, +}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -236,7 +239,7 @@ fn test_distinct_placeholder_no_ranking_rules() { // Set the letter as filterable and unset the distinct attribute. index .update_settings(|s| { - s.set_filterable_fields(hashset! { S("letter") }); + s.set_filterable_fields(vec![FilterableAttributesRule::Field("letter".to_owned())]); s.reset_distinct_field(); }) .unwrap(); diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index e60a09ec5..e718eb39d 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -9,7 +9,7 @@ use crate::progress::Progress; use crate::update::new::indexer; use crate::update::{IndexerConfig, Settings}; use crate::vector::EmbeddingConfigs; -use crate::{db_snap, Criterion, Index}; +use crate::{db_snap, Criterion, FilterableAttributesRule, Index}; pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); use crate::constants::RESERVED_GEO_FIELD_NAME; @@ -25,14 +25,14 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.to_vec()); - builder.set_filterable_fields(hashset! { - S("tag"), - S("asc_desc_rank"), - S(RESERVED_GEO_FIELD_NAME), - S("opt1"), - S("opt1.opt2"), - S("tag_in") - }); + builder.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("tag")), + FilterableAttributesRule::Field(S("asc_desc_rank")), + FilterableAttributesRule::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesRule::Field(S("opt1")), + FilterableAttributesRule::Field(S("opt1.opt2")), + FilterableAttributesRule::Field(S("tag_in")), + ]); builder.set_sortable_fields(hashset! { S("tag"), S("asc_desc_rank"), diff --git a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_different_fields.snap b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_different_fields.snap index 2626ee7d4..e55e221a2 100644 --- a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_different_fields.snap +++ b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_different_fields.snap @@ -1,5 +1,5 @@ --- -source: milli/src/search/new/tests/attribute_position.rs +source: crates/milli/src/search/new/tests/attribute_position.rs expression: "format!(\"{document_ids_scores:#?}\")" --- [ diff --git a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_ngrams.snap b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_ngrams.snap index 2626ee7d4..e55e221a2 100644 --- a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_ngrams.snap +++ b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_ngrams.snap @@ -1,5 +1,5 @@ --- -source: milli/src/search/new/tests/attribute_position.rs +source: crates/milli/src/search/new/tests/attribute_position.rs expression: "format!(\"{document_ids_scores:#?}\")" --- [ diff --git a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_repeated.snap b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_repeated.snap index 73dec5f8b..5ae6fafc5 100644 --- a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_repeated.snap +++ b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_repeated.snap @@ -1,5 +1,5 @@ --- -source: milli/src/search/new/tests/attribute_position.rs +source: crates/milli/src/search/new/tests/attribute_position.rs expression: "format!(\"{document_ids_scores:#?}\")" --- [ diff --git a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_simple-2.snap b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_simple-2.snap index 2626ee7d4..e55e221a2 100644 --- a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_simple-2.snap +++ b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_simple-2.snap @@ -1,5 +1,5 @@ --- -source: milli/src/search/new/tests/attribute_position.rs +source: crates/milli/src/search/new/tests/attribute_position.rs expression: "format!(\"{document_ids_scores:#?}\")" --- [ diff --git a/crates/milli/src/snapshot_tests.rs b/crates/milli/src/snapshot_tests.rs index 6635ab2f4..3e58c44d9 100644 --- a/crates/milli/src/snapshot_tests.rs +++ b/crates/milli/src/snapshot_tests.rs @@ -386,7 +386,7 @@ pub fn snap_settings(index: &Index) -> String { write_setting_to_snap!(criteria); write_setting_to_snap!(displayed_fields); write_setting_to_snap!(distinct_field); - write_setting_to_snap!(filterable_fields); + write_setting_to_snap!(filterable_attributes_rules); write_setting_to_snap!(sortable_fields); write_setting_to_snap!(synonyms); write_setting_to_snap!(authorize_typos); diff --git a/crates/milli/src/update/del_add.rs b/crates/milli/src/update/del_add.rs index 97ff86f2a..6825e2bd3 100644 --- a/crates/milli/src/update/del_add.rs +++ b/crates/milli/src/update/del_add.rs @@ -81,6 +81,17 @@ pub enum DelAddOperation { DeletionAndAddition, } +impl DelAddOperation { + /// Merge two DelAddOperation enum variants. + pub fn merge(self, other: Self) -> Self { + match (self, other) { + (Self::Deletion, Self::Deletion) => Self::Deletion, + (Self::Addition, Self::Addition) => Self::Addition, + _ => Self::DeletionAndAddition, + } + } +} + /// Creates a Kv> from two Kv /// /// putting each deletion obkv's keys under an DelAdd::Deletion diff --git a/crates/milli/src/update/facet/bulk.rs b/crates/milli/src/update/facet/bulk.rs index 1ab8740ed..5de0ff4ed 100644 --- a/crates/milli/src/update/facet/bulk.rs +++ b/crates/milli/src/update/facet/bulk.rs @@ -6,7 +6,7 @@ use heed::types::Bytes; use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use super::{clear_facet_levels, FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::facet::FacetType; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, @@ -97,9 +97,7 @@ pub(crate) struct FacetsUpdateBulkInner { impl FacetsUpdateBulkInner { pub fn update(mut self, wtxn: &mut RwTxn<'_>, field_ids: &[u16]) -> Result<()> { self.update_level0(wtxn)?; - for &field_id in field_ids.iter() { - self.clear_levels(wtxn, field_id)?; - } + clear_facet_levels(wtxn, &self.db.remap_data_type(), field_ids)?; for &field_id in field_ids.iter() { let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?; @@ -114,14 +112,6 @@ impl FacetsUpdateBulkInner { Ok(()) } - fn clear_levels(&self, wtxn: &mut heed::RwTxn<'_>, field_id: FieldId) -> Result<()> { - let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; - let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; - let range = left..=right; - self.db.delete_range(wtxn, &range).map(drop)?; - Ok(()) - } - fn update_level0(&mut self, wtxn: &mut RwTxn<'_>) -> Result<()> { let delta_data = match self.delta_data.take() { Some(x) => x, @@ -365,8 +355,6 @@ impl FacetsUpdateBulkInner { mod tests { use std::iter::once; - use big_s::S; - use maplit::hashset; use roaring::RoaringBitmap; use crate::documents::mmap_from_objects; @@ -374,7 +362,7 @@ mod tests { use crate::heed_codec::StrRefCodec; use crate::index::tests::TempIndex; use crate::update::facet::test_helpers::{ordered_string, FacetIndex}; - use crate::{db_snap, milli_snap}; + use crate::{db_snap, milli_snap, FilterableAttributesRule}; #[test] fn insert() { @@ -474,7 +462,8 @@ mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("id") }); + settings + .set_filterable_fields(vec![FilterableAttributesRule::Field("id".to_string())]); }) .unwrap(); diff --git a/crates/milli/src/update/facet/mod.rs b/crates/milli/src/update/facet/mod.rs index dbacf6248..027bb355e 100644 --- a/crates/milli/src/update/facet/mod.rs +++ b/crates/milli/src/update/facet/mod.rs @@ -89,6 +89,7 @@ use time::OffsetDateTime; use tracing::debug; use self::incremental::FacetsUpdateIncremental; +use super::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps}; use crate::facet::FacetType; use crate::heed_codec::facet::{ @@ -147,7 +148,11 @@ impl<'i> FacetsUpdate<'i> { } } - pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> { + pub fn execute( + self, + wtxn: &mut heed::RwTxn<'_>, + new_settings: &InnerIndexSettings, + ) -> Result<()> { if self.data_size == 0 { return Ok(()); } @@ -156,8 +161,7 @@ impl<'i> FacetsUpdate<'i> { // See self::comparison_bench::benchmark_facet_indexing if self.data_size >= (self.database.len(wtxn)? / 500) { - let field_ids = - self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); + let field_ids = facet_levels_field_ids(new_settings); let bulk_update = FacetsUpdateBulk::new( self.index, field_ids, @@ -291,6 +295,53 @@ fn index_facet_search( Ok(()) } +/// Clear all the levels greater than 0 for given field ids. +pub fn clear_facet_levels<'a, I>( + wtxn: &mut heed::RwTxn<'_>, + db: &heed::Database, DecodeIgnore>, + field_ids: I, +) -> Result<()> +where + I: IntoIterator, +{ + for field_id in field_ids { + let field_id = *field_id; + let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let range = left..=right; + db.delete_range(wtxn, &range).map(drop)?; + } + Ok(()) +} + +pub fn clear_facet_levels_based_on_settings_diff( + wtxn: &mut heed::RwTxn<'_>, + index: &Index, + settings_diff: &InnerIndexSettingsDiff, +) -> Result<()> { + let new_field_ids: BTreeSet<_> = facet_levels_field_ids(&settings_diff.new); + let old_field_ids: BTreeSet<_> = facet_levels_field_ids(&settings_diff.old); + + let field_ids_to_clear: Vec<_> = old_field_ids.difference(&new_field_ids).copied().collect(); + clear_facet_levels(wtxn, &index.facet_id_string_docids.remap_types(), &field_ids_to_clear)?; + clear_facet_levels(wtxn, &index.facet_id_f64_docids.remap_types(), &field_ids_to_clear)?; + Ok(()) +} + +fn facet_levels_field_ids(settings: &InnerIndexSettings) -> B +where + B: FromIterator, +{ + settings + .fields_ids_map + .iter_id_metadata() + .filter(|(_, metadata)| { + metadata.require_facet_level_database(&settings.filterable_attributes_rules) + }) + .map(|(id, _)| id) + .collect() +} + #[cfg(test)] pub(crate) mod test_helpers { use std::cell::Cell; diff --git a/crates/milli/src/update/index_documents/enrich.rs b/crates/milli/src/update/index_documents/enrich.rs index c35701961..1f15dd570 100644 --- a/crates/milli/src/update/index_documents/enrich.rs +++ b/crates/milli/src/update/index_documents/enrich.rs @@ -95,12 +95,7 @@ pub fn enrich_documents_batch( // If the settings specifies that a _geo field must be used therefore we must check the // validity of it in all the documents of this batch and this is when we return `Some`. let geo_field_id = match documents_batch_index.id(RESERVED_GEO_FIELD_NAME) { - Some(geo_field_id) - if index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME) - || index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME) => - { - Some(geo_field_id) - } + Some(geo_field_id) if index.is_geo_enabled(rtxn)? => Some(geo_field_id), _otherwise => None, }; diff --git a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 606ae6b54..d502e69cc 100644 --- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -150,9 +150,14 @@ fn searchable_fields_changed( obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, ) -> bool { - let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { - if searchable_fields.contains(&field_id) { + let Some(metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else { + // If the field id is not in the fields ids map, skip it. + // This happens for the vectors sub-fields. for example: + // "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered. + continue; + }; + if metadata.is_searchable() { let del_add = KvReaderDelAdd::from_slice(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. @@ -200,8 +205,14 @@ fn tokens_from_document<'a>( buffers.obkv_buffer.clear(); let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { + let Some(metadata) = settings.fields_ids_map.metadata(field_id) else { + // If the field id is not in the fields ids map, skip it. + // This happens for the vectors sub-fields. for example: + // "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered. + continue; + }; // if field is searchable. - if settings.searchable_fields_ids.contains(&field_id) { + if metadata.is_searchable() { // extract deletion or addition only. if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) { // parse json. @@ -216,7 +227,7 @@ fn tokens_from_document<'a>( buffers.field_buffer.clear(); if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { // create an iterator of token with their positions. - let locales = settings.localized_searchable_fields_ids.locales(field_id); + let locales = metadata.locales(&settings.localized_attributes_rules); let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); diff --git a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index d330ea5a0..5b7639e59 100644 --- a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -12,12 +12,11 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; -use crate::localized_attributes_rules::LocalizedFieldIds; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps, }; -use crate::update::settings::InnerIndexSettingsDiff; +use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -33,13 +32,10 @@ pub fn extract_facet_string_docids( if settings_diff.settings_update_only() { extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) } else { - let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; - let facet_search = settings_diff.new.facet_search; extract_facet_string_docids_document_update( docid_fid_facet_string, indexer, - localized_field_ids, - facet_search, + &settings_diff.new, ) } } @@ -52,8 +48,7 @@ pub fn extract_facet_string_docids( fn extract_facet_string_docids_document_update( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, - localized_field_ids: &LocalizedFieldIds, - facet_search: bool, + settings: &InnerIndexSettings, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -92,6 +87,14 @@ fn extract_facet_string_docids_document_update( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + let Some(metadata) = settings.fields_ids_map.metadata(field_id) else { + unreachable!("metadata not found for field_id: {}", field_id) + }; + + if !metadata.is_faceted(&settings.filterable_attributes_rules) { + continue; + } + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -99,8 +102,10 @@ fn extract_facet_string_docids_document_update( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - if facet_search { - let locales = localized_field_ids.locales(field_id); + let features = + metadata.filterable_attributes_features(&settings.filterable_attributes_rules); + if features.is_facet_searchable() && settings.facet_search { + let locales = metadata.locales(&settings.localized_attributes_rules); let hyper_normalized_value = normalize_facet_string(normalized_value, locales); let set = BTreeSet::from_iter(std::iter::once(normalized_value)); @@ -178,8 +183,15 @@ fn extract_facet_string_docids_settings( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); - let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); + let Some(old_metadata) = settings_diff.old.fields_ids_map.metadata(field_id) else { + unreachable!("old metadata not found for field_id: {}", field_id) + }; + let Some(new_metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else { + unreachable!("new metadata not found for field_id: {}", field_id) + }; + + let old_locales = old_metadata.locales(&settings_diff.old.localized_attributes_rules); + let new_locales = new_metadata.locales(&settings_diff.new.localized_attributes_rules); let are_same_locales = old_locales == new_locales; let reindex_facet_search = @@ -197,10 +209,15 @@ fn extract_facet_string_docids_settings( // Facet search normalization if settings_diff.new.facet_search { + let new_filterable_features = new_metadata + .filterable_attributes_features(&settings_diff.new.filterable_attributes_rules); let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); let old_hyper_normalized_value; + let old_filterable_features = old_metadata + .filterable_attributes_features(&settings_diff.old.filterable_attributes_rules); let old_hyper_normalized_value = if !settings_diff.old.facet_search || deladd_reader.get(DelAdd::Deletion).is_none() + || !old_filterable_features.is_facet_searchable() { // if the facet search is disabled in the old settings or if no facet string is deleted, // we don't need to normalize the facet string. @@ -215,7 +232,9 @@ fn extract_facet_string_docids_settings( let set = BTreeSet::from_iter(std::iter::once(normalized_value)); // if the facet string is the same, we can put the deletion and addition in the same obkv. - if old_hyper_normalized_value == Some(&new_hyper_normalized_value) { + if old_hyper_normalized_value == Some(&new_hyper_normalized_value) + && new_filterable_features.is_facet_searchable() + { // nothing to do if we delete and re-add the value. if is_same_value { continue; @@ -249,7 +268,9 @@ fn extract_facet_string_docids_settings( } // addition - if deladd_reader.get(DelAdd::Addition).is_some() { + if new_filterable_features.is_facet_searchable() + && deladd_reader.get(DelAdd::Addition).is_some() + { // insert new value let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; buffer.clear(); diff --git a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 88c02fe70..de87c5a7c 100644 --- a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -76,9 +76,9 @@ pub fn extract_fid_docid_facet_values( let mut strings_key_buffer = Vec::new(); let old_faceted_fids: BTreeSet<_> = - settings_diff.old.faceted_fields_ids.iter().copied().collect(); + settings_diff.list_faceted_fields_from_fid_map(DelAdd::Deletion); let new_faceted_fids: BTreeSet<_> = - settings_diff.new.faceted_fields_ids.iter().copied().collect(); + settings_diff.list_faceted_fields_from_fid_map(DelAdd::Addition); if !settings_diff.settings_update_only || settings_diff.reindex_facets() { let mut cursor = obkv_documents.into_cursor()?; diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 642cd610a..cb8c121ce 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -15,8 +15,9 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::error::FaultSource; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::index::IndexEmbeddingConfig; -use crate::prompt::{FieldsIdsMapWithMetadata, Prompt}; +use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; @@ -190,12 +191,8 @@ pub fn extract_vector_points( let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; - let old_fields_ids_map = - FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids); let new_fields_ids_map = &settings_diff.new.fields_ids_map; - let new_fields_ids_map = - FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids); // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); @@ -383,7 +380,7 @@ pub fn extract_vector_points( ); continue; } - regenerate_prompt(obkv, prompt, &new_fields_ids_map)? + regenerate_prompt(obkv, prompt, new_fields_ids_map)? } }, // prompt regeneration is only triggered for existing embedders @@ -400,7 +397,7 @@ pub fn extract_vector_points( regenerate_if_prompt_changed( obkv, (old_prompt, prompt), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), )? } else { // we can simply ignore user provided vectors as they are not regenerated and are @@ -416,7 +413,7 @@ pub fn extract_vector_points( prompt, (add_to_user_provided, remove_from_user_provided), (old, new), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), document_id, embedder_name, embedder_is_manual, @@ -486,10 +483,7 @@ fn extract_vector_document_diff( prompt: &Prompt, (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), (old, new): (VectorState, VectorState), - (old_fields_ids_map, new_fields_ids_map): ( - &FieldsIdsMapWithMetadata, - &FieldsIdsMapWithMetadata, - ), + (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), document_id: impl Fn() -> Value, embedder_name: &str, embedder_is_manual: bool, @@ -611,10 +605,7 @@ fn extract_vector_document_diff( fn regenerate_if_prompt_changed( obkv: &obkv::KvReader, (old_prompt, new_prompt): (&Prompt, &Prompt), - (old_fields_ids_map, new_fields_ids_map): ( - &FieldsIdsMapWithMetadata, - &FieldsIdsMapWithMetadata, - ), + (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), ) -> Result { let old_prompt = old_prompt .render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) @@ -630,7 +621,7 @@ fn regenerate_if_prompt_changed( fn regenerate_prompt( obkv: &obkv::KvReader, prompt: &Prompt, - new_fields_ids_map: &FieldsIdsMapWithMetadata, + new_fields_ids_map: &FieldIdMapWithMetadata, ) -> Result { let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 7a62af96f..ae082284a 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -26,6 +26,7 @@ use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk}; pub use self::enrich::{extract_finite_float_from_value, DocumentId}; pub use self::helpers::*; pub use self::transform::{Transform, TransformOutput}; +use super::facet::clear_facet_levels_based_on_settings_diff; use super::new::StdResult; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError}; @@ -215,9 +216,8 @@ where flattened_documents, } = output; - // update the internal facet and searchable list, + // update the searchable list, // because they might have changed due to the nested documents flattening. - settings_diff.new.recompute_facets(self.wtxn, self.index)?; settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); @@ -465,6 +465,11 @@ where } } + // If the settings are only being updated, we may have to clear some of the facet levels. + if settings_diff.settings_update_only() { + clear_facet_levels_based_on_settings_diff(self.wtxn, self.index, &settings_diff)?; + } + Ok(()) }).map_err(InternalError::from)??; @@ -776,7 +781,7 @@ mod tests { use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; - use crate::{all_obkv_to_json, db_snap, Filter, Search, UserError}; + use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError}; #[test] fn simple_document_replacement() { @@ -1006,7 +1011,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); } @@ -1018,7 +1025,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); @@ -1234,16 +1243,17 @@ mod tests { let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; settings.set_searchable_fields(searchable_fields); - let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); + let faceted_fields = vec![ + FilterableAttributesRule::Field("title".to_string()), + FilterableAttributesRule::Field("nested.object".to_string()), + FilterableAttributesRule::Field("nested.machin".to_string()), + ]; settings.set_filterable_fields(faceted_fields); }) .unwrap(); let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin"))); - // testing the simple query search let mut search = crate::Search::new(&rtxn, &index); search.query("document"); @@ -1438,7 +1448,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(String::from("dog"))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "dog".to_string(), + )]); }) .unwrap(); @@ -1457,10 +1469,6 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let hidden = index.faceted_fields(&rtxn).unwrap(); - - assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain"))); - for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { let mut search = crate::Search::new(&rtxn, &index); let filter = format!(r#""dog.race.bernese mountain" = {s}"#); @@ -1478,12 +1486,6 @@ mod tests { db_snap!(index, facet_id_string_docids, @""); db_snap!(index, field_id_docid_facet_strings, @""); - let rtxn = index.read_txn().unwrap(); - - let facets = index.faceted_fields(&rtxn).unwrap(); - - assert_eq!(facets, hashset!()); - // update the settings to test the sortable index .update_settings(|settings| { @@ -1506,10 +1508,6 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - - assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain"))); - let mut search = crate::Search::new(&rtxn, &index); search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S( "dog.race.bernese mountain", @@ -1717,8 +1715,6 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); @@ -1738,7 +1734,7 @@ mod tests { assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![7]); }; - let faceted_fields = hashset!(S("colour")); + let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1823,8 +1819,6 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); @@ -1844,7 +1838,7 @@ mod tests { assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![3]); }; - let faceted_fields = hashset!(S("colour")); + let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1887,8 +1881,6 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue"))); let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap(); let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap(); @@ -1907,7 +1899,7 @@ mod tests { assert_eq!(bitmap_tags_blue.into_iter().collect::>(), vec![12]); }; - let faceted_fields = hashset!(S("tags")); + let faceted_fields = vec![FilterableAttributesRule::Field("tags".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -2259,7 +2251,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("title") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "title".to_string(), + )]); }) .unwrap(); @@ -3117,7 +3111,10 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label"), S("label2") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field("label".to_string()), + FilterableAttributesRule::Field("label2".to_string()), + ]); }) .unwrap(); wtxn.commit().unwrap(); @@ -3296,7 +3293,9 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("id")); - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); settings.set_sortable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); }) .unwrap(); diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index d87524a34..769e86b39 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::fs::File; use std::io::{Read, Seek}; @@ -18,8 +18,10 @@ use super::helpers::{ ObkvsMergeAdditionsAndDeletions, }; use super::{create_writer, IndexDocumentsMethod, IndexerConfig, KeepFirst}; +use crate::attribute_patterns::PatternMatch; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::{db_name, main_key}; use crate::update::del_add::{ into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation, @@ -31,9 +33,7 @@ use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use crate::vector::settings::WriteBackToDocuments; use crate::vector::ArroyWrapper; -use crate::{ - is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, -}; +use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result}; pub struct TransformOutput { pub primary_key: String, @@ -52,7 +52,7 @@ pub struct TransformOutput { /// containing all those documents. pub struct Transform<'a, 'i> { pub index: &'i Index, - fields_ids_map: FieldsIdsMap, + fields_ids_map: FieldIdMapWithMetadata, indexer_settings: &'a IndexerConfig, pub index_documents_method: IndexDocumentsMethod, @@ -84,7 +84,7 @@ pub enum Operation { /// /// If new fields are present in the addition, they are added to the index field ids map. fn create_fields_mapping( - index_field_map: &mut FieldsIdsMap, + index_field_map: &mut FieldIdMapWithMetadata, batch_field_map: &DocumentsBatchIndex, ) -> Result> { batch_field_map @@ -141,10 +141,13 @@ impl<'a, 'i> Transform<'a, 'i> { true, ); let documents_ids = index.documents_ids(wtxn)?; + let fields_ids_map = index.fields_ids_map(wtxn)?; + let builder = MetadataBuilder::from_index(index, wtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); Ok(Transform { index, - fields_ids_map: index.fields_ids_map(wtxn)?, + fields_ids_map, indexer_settings, available_documents_ids: AvailableIds::new(&documents_ids), original_sorter, @@ -354,7 +357,7 @@ impl<'a, 'i> Transform<'a, 'i> { documents_seen: documents_count, }); - self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; + self.index.put_fields_ids_map(wtxn, self.fields_ids_map.as_fields_ids_map())?; self.index.put_primary_key(wtxn, &primary_key)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we @@ -371,7 +374,7 @@ impl<'a, 'i> Transform<'a, 'i> { )] fn flatten_from_fields_ids_map( obkv: &KvReader, - fields_ids_map: &mut FieldsIdsMap, + fields_ids_map: &mut FieldIdMapWithMetadata, ) -> Result>> { if obkv .iter() @@ -657,7 +660,6 @@ impl<'a, 'i> Transform<'a, 'i> { fn rebind_existing_document( old_obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, - modified_faceted_fields: &HashSet, mut injected_vectors: serde_json::Map, old_vectors_fid: Option, original_obkv_buffer: Option<&mut Vec>, @@ -667,23 +669,26 @@ impl<'a, 'i> Transform<'a, 'i> { let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; // If only a faceted field has been added, keep only this field. - let global_facet_settings_changed = settings_diff.global_facet_settings_changed(); let facet_fids_changed = settings_diff.facet_fids_changed(); - let necessary_faceted_field = - |id: FieldId| -> bool { + + let necessary_faceted_field = |id: FieldId| -> Option { + if facet_fids_changed { let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); - if global_facet_settings_changed { - settings_diff.new.user_defined_faceted_fields.iter().any(|long| { - is_faceted_by(long, field_name) || is_faceted_by(field_name, long) - }) - } else if facet_fids_changed { - modified_faceted_fields.iter().any(|long| { - is_faceted_by(long, field_name) || is_faceted_by(field_name, long) - }) - } else { - false + // if the faceted fields changed, we need to keep all the field that are + // faceted in the old or new settings. + match ( + settings_diff.old.match_faceted_field(field_name), + settings_diff.new.match_faceted_field(field_name), + ) { + (PatternMatch::NoMatch, PatternMatch::NoMatch) => None, + (PatternMatch::NoMatch, _) => Some(DelAddOperation::Addition), + (_, PatternMatch::NoMatch) => Some(DelAddOperation::Deletion), + (_, _) => Some(DelAddOperation::DeletionAndAddition), } - }; + } else { + None + } + }; // Alway provide all fields when vectors are involved because // we need the fields for the prompt/templating. @@ -734,12 +739,24 @@ impl<'a, 'i> Transform<'a, 'i> { } } - if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { + if is_primary_key(id) || reindex_vectors { operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; - } else if let Some(operation) = settings_diff.reindex_searchable_id(id) { - operations.insert(id, operation); - obkv_writer.insert(id, val)?; + } else { + let facet_operation = necessary_faceted_field(id); + let searchable_operation = settings_diff.reindex_searchable_id(id); + let operation = match (facet_operation, searchable_operation) { + (Some(facet_operation), Some(searchable_operation)) => { + Some(facet_operation.merge(searchable_operation)) + } + (Some(operation), None) | (None, Some(operation)) => Some(operation), + (None, None) => None, + }; + + if let Some(operation) = operation { + operations.insert(id, operation); + obkv_writer.insert(id, val)?; + } } } if !injected_vectors.is_empty() { @@ -856,7 +873,6 @@ impl<'a, 'i> Transform<'a, 'i> { }; if original_sorter.is_some() || flattened_sorter.is_some() { - let modified_faceted_fields = settings_diff.modified_faceted_fields(); let mut original_obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new(); @@ -897,7 +913,6 @@ impl<'a, 'i> Transform<'a, 'i> { Self::rebind_existing_document( old_obkv, &settings_diff, - &modified_faceted_fields, injected_vectors, old_vectors_fid, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 0809d9601..10dbdc834 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -365,7 +365,7 @@ pub(crate) fn write_typed_chunk_into_index( let merger = builder.build(); let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size); - indexer.execute(wtxn)?; + indexer.execute(wtxn, &settings_diff.new)?; is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(_) => { @@ -401,7 +401,7 @@ pub(crate) fn write_typed_chunk_into_index( Some(normalized_facet_id_string_merger), data_size, ); - indexer.execute(wtxn)?; + indexer.execute(wtxn, &settings_diff.new)?; is_merged_database = true; } TypedChunk::FieldIdFacetExistsDocids(_) => { diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 2de9f384b..38369a4d7 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -4,10 +4,10 @@ use heed::RoTxn; use super::document::{ Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, }; -use super::extract::perm_json_p; use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; +use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::vector::EmbeddingConfigs; use crate::{DocumentId, Index, Result}; @@ -167,13 +167,15 @@ impl<'doc> Update<'doc> { } } - /// Returns whether the updated version of the document is different from the current version for the passed subset of fields. + /// Returns whether the updated version of the document is different from the current version for the subset of fields selected by `selector`. /// - /// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed. + /// `true` if at least one top-level-field that is exactly a selected field or a parent of a selected field changed. /// Otherwise `false`. + /// + /// - Note: `_geo` and `_vectors` are not taken into account by this function. pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( &self, - fields: Option<&[&str]>, + selector: &mut impl FnMut(&str) -> PatternMatch, rtxn: &'t RoTxn, index: &'t Index, mapper: &'t Mapper, @@ -185,7 +187,7 @@ impl<'doc> Update<'doc> { for entry in self.only_changed_fields().iter_top_level_fields() { let (key, updated_value) = entry?; - if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + if selector(key) == PatternMatch::NoMatch { continue; } @@ -229,7 +231,7 @@ impl<'doc> Update<'doc> { for entry in current.iter_top_level_fields() { let (key, _) = entry?; - if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + if selector(key) == PatternMatch::NoMatch { continue; } current_selected_field_count += 1; diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 41b6a12a2..b3aa8f984 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -5,12 +5,13 @@ use std::ops::DerefMut as _; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use hashbrown::HashMap; -use heed::RoTxn; use serde_json::Value; use super::super::cache::BalancedCaches; use super::facet_document::extract_document_facets; use super::FacetKind; +use crate::fields_ids_map::metadata::Metadata; +use crate::filterable_attributes_rules::match_faceted_field; use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; @@ -23,13 +24,17 @@ use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; -use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; +use crate::{DocumentId, FieldId, FilterableAttributesRule, Result, MAX_FACET_VALUE_LENGTH}; pub struct FacetedExtractorData<'a, 'b> { - attributes_to_extract: &'a [&'a str], sender: &'a FieldIdDocidFacetSender<'a, 'b>, grenad_parameters: &'a GrenadParameters, buckets: usize, + filterable_attributes: &'a [FilterableAttributesRule], + sortable_fields: &'a HashSet, + asc_desc_fields: &'a HashSet, + distinct_field: &'a Option, + is_geo_enabled: bool, } impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> { @@ -52,7 +57,11 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> let change = change?; FacetedDocidsExtractor::extract_document_change( context, - self.attributes_to_extract, + self.filterable_attributes, + self.sortable_fields, + self.asc_desc_fields, + self.distinct_field, + self.is_geo_enabled, change, self.sender, )? @@ -64,13 +73,18 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> pub struct FacetedDocidsExtractor; impl FacetedDocidsExtractor { + #[allow(clippy::too_many_arguments)] fn extract_document_change( context: &DocumentChangeContext>, - attributes_to_extract: &[&str], + filterable_attributes: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, + is_geo_enabled: bool, document_change: DocumentChange, sender: &FieldIdDocidFacetSender, ) -> Result<()> { - let index = &context.index; + let index = context.index; let rtxn = &context.rtxn; let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let mut cached_sorter = context.data.borrow_mut_or_yield(); @@ -78,11 +92,15 @@ impl FacetedDocidsExtractor { let docid = document_change.docid(); let res = match document_change { DocumentChange::Deletion(inner) => extract_document_facets( - attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -91,6 +109,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_del, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -98,7 +118,15 @@ impl FacetedDocidsExtractor { ), DocumentChange::Update(inner) => { if !inner.has_changed_for_fields( - Some(attributes_to_extract), + &mut |field_name| { + match_faceted_field( + field_name, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + ) + }, rtxn, index, context.db_fields_ids_map, @@ -107,11 +135,15 @@ impl FacetedDocidsExtractor { } extract_document_facets( - attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -120,6 +152,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_del, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -127,11 +161,15 @@ impl FacetedDocidsExtractor { )?; extract_document_facets( - attributes_to_extract, inner.merged(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -140,6 +178,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_add, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -147,11 +187,15 @@ impl FacetedDocidsExtractor { ) } DocumentChange::Insertion(inner) => extract_document_facets( - attributes_to_extract, inner.inserted(), inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -160,6 +204,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_add, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -180,9 +226,18 @@ impl FacetedDocidsExtractor { facet_fn: impl Fn(&mut DelAddFacetValue<'doc>, FieldId, BVec<'doc, u8>, FacetKind), docid: DocumentId, fid: FieldId, + meta: Metadata, + filterable_attributes: &[FilterableAttributesRule], depth: perm_json_p::Depth, value: &Value, ) -> Result<()> { + // if the field is not faceted, do nothing + if !meta.is_faceted(filterable_attributes) { + return Ok(()); + } + + let features = meta.filterable_attributes_features(filterable_attributes); + let mut buffer = BVec::new_in(doc_alloc); // Exists // key: fid @@ -246,7 +301,9 @@ impl FacetedDocidsExtractor { } // Null // key: fid - Value::Null if depth == perm_json_p::Depth::OnBaseKey => { + Value::Null + if depth == perm_json_p::Depth::OnBaseKey && features.is_filterable_null() => + { buffer.clear(); buffer.push(FacetKind::Null as u8); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -254,19 +311,29 @@ impl FacetedDocidsExtractor { } // Empty // key: fid - Value::Array(a) if a.is_empty() && depth == perm_json_p::Depth::OnBaseKey => { + Value::Array(a) + if a.is_empty() + && depth == perm_json_p::Depth::OnBaseKey + && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } - Value::String(_) if depth == perm_json_p::Depth::OnBaseKey => { + Value::String(_) + if depth == perm_json_p::Depth::OnBaseKey && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } - Value::Object(o) if o.is_empty() && depth == perm_json_p::Depth::OnBaseKey => { + Value::Object(o) + if o.is_empty() + && depth == perm_json_p::Depth::OnBaseKey + && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -276,10 +343,6 @@ impl FacetedDocidsExtractor { _ => Ok(()), } } - - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } } struct DelAddFacetValue<'doc> { @@ -399,9 +462,11 @@ impl FacetedDocidsExtractor { { let index = indexing_context.index; let rtxn = index.read_txn()?; - let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; - let attributes_to_extract: Vec<_> = - attributes_to_extract.iter().map(|s| s.as_ref()).collect(); + let filterable_attributes = index.filterable_attributes_rules(&rtxn)?; + let sortable_fields = index.sortable_fields(&rtxn)?; + let asc_desc_fields = index.asc_desc_fields(&rtxn)?; + let distinct_field = index.distinct_field(&rtxn)?.map(|s| s.to_string()); + let is_geo_enabled = index.is_geo_enabled(&rtxn)?; let datastore = ThreadLocal::new(); { @@ -410,10 +475,14 @@ impl FacetedDocidsExtractor { let _entered = span.enter(); let extractor = FacetedExtractorData { - attributes_to_extract: &attributes_to_extract, grenad_parameters: indexing_context.grenad_parameters, buckets: rayon::current_num_threads(), sender, + filterable_attributes: &filterable_attributes, + sortable_fields: &sortable_fields, + asc_desc_fields: &asc_desc_fields, + distinct_field: &distinct_field, + is_geo_enabled, }; extract( document_changes, diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index 8d582d103..e74131402 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,46 +1,80 @@ +use std::collections::HashSet; + use serde_json::Value; -use crate::constants::RESERVED_GEO_FIELD_NAME; +use crate::attribute_patterns::PatternMatch; +use crate::fields_ids_map::metadata::Metadata; use crate::update::new::document::Document; use crate::update::new::extract::geo::extract_geo_coordinates; use crate::update::new::extract::perm_json_p; -use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; +use crate::{ + FieldId, FilterableAttributesRule, GlobalFieldsIdsMap, InternalError, Result, UserError, +}; +use crate::filterable_attributes_rules::match_faceted_field; + +#[allow(clippy::too_many_arguments)] pub fn extract_document_facets<'doc>( - attributes_to_extract: &[&str], document: impl Document<'doc>, external_document_id: &str, field_id_map: &mut GlobalFieldsIdsMap, - facet_fn: &mut impl FnMut(FieldId, perm_json_p::Depth, &Value) -> Result<()>, + filterable_attributes: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, + is_geo_enabled: bool, + facet_fn: &mut impl FnMut(FieldId, Metadata, perm_json_p::Depth, &Value) -> Result<()>, ) -> Result<()> { + // return the match result for the given field name. + let match_field = |field_name: &str| -> PatternMatch { + match_faceted_field( + field_name, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + ) + }; + + // extract the field if it is faceted (facet searchable, filterable, sortable) + let mut extract_field = |name: &str, depth: perm_json_p::Depth, value: &Value| -> Result<()> { + match field_id_map.id_with_metadata_or_insert(name) { + Some((field_id, meta)) => { + facet_fn(field_id, meta, depth, value)?; + + Ok(()) + } + None => Err(UserError::AttributeLimitReached.into()), + } + }; + for res in document.iter_top_level_fields() { let (field_name, value) = res?; + let selection = match_field(field_name); - let mut tokenize_field = - |name: &str, depth: perm_json_p::Depth, value: &Value| match field_id_map - .id_or_insert(name) - { - Some(field_id) => facet_fn(field_id, depth, value), - None => Err(UserError::AttributeLimitReached.into()), - }; + // extract the field if it matches a pattern and if it is faceted (facet searchable, filterable, sortable) + let mut match_and_extract = |name: &str, depth: perm_json_p::Depth, value: &Value| { + let selection = match_field(name); + if selection == PatternMatch::Match { + extract_field(name, depth, value)?; + } - // if the current field is searchable or contains a searchable attribute - let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]); - if selection != perm_json_p::Selection::Skip { + Ok(selection) + }; + + if selection != PatternMatch::NoMatch { // parse json. match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? { Value::Object(object) => { perm_json_p::seek_leaf_values_in_object( &object, - Some(attributes_to_extract), - &[], // skip no attributes field_name, perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, + &mut match_and_extract, )?; - if selection == perm_json_p::Selection::Select { - tokenize_field( + if selection == PatternMatch::Match { + extract_field( field_name, perm_json_p::Depth::OnBaseKey, &Value::Object(object), @@ -50,36 +84,34 @@ pub fn extract_document_facets<'doc>( Value::Array(array) => { perm_json_p::seek_leaf_values_in_array( &array, - Some(attributes_to_extract), - &[], // skip no attributes field_name, perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, + &mut match_and_extract, )?; - if selection == perm_json_p::Selection::Select { - tokenize_field( + if selection == PatternMatch::Match { + extract_field( field_name, perm_json_p::Depth::OnBaseKey, &Value::Array(array), )?; } } - value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, + value => extract_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, } } } - if attributes_to_extract.contains(&RESERVED_GEO_FIELD_NAME) { + if is_geo_enabled { if let Some(geo_value) = document.geo_field()? { if let Some([lat, lng]) = extract_geo_coordinates(external_document_id, geo_value)? { - let (lat_fid, lng_fid) = field_id_map - .id_or_insert("_geo.lat") - .zip(field_id_map.id_or_insert("_geo.lng")) + let ((lat_fid, lat_meta), (lng_fid, lng_meta)) = field_id_map + .id_with_metadata_or_insert("_geo.lat") + .zip(field_id_map.id_with_metadata_or_insert("_geo.lng")) .ok_or(UserError::AttributeLimitReached)?; - facet_fn(lat_fid, perm_json_p::Depth::OnBaseKey, &lat.into())?; - facet_fn(lng_fid, perm_json_p::Depth::OnBaseKey, &lng.into())?; + facet_fn(lat_fid, lat_meta, perm_json_p::Depth::OnBaseKey, &lat.into())?; + facet_fn(lng_fid, lng_meta, perm_json_p::Depth::OnBaseKey, &lng.into())?; } } } diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index f2af0b229..d51fd9d36 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -9,7 +9,6 @@ use heed::RoTxn; use serde_json::value::RawValue; use serde_json::Value; -use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::GeoError; use crate::update::new::document::Document; use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; @@ -29,9 +28,7 @@ impl GeoExtractor { index: &Index, grenad_parameters: GrenadParameters, ) -> Result> { - let is_sortable = index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); - let is_filterable = index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); - if is_sortable || is_filterable { + if index.is_geo_enabled(rtxn)? { Ok(Some(GeoExtractor { grenad_parameters })) } else { Ok(None) diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index aa0a3d333..a8264ba4a 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -5,7 +5,6 @@ mod geo; mod searchable; mod vectors; -use bumpalo::Bump; pub use cache::{ merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, }; @@ -15,27 +14,11 @@ pub use geo::*; pub use searchable::*; pub use vectors::EmbeddingExtractor; -use super::indexer::document_changes::{DocumentChanges, IndexingContext}; -use super::steps::IndexingStep; -use super::thread_local::{FullySend, ThreadLocal}; -use crate::Result; - -pub trait DocidsExtractor { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, - extractor_allocs: &'extractor mut ThreadLocal>, - step: IndexingStep, - ) -> Result>> - where - MSP: Fn() -> bool + Sync; -} - /// TODO move in permissive json pointer pub mod perm_json_p { use serde_json::{Map, Value}; - use crate::Result; + use crate::{attribute_patterns::PatternMatch, Result}; const SPLIT_SYMBOL: char = '.'; /// Returns `true` if the `selector` match the `key`. @@ -68,11 +51,9 @@ pub mod perm_json_p { pub fn seek_leaf_values_in_object( value: &Map, - selectors: Option<&[&str]>, - skip_selectors: &[&str], base_key: &str, base_depth: Depth, - seeker: &mut impl FnMut(&str, Depth, &Value) -> Result<()>, + seeker: &mut impl FnMut(&str, Depth, &Value) -> Result, ) -> Result<()> { if value.is_empty() { seeker(base_key, base_depth, &Value::Object(Map::with_capacity(0)))?; @@ -85,40 +66,16 @@ pub mod perm_json_p { format!("{}{}{}", base_key, SPLIT_SYMBOL, key) }; - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side - let selection = select_field(&base_key, selectors, skip_selectors); - if selection != Selection::Skip { + let selection = seeker(&base_key, Depth::OnBaseKey, value)?; + if selection != PatternMatch::NoMatch { match value { Value::Object(object) => { - if selection == Selection::Select { - seeker(&base_key, Depth::OnBaseKey, value)?; - } - - seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ) + seek_leaf_values_in_object(object, &base_key, Depth::OnBaseKey, seeker) } Value::Array(array) => { - if selection == Selection::Select { - seeker(&base_key, Depth::OnBaseKey, value)?; - } - - seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ) + seek_leaf_values_in_array(array, &base_key, Depth::OnBaseKey, seeker) } - value => seeker(&base_key, Depth::OnBaseKey, value), + _ => Ok(()), }?; } } @@ -128,11 +85,9 @@ pub mod perm_json_p { pub fn seek_leaf_values_in_array( values: &[Value], - selectors: Option<&[&str]>, - skip_selectors: &[&str], base_key: &str, base_depth: Depth, - seeker: &mut impl FnMut(&str, Depth, &Value) -> Result<()>, + seeker: &mut impl FnMut(&str, Depth, &Value) -> Result, ) -> Result<()> { if values.is_empty() { seeker(base_key, base_depth, &Value::Array(vec![]))?; @@ -140,61 +95,16 @@ pub mod perm_json_p { for value in values { match value { - Value::Object(object) => seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - base_key, - Depth::InsideArray, - seeker, - ), - Value::Array(array) => seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - base_key, - Depth::InsideArray, - seeker, - ), - value => seeker(base_key, Depth::InsideArray, value), + Value::Object(object) => { + seek_leaf_values_in_object(object, base_key, Depth::InsideArray, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, base_key, Depth::InsideArray, seeker) + } + value => seeker(base_key, Depth::InsideArray, value).map(|_| ()), }?; } Ok(()) } - - pub fn select_field( - field_name: &str, - selectors: Option<&[&str]>, - skip_selectors: &[&str], - ) -> Selection { - if skip_selectors.iter().any(|skip_selector| { - contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector) - }) { - Selection::Skip - } else if let Some(selectors) = selectors { - let mut selection = Selection::Skip; - for selector in selectors { - if contained_in(field_name, selector) { - selection = Selection::Select; - break; - } else if contained_in(selector, field_name) { - selection = Selection::Parent; - } - } - selection - } else { - Selection::Select - } - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq)] - pub enum Selection { - /// The field is a parent of the of a nested field that must be selected - Parent, - /// The field must be selected - Select, - /// The field must be skipped - Skip, - } } diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 49259cd64..444c3f7d5 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -5,8 +5,8 @@ use std::ops::DerefMut as _; use bumpalo::collections::vec::Vec as BumpVec; use bumpalo::Bump; -use heed::RoTxn; +use super::match_searchable_field; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; @@ -17,8 +17,7 @@ use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; -use crate::update::GrenadParameters; -use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; const MAX_COUNTED_WORDS: usize = 30; @@ -207,9 +206,10 @@ impl<'extractor> WordDocidsCaches<'extractor> { } pub struct WordDocidsExtractorData<'a> { - tokenizer: &'a DocumentTokenizer<'a>, - grenad_parameters: &'a GrenadParameters, + tokenizer: DocumentTokenizer<'a>, + max_memory_by_thread: Option, buckets: usize, + searchable_attributes: Option>, } impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { @@ -218,7 +218,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in( self.buckets, - self.grenad_parameters.max_memory_by_thread(), + self.max_memory_by_thread, extractor_alloc, )))) } @@ -230,7 +230,12 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { ) -> Result<()> { for change in changes { let change = change?; - WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?; + WordDocidsExtractors::extract_document_change( + context, + &self.tokenizer, + self.searchable_attributes.as_deref(), + change, + )?; } Ok(()) } @@ -248,52 +253,42 @@ impl WordDocidsExtractors { where MSP: Fn() -> bool + Sync, { - let index = indexing_context.index; - let rtxn = index.read_txn()?; - - let stop_words = index.stop_words(&rtxn)?; - let allowed_separators = index.allowed_separators(&rtxn)?; + // Warning: this is duplicated code from extract_word_pair_proximity_docids.rs + let rtxn = indexing_context.index.read_txn()?; + let stop_words = indexing_context.index.stop_words(&rtxn)?; + let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?; let allowed_separators: Option> = allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let dictionary = index.dictionary(&rtxn)?; + let dictionary = indexing_context.index.dictionary(&rtxn)?; let dictionary: Option> = dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let builder = tokenizer_builder( + let mut builder = tokenizer_builder( stop_words.as_ref(), allowed_separators.as_deref(), dictionary.as_deref(), ); - let tokenizer = builder.into_tokenizer(); - - let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; - let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?; + let tokenizer = builder.build(); let localized_attributes_rules = - index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - + indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); let document_tokenizer = DocumentTokenizer { tokenizer: &tokenizer, - attribute_to_extract: attributes_to_extract.as_deref(), - attribute_to_skip: attributes_to_skip.as_slice(), localized_attributes_rules: &localized_attributes_rules, max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - + let extractor_data = WordDocidsExtractorData { + tokenizer: document_tokenizer, + max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(), + buckets: rayon::current_num_threads(), + searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?, + }; let datastore = ThreadLocal::new(); - { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - - let extractor = WordDocidsExtractorData { - tokenizer: &document_tokenizer, - grenad_parameters: indexing_context.grenad_parameters, - buckets: rayon::current_num_threads(), - }; - extract( document_changes, - &extractor, + &extractor_data, indexing_context, extractor_allocs, &datastore, @@ -312,6 +307,7 @@ impl WordDocidsExtractors { fn extract_document_change( context: &DocumentChangeContext>>, document_tokenizer: &DocumentTokenizer, + searchable_attributes: Option<&[&str]>, document_change: DocumentChange, ) -> Result<()> { let index = &context.index; @@ -345,7 +341,9 @@ impl WordDocidsExtractors { } DocumentChange::Update(inner) => { if !inner.has_changed_for_fields( - document_tokenizer.attribute_to_extract, + &mut |field_name: &str| { + match_searchable_field(field_name, searchable_attributes) + }, &context.rtxn, context.index, context.db_fields_ids_map, @@ -408,15 +406,4 @@ impl WordDocidsExtractors { let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc); cached_sorter.flush_fid_word_count(&mut buffer) } - - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(Vec::new()) - } } diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index e58c0efd2..0724b0513 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -2,30 +2,114 @@ use std::cell::RefCell; use std::collections::VecDeque; use std::rc::Rc; -use heed::RoTxn; +use bumpalo::Bump; -use super::tokenize_document::DocumentTokenizer; -use super::SearchableExtractor; +use super::match_searchable_field; +use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::update::new::document::Document; use crate::update::new::extract::cache::BalancedCaches; -use crate::update::new::indexer::document_changes::DocumentChangeContext; +use crate::update::new::indexer::document_changes::{ + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, +}; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::steps::IndexingStep; +use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; -use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; +use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE}; + +pub struct WordPairProximityDocidsExtractorData<'a> { + tokenizer: DocumentTokenizer<'a>, + searchable_attributes: Option>, + max_memory_by_thread: Option, + buckets: usize, +} + +impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<'a> { + type Data = RefCell>; + + fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { + Ok(RefCell::new(BalancedCaches::new_in( + self.buckets, + self.max_memory_by_thread, + extractor_alloc, + ))) + } + + fn process<'doc>( + &self, + changes: impl Iterator>>, + context: &DocumentChangeContext, + ) -> Result<()> { + for change in changes { + let change = change?; + WordPairProximityDocidsExtractor::extract_document_change( + context, + &self.tokenizer, + self.searchable_attributes.as_deref(), + change, + )?; + } + Ok(()) + } +} pub struct WordPairProximityDocidsExtractor; -impl SearchableExtractor for WordPairProximityDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } +impl WordPairProximityDocidsExtractor { + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( + document_changes: &DC, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, + extractor_allocs: &'extractor mut ThreadLocal>, + step: IndexingStep, + ) -> Result>> + where + MSP: Fn() -> bool + Sync, + { + // Warning: this is duplicated code from extract_word_docids.rs + let rtxn = indexing_context.index.read_txn()?; + let stop_words = indexing_context.index.stop_words(&rtxn)?; + let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?; + let allowed_separators: Option> = + allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let dictionary = indexing_context.index.dictionary(&rtxn)?; + let dictionary: Option> = + dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let mut builder = tokenizer_builder( + stop_words.as_ref(), + allowed_separators.as_deref(), + dictionary.as_deref(), + ); + let tokenizer = builder.build(); + let localized_attributes_rules = + indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); + let document_tokenizer = DocumentTokenizer { + tokenizer: &tokenizer, + localized_attributes_rules: &localized_attributes_rules, + max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, + }; + let extractor_data = WordPairProximityDocidsExtractorData { + tokenizer: document_tokenizer, + searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?, + max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(), + buckets: rayon::current_num_threads(), + }; + let datastore = ThreadLocal::new(); + { + let span = + tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); + let _entered = span.enter(); + extract( + document_changes, + &extractor_data, + indexing_context, + extractor_allocs, + &datastore, + step, + )?; + } - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(Vec::new()) + Ok(datastore.into_iter().map(RefCell::into_inner).collect()) } // This method is reimplemented to count the number of words in the document in each field @@ -34,6 +118,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { fn extract_document_change( context: &DocumentChangeContext>, document_tokenizer: &DocumentTokenizer, + searchable_attributes: Option<&[&str]>, document_change: DocumentChange, ) -> Result<()> { let doc_alloc = &context.doc_alloc; @@ -71,7 +156,9 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { } DocumentChange::Update(inner) => { if !inner.has_changed_for_fields( - document_tokenizer.attribute_to_extract, + &mut |field_name: &str| { + match_searchable_field(field_name, searchable_attributes) + }, rtxn, index, context.db_fields_ids_map, diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs index 7c949a3ce..79a6fae87 100644 --- a/crates/milli/src/update/new/extract/searchable/mod.rs +++ b/crates/milli/src/update/new/extract/searchable/mod.rs @@ -2,145 +2,28 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod tokenize_document; -use std::cell::RefCell; -use std::marker::PhantomData; - -use bumpalo::Bump; pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors}; pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; -use heed::RoTxn; -use tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use super::cache::BalancedCaches; -use super::DocidsExtractor; -use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, -}; -use crate::update::new::steps::IndexingStep; -use crate::update::new::thread_local::{FullySend, ThreadLocal}; -use crate::update::new::DocumentChange; -use crate::update::GrenadParameters; -use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::attribute_patterns::{match_field_legacy, PatternMatch}; -pub struct SearchableExtractorData<'a, EX: SearchableExtractor> { - tokenizer: &'a DocumentTokenizer<'a>, - grenad_parameters: &'a GrenadParameters, - buckets: usize, - _ex: PhantomData, -} +pub fn match_searchable_field( + field_name: &str, + searchable_fields: Option<&[&str]>, +) -> PatternMatch { + let Some(searchable_fields) = searchable_fields else { + // If no searchable fields are provided, consider all fields as searchable + return PatternMatch::Match; + }; -impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> - for SearchableExtractorData<'a, EX> -{ - type Data = RefCell>; - - fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { - Ok(RefCell::new(BalancedCaches::new_in( - self.buckets, - self.grenad_parameters.max_memory_by_thread(), - extractor_alloc, - ))) - } - - fn process<'doc>( - &self, - changes: impl Iterator>>, - context: &DocumentChangeContext, - ) -> Result<()> { - for change in changes { - let change = change?; - EX::extract_document_change(context, self.tokenizer, change)?; + let mut selection = PatternMatch::NoMatch; + for pattern in searchable_fields { + match match_field_legacy(pattern, field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), } - Ok(()) - } -} - -pub trait SearchableExtractor: Sized + Sync { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, - extractor_allocs: &'extractor mut ThreadLocal>, - step: IndexingStep, - ) -> Result>> - where - MSP: Fn() -> bool + Sync, - { - let rtxn = indexing_context.index.read_txn()?; - let stop_words = indexing_context.index.stop_words(&rtxn)?; - let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?; - let allowed_separators: Option> = - allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let dictionary = indexing_context.index.dictionary(&rtxn)?; - let dictionary: Option> = - dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let mut builder = tokenizer_builder( - stop_words.as_ref(), - allowed_separators.as_deref(), - dictionary.as_deref(), - ); - let tokenizer = builder.build(); - - let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?; - let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?; - let localized_attributes_rules = - indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - - let document_tokenizer = DocumentTokenizer { - tokenizer: &tokenizer, - attribute_to_extract: attributes_to_extract.as_deref(), - attribute_to_skip: attributes_to_skip.as_slice(), - localized_attributes_rules: &localized_attributes_rules, - max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, - }; - - let extractor_data: SearchableExtractorData = SearchableExtractorData { - tokenizer: &document_tokenizer, - grenad_parameters: indexing_context.grenad_parameters, - buckets: rayon::current_num_threads(), - _ex: PhantomData, - }; - - let datastore = ThreadLocal::new(); - - { - let span = - tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); - let _entered = span.enter(); - extract( - document_changes, - &extractor_data, - indexing_context, - extractor_allocs, - &datastore, - step, - )?; - } - - Ok(datastore.into_iter().map(RefCell::into_inner).collect()) } - fn extract_document_change( - context: &DocumentChangeContext>, - document_tokenizer: &DocumentTokenizer, - document_change: DocumentChange, - ) -> Result<()>; - - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) - -> Result>>; - - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; -} - -impl DocidsExtractor for T { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, - extractor_allocs: &'extractor mut ThreadLocal>, - step: IndexingStep, - ) -> Result>> - where - MSP: Fn() -> bool + Sync, - { - Self::run_extraction(document_changes, indexing_context, extractor_allocs, step) - } + selection } diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index 1c1605b66..dda46f24c 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -3,9 +3,10 @@ use std::collections::HashMap; use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; use serde_json::Value; +use crate::attribute_patterns::PatternMatch; use crate::update::new::document::Document; use crate::update::new::extract::perm_json_p::{ - seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection, + seek_leaf_values_in_array, seek_leaf_values_in_object, Depth, }; use crate::{ FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError, @@ -17,8 +18,6 @@ const MAX_DISTANCE: u32 = 8; pub struct DocumentTokenizer<'a> { pub tokenizer: &'a Tokenizer<'a>, - pub attribute_to_extract: Option<&'a [&'a str]>, - pub attribute_to_skip: &'a [&'a str], pub localized_attributes_rules: &'a [LocalizedAttributesRule], pub max_positions_per_attributes: u32, } @@ -31,87 +30,94 @@ impl<'a> DocumentTokenizer<'a> { token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>, ) -> Result<()> { let mut field_position = HashMap::new(); + let mut tokenize_field = |field_name: &str, _depth, value: &Value| { + let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else { + return Err(UserError::AttributeLimitReached.into()); + }; + + if meta.is_searchable() { + self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?; + } + + // todo: should be a match on the field_name using `match_field_legacy` function, + // but for legacy reasons we iterate over all the fields to fill the field_id_map. + Ok(PatternMatch::Match) + }; for entry in document.iter_top_level_fields() { let (field_name, value) = entry?; - - let mut tokenize_field = |field_name: &str, _depth, value: &Value| { - let Some(field_id) = field_id_map.id_or_insert(field_name) else { - return Err(UserError::AttributeLimitReached.into()); - }; - - if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) - != Selection::Select - { - return Ok(()); - } - - let position = field_position - .entry(field_id) - .and_modify(|counter| *counter += MAX_DISTANCE) - .or_insert(0); - if *position >= self.max_positions_per_attributes { - return Ok(()); - } - - let text; - let tokens = match value { - Value::Number(n) => { - text = n.to_string(); - self.tokenizer.tokenize(text.as_str()) - } - Value::Bool(b) => { - text = b.to_string(); - self.tokenizer.tokenize(text.as_str()) - } - Value::String(text) => { - let locales = self - .localized_attributes_rules - .iter() - .find(|rule| rule.match_str(field_name)) - .map(|rule| rule.locales()); - self.tokenizer.tokenize_with_allow_list(text.as_str(), locales) - } - _ => return Ok(()), - }; - - // create an iterator of token with their positions. - let tokens = process_tokens(*position, tokens) - .take_while(|(p, _)| *p < self.max_positions_per_attributes); - - for (index, token) in tokens { - // keep a word only if it is not empty and fit in a LMDB key. - let token = token.lemma().trim(); - if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { - *position = index; - if let Ok(position) = (*position).try_into() { - token_fn(field_name, field_id, position, token)?; - } - } - } - - Ok(()) - }; - // parse json. match serde_json::to_value(value).map_err(InternalError::SerdeJson)? { Value::Object(object) => seek_leaf_values_in_object( &object, - None, - &[], field_name, Depth::OnBaseKey, &mut tokenize_field, )?, Value::Array(array) => seek_leaf_values_in_array( &array, - None, - &[], field_name, Depth::OnBaseKey, &mut tokenize_field, )?, - value => tokenize_field(field_name, Depth::OnBaseKey, &value)?, + value => { + tokenize_field(field_name, Depth::OnBaseKey, &value)?; + } + } + } + + Ok(()) + } + + fn tokenize_field( + &self, + field_id: FieldId, + field_name: &str, + value: &Value, + token_fn: &mut impl FnMut(&str, u16, u16, &str) -> std::result::Result<(), crate::Error>, + field_position: &mut HashMap, + ) -> Result<()> { + let position = field_position + .entry(field_id) + .and_modify(|counter| *counter += MAX_DISTANCE) + .or_insert(0); + if *position >= self.max_positions_per_attributes { + return Ok(()); + } + + let text; + let tokens = match value { + Value::Number(n) => { + text = n.to_string(); + self.tokenizer.tokenize(text.as_str()) + } + Value::Bool(b) => { + text = b.to_string(); + self.tokenizer.tokenize(text.as_str()) + } + Value::String(text) => { + let locales = self + .localized_attributes_rules + .iter() + .find(|rule| rule.match_str(field_name) == PatternMatch::Match) + .map(|rule| rule.locales()); + self.tokenizer.tokenize_with_allow_list(text.as_str(), locales) + } + _ => return Ok(()), + }; + + // create an iterator of token with their positions. + let tokens = process_tokens(*position, tokens) + .take_while(|(p, _)| *p < self.max_positions_per_attributes); + + for (index, token) in tokens { + // keep a word only if it is not empty and fit in a LMDB key. + let token = token.lemma().trim(); + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { + *position = index; + if let Ok(position) = (*position).try_into() { + token_fn(field_name, field_id, position, token)?; + } } } @@ -215,15 +221,20 @@ mod test { let mut tb = TokenizerBuilder::default(); let document_tokenizer = DocumentTokenizer { tokenizer: &tb.build(), - attribute_to_extract: None, - attribute_to_skip: &["not-me", "me-nether.nope"], localized_attributes_rules: &[], max_positions_per_attributes: 1000, }; let fields_ids_map = FieldIdMapWithMetadata::new( fields_ids_map, - MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None), + MetadataBuilder::new( + Default::default(), + Default::default(), + Default::default(), + None, + None, + Default::default(), + ), ); let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map); @@ -265,6 +276,10 @@ mod test { 2, 16, ]: "catto", + [ + 3, + 0, + ]: "unsearchable", [ 5, 0, @@ -277,6 +292,10 @@ mod test { 8, 0, ]: "23", + [ + 9, + 0, + ]: "unsearchable", } "###); } diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index d1ff6096d..6e9ffa1ed 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -9,12 +9,14 @@ use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn}; use super::fst_merger_builder::FstMergerBuilder; use super::KvReaderDelAdd; +use crate::attribute_patterns::PatternMatch; use crate::heed_codec::facet::FacetGroupKey; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::{create_sorter, MergeDeladdBtreesetString}; use crate::{ - BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result, - MAX_FACET_VALUE_LENGTH, + BEU16StrCodec, FieldId, FieldIdMapMissingEntry, FilterableAttributesFeatures, + FilterableAttributesRule, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule, + Result, MAX_FACET_VALUE_LENGTH, }; pub struct FacetSearchBuilder<'indexer> { @@ -22,6 +24,7 @@ pub struct FacetSearchBuilder<'indexer> { normalized_facet_string_docids_sorter: Sorter, global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, localized_attributes_rules: Vec, + filterable_attributes_rules: Vec, // Buffered data below buffer: Vec, localized_field_ids: HashMap>>, @@ -31,6 +34,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { pub fn new( global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, localized_attributes_rules: Vec, + filterable_attributes_rules: Vec, ) -> Self { let registered_facets = HashMap::new(); let normalized_facet_string_docids_sorter = create_sorter( @@ -49,6 +53,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { buffer: Vec::new(), global_fields_ids_map, localized_attributes_rules, + filterable_attributes_rules, localized_field_ids: HashMap::new(), } } @@ -60,6 +65,13 @@ impl<'indexer> FacetSearchBuilder<'indexer> { ) -> Result<()> { let FacetGroupKey { field_id, level: _level, left_bound } = facet_key; + let filterable_attributes_features = self.filterable_attributes_features(field_id)?; + + // if facet search is disabled, we don't need to register the facet + if !filterable_attributes_features.is_facet_searchable() { + return Ok(()); + }; + if deladd == DelAdd::Addition { self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1); } @@ -83,6 +95,24 @@ impl<'indexer> FacetSearchBuilder<'indexer> { Ok(()) } + fn filterable_attributes_features( + &mut self, + field_id: u16, + ) -> Result { + let Some(filterable_attributes_features) = + self.global_fields_ids_map.metadata(field_id).map(|metadata| { + metadata.filterable_attributes_features(&self.filterable_attributes_rules) + }) + else { + return Err(InternalError::FieldIdMapMissingEntry(FieldIdMapMissingEntry::FieldId { + field_id, + process: "facet_search_builder::register_from_key", + }) + .into()); + }; + Ok(filterable_attributes_features) + } + fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> { if let Entry::Vacant(e) = self.localized_field_ids.entry(field_id) { let Some(field_name) = self.global_fields_ids_map.name(field_id) else { @@ -92,7 +122,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { let locales = self .localized_attributes_rules .iter() - .find(|rule| rule.match_str(field_name)) + .find(|rule| rule.match_str(field_name) == PatternMatch::Match) .map(|rule| rule.locales.clone()); e.insert(locales); diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index f49cd834d..907a4d1df 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -199,7 +199,7 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); let _entered = span.enter(); - ::run_extraction( + WordPairProximityDocidsExtractor::run_extraction( document_changes, indexing_context, extractor_allocs, diff --git a/crates/milli/src/update/new/indexer/post_processing.rs b/crates/milli/src/update/new/indexer/post_processing.rs index 201ab9ec9..2a01fccf3 100644 --- a/crates/milli/src/update/new/indexer/post_processing.rs +++ b/crates/milli/src/update/new/indexer/post_processing.rs @@ -25,7 +25,7 @@ use crate::{GlobalFieldsIdsMap, Index, Result}; pub(super) fn post_process( indexing_context: IndexingContext, wtxn: &mut RwTxn<'_>, - global_fields_ids_map: GlobalFieldsIdsMap<'_>, + mut global_fields_ids_map: GlobalFieldsIdsMap<'_>, facet_field_ids_delta: FacetFieldIdsDelta, ) -> Result<()> where @@ -33,10 +33,8 @@ where { let index = indexing_context.index; indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets); - if index.facet_search(wtxn)? { - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; - } - compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; + compute_facet_level_database(index, wtxn, facet_field_ids_delta, &mut global_fields_ids_map)?; + compute_facet_search_database(index, wtxn, global_fields_ids_map)?; indexing_context.progress.update_progress(IndexingStep::PostProcessingWords); if let Some(prefix_delta) = compute_word_fst(index, wtxn)? { compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?; @@ -116,10 +114,18 @@ fn compute_facet_search_database( global_fields_ids_map: GlobalFieldsIdsMap, ) -> Result<()> { let rtxn = index.read_txn()?; + + // if the facet search is not enabled, we can skip the rest of the function + if !index.facet_search(wtxn)? { + return Ok(()); + } + let localized_attributes_rules = index.localized_attributes_rules(&rtxn)?; + let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?; let mut facet_search_builder = FacetSearchBuilder::new( global_fields_ids_map, localized_attributes_rules.unwrap_or_default(), + filterable_attributes_rules, ); let previous_facet_id_string_docids = index @@ -164,8 +170,19 @@ fn compute_facet_level_database( index: &Index, wtxn: &mut RwTxn, mut facet_field_ids_delta: FacetFieldIdsDelta, + global_fields_ids_map: &mut GlobalFieldsIdsMap, ) -> Result<()> { + let rtxn = index.read_txn()?; + let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?; for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() { + // skip field ids that should not be facet leveled + let Some(metadata) = global_fields_ids_map.metadata(fid) else { + continue; + }; + if !metadata.require_facet_level_database(&filterable_attributes_rules) { + continue; + } + let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string"); let _entered = span.enter(); match delta { diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 723e018a1..a8bd3217f 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -137,7 +137,6 @@ pub(super) fn update_index( index.put_primary_key(wtxn, new_primary_key.name())?; } let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn, Some(embedders))?; - inner_index_settings.recompute_facets(wtxn, index)?; inner_index_settings.recompute_searchables(wtxn, index)?; index.put_field_distribution(wtxn, &field_distribution)?; index.put_documents_ids(wtxn, &document_ids)?; diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 315988e98..571ffe1c6 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -6,17 +6,20 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; -use itertools::{EitherOrBoth, Itertools}; +use itertools::{merge_join_by, EitherOrBoth, Itertools}; use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; -use super::del_add::DelAddOperation; +use super::del_add::{DelAdd, DelAddOperation}; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; -use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; +use crate::attribute_patterns::PatternMatch; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::criterion::Criterion; use crate::error::UserError; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; +use crate::filterable_attributes_rules::match_faceted_field; use crate::index::{ IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, @@ -31,7 +34,7 @@ use crate::vector::settings::{ SubEmbeddingSettings, WriteBackToDocuments, }; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; -use crate::{FieldId, FieldsIdsMap, Index, LocalizedAttributesRule, LocalizedFieldIds, Result}; +use crate::{FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -155,7 +158,7 @@ pub struct Settings<'a, 't, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, - filterable_fields: Setting>, + filterable_fields: Setting>, sortable_fields: Setting>, criteria: Setting>, stop_words: Setting>, @@ -241,8 +244,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.filterable_fields = Setting::Reset; } - pub fn set_filterable_fields(&mut self, names: HashSet) { - self.filterable_fields = Setting::Set(names); + pub fn set_filterable_fields(&mut self, rules: Vec) { + self.filterable_fields = Setting::Set(rules); } pub fn set_sortable_fields(&mut self, names: HashSet) { @@ -516,7 +519,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } /// Updates the index's searchable attributes. - fn update_searchable(&mut self) -> Result { + fn update_user_defined_searchable_attributes(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else @@ -529,26 +532,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { return Ok(false); } - // Since we're updating the settings we can only add new fields at the end of the field id map - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account let names = fields.iter().unique().map(String::as_str).collect::>(); - // Add all the searchable attributes to the field map, and then add the - // remaining fields from the old field map to the new one - for name in names.iter() { - // The fields ids map won't change the field id of already present elements thus only the - // new fields will be inserted. - fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } - - self.index.put_all_searchable_fields_from_fields_ids_map( - self.wtxn, - &names, - &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), - &fields_ids_map, - )?; - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + self.index.put_user_defined_searchable_fields(self.wtxn, &names)?; Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), @@ -760,14 +747,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { - let mut new_facets = HashSet::new(); - for name in fields { - new_facets.insert(name.clone()); - } - self.index.put_filterable_fields(self.wtxn, &new_facets)?; + self.index.put_filterable_attributes_rules(self.wtxn, fields)?; } Setting::Reset => { - self.index.delete_filterable_fields(self.wtxn)?; + self.index.delete_filterable_attributes_rules(self.wtxn)?; } Setting::NotSet => (), } @@ -1257,7 +1240,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_separator_tokens()?; self.update_dictionary()?; self.update_synonyms()?; - self.update_searchable()?; + self.update_user_defined_searchable_attributes()?; self.update_exact_attributes()?; self.update_proximity_precision()?; self.update_prefix_search()?; @@ -1267,7 +1250,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?; - new_inner_settings.recompute_facets(self.wtxn, self.index)?; + new_inner_settings.recompute_searchables(self.wtxn, self.index)?; let primary_key_id = self .index @@ -1319,8 +1302,8 @@ impl InnerIndexSettingsDiff { settings_update_only: bool, ) -> Self { let only_additional_fields = match ( - &old_settings.user_defined_searchable_fields, - &new_settings.user_defined_searchable_fields, + &old_settings.user_defined_searchable_attributes, + &new_settings.user_defined_searchable_attributes, ) { (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * (Some(old), Some(new)) => { @@ -1342,14 +1325,14 @@ impl InnerIndexSettingsDiff { || old_settings.dictionary != new_settings.dictionary || old_settings.proximity_precision != new_settings.proximity_precision || old_settings.prefix_search != new_settings.prefix_search - || old_settings.localized_searchable_fields_ids - != new_settings.localized_searchable_fields_ids + || old_settings.localized_attributes_rules + != new_settings.localized_attributes_rules }; let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; - let cache_user_defined_searchables = old_settings.user_defined_searchable_fields - != new_settings.user_defined_searchable_fields; + let cache_user_defined_searchables = old_settings.user_defined_searchable_attributes + != new_settings.user_defined_searchable_attributes; // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { @@ -1432,30 +1415,70 @@ impl InnerIndexSettingsDiff { } } + /// List the faceted fields from the inner fid map. + /// This is used to list the faceted fields when we are reindexing, + /// but it can't be used in document addition because the field id map must be exhaustive. + pub fn list_faceted_fields_from_fid_map(&self, del_add: DelAdd) -> BTreeSet { + let settings = match del_add { + DelAdd::Deletion => &self.old, + DelAdd::Addition => &self.new, + }; + + settings + .fields_ids_map + .iter_id_metadata() + .filter(|(_, metadata)| metadata.is_faceted(&settings.filterable_attributes_rules)) + .map(|(id, _)| id) + .collect() + } + pub fn facet_fids_changed(&self) -> bool { - let existing_fields = &self.new.existing_fields; - if existing_fields.iter().any(|field| field.contains('.')) { - return true; + for eob in merge_join_by( + self.old.fields_ids_map.iter().filter(|(_, _, metadata)| { + metadata.is_faceted(&self.old.filterable_attributes_rules) + }), + self.new.fields_ids_map.iter().filter(|(_, _, metadata)| { + metadata.is_faceted(&self.new.filterable_attributes_rules) + }), + |(old_fid, _, _), (new_fid, _, _)| old_fid.cmp(new_fid), + ) { + match eob { + // If there is a difference, we need to reindex facet databases. + EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => return true, + // If the field is faceted in both old and new settings, we check the facet-searchable and facet level database. + EitherOrBoth::Both((_, _, old_metadata), (_, _, new_metadata)) => { + // Check if the field is facet-searchable in the old and new settings. + // If there is a difference, we need to reindex facet-search database. + let old_filterable_features = old_metadata + .filterable_attributes_features(&self.old.filterable_attributes_rules); + let new_filterable_features = new_metadata + .filterable_attributes_features(&self.new.filterable_attributes_rules); + let is_old_facet_searchable = + old_filterable_features.is_facet_searchable() && self.old.facet_search; + let is_new_facet_searchable = + new_filterable_features.is_facet_searchable() && self.new.facet_search; + if is_old_facet_searchable != is_new_facet_searchable { + return true; + } + + // Check if the field needs a facet level database in the old and new settings. + // If there is a difference, we need to reindex facet level databases. + let old_facet_level_database = old_metadata + .require_facet_level_database(&self.old.filterable_attributes_rules); + let new_facet_level_database = new_metadata + .require_facet_level_database(&self.new.filterable_attributes_rules); + if old_facet_level_database != new_facet_level_database { + return true; + } + } + } } - let old_faceted_fields = &self.old.user_defined_faceted_fields; - if old_faceted_fields.iter().any(|field| field.contains('.')) { - return true; - } - - // If there is new faceted fields we indicate that we must reindex as we must - // index new fields as facets. It means that the distinct attribute, - // an Asc/Desc criterion or a filtered attribute as be added or removed. - let new_faceted_fields = &self.new.user_defined_faceted_fields; - if new_faceted_fields.iter().any(|field| field.contains('.')) { - return true; - } - - (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) + false } pub fn global_facet_settings_changed(&self) -> bool { - self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids + self.old.localized_attributes_rules != self.new.localized_attributes_rules || self.old.facet_search != self.new.facet_search } @@ -1475,10 +1498,6 @@ impl InnerIndexSettingsDiff { self.old.geo_fields_ids != self.new.geo_fields_ids || (!self.settings_update_only && self.new.geo_fields_ids.is_some()) } - - pub fn modified_faceted_fields(&self) -> HashSet { - &self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields - } } #[derive(Clone)] @@ -1486,20 +1505,17 @@ pub(crate) struct InnerIndexSettings { pub stop_words: Option>>, pub allowed_separators: Option>, pub dictionary: Option>, - pub fields_ids_map: FieldsIdsMap, - pub user_defined_faceted_fields: HashSet, - pub user_defined_searchable_fields: Option>, - pub faceted_fields_ids: HashSet, - pub searchable_fields_ids: Vec, + pub fields_ids_map: FieldIdMapWithMetadata, + pub localized_attributes_rules: Vec, + pub filterable_attributes_rules: Vec, + pub asc_desc_fields: HashSet, + pub distinct_field: Option, + pub user_defined_searchable_attributes: Option>, + pub sortable_fields: HashSet, pub exact_attributes: HashSet, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, - pub existing_fields: HashSet, pub geo_fields_ids: Option<(FieldId, FieldId)>, - pub non_searchable_fields_ids: Vec, - pub non_faceted_fields_ids: Vec, - pub localized_searchable_fields_ids: LocalizedFieldIds, - pub localized_faceted_fields_ids: LocalizedFieldIds, pub prefix_search: PrefixSearch, pub facet_search: bool, } @@ -1515,12 +1531,6 @@ impl InnerIndexSettings { let allowed_separators = index.allowed_separators(rtxn)?; let dictionary = index.dictionary(rtxn)?; let mut fields_ids_map = index.fields_ids_map(rtxn)?; - let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?; - let user_defined_searchable_fields = - user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); - let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = match embedding_configs { @@ -1529,87 +1539,57 @@ impl InnerIndexSettings { }; let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); let facet_search = index.facet_search(rtxn)?; - let existing_fields: HashSet<_> = index - .field_distribution(rtxn)? - .into_iter() - .filter_map(|(field, count)| (count != 0).then_some(field)) - .collect(); - // index.fields_ids_map($a)? ==>> fields_ids_map let geo_fields_ids = match fields_ids_map.id(RESERVED_GEO_FIELD_NAME) { - Some(gfid) => { - let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid); - let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid); + Some(_) if index.is_geo_enabled(rtxn)? => { // if `_geo` is faceted then we get the `lat` and `lng` - if is_sortable || is_filterable { - let field_ids = fields_ids_map - .insert("_geo.lat") - .zip(fields_ids_map.insert("_geo.lng")) - .ok_or(UserError::AttributeLimitReached)?; - Some(field_ids) - } else { - None - } + let field_ids = fields_ids_map + .insert("_geo.lat") + .zip(fields_ids_map.insert("_geo.lng")) + .ok_or(UserError::AttributeLimitReached)?; + Some(field_ids) } - None => None, + _ => None, }; - let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; - let localized_searchable_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &fields_ids_map, - searchable_fields_ids.iter().cloned(), - ); - let localized_faceted_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &fields_ids_map, - faceted_fields_ids.iter().cloned(), - ); - - let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); - searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); - faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + let localized_attributes_rules = + index.localized_attributes_rules(rtxn)?.unwrap_or_default(); + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + let sortable_fields = index.sortable_fields(rtxn)?; + let asc_desc_fields = index.asc_desc_fields(rtxn)?; + let distinct_field = index.distinct_field(rtxn)?.map(|f| f.to_string()); + let user_defined_searchable_attributes = index + .user_defined_searchable_fields(rtxn)? + .map(|fields| fields.into_iter().map(|f| f.to_string()).collect()); + let builder = MetadataBuilder::from_index(index, rtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); Ok(Self { stop_words, allowed_separators, dictionary, fields_ids_map, - user_defined_faceted_fields, - user_defined_searchable_fields, - faceted_fields_ids, - searchable_fields_ids, + localized_attributes_rules, + filterable_attributes_rules, + asc_desc_fields, + distinct_field, + user_defined_searchable_attributes, + sortable_fields, exact_attributes, proximity_precision, embedding_configs, - existing_fields, geo_fields_ids, - non_searchable_fields_ids: vectors_fids.clone(), - non_faceted_fields_ids: vectors_fids.clone(), - localized_searchable_fields_ids, - localized_faceted_fields_ids, prefix_search, facet_search, }) } - // find and insert the new field ids - pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn<'_>, index: &Index) -> Result<()> { - let new_facets = self - .fields_ids_map - .iter() - .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) - .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|(_fid, field)| field.to_string()) - .collect(); - index.put_faceted_fields(wtxn, &new_facets)?; - - self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?; - let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; - self.localized_faceted_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &self.fields_ids_map, - self.faceted_fields_ids.iter().cloned(), - ); - Ok(()) + pub fn match_faceted_field(&self, field: &str) -> PatternMatch { + match_faceted_field( + field, + &self.filterable_attributes_rules, + &self.sortable_fields, + &self.asc_desc_fields, + &self.distinct_field, + ) } // find and insert the new field ids @@ -1619,7 +1599,7 @@ impl InnerIndexSettings { index: &Index, ) -> Result<()> { let searchable_fields = self - .user_defined_searchable_fields + .user_defined_searchable_attributes .as_ref() .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); @@ -1628,17 +1608,9 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, - &self.non_searchable_fields_ids, &self.fields_ids_map, )?; } - self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?; - let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; - self.localized_searchable_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &self.fields_ids_map, - self.searchable_fields_ids.iter().cloned(), - ); Ok(()) } diff --git a/crates/milli/src/update/test_settings.rs b/crates/milli/src/update/test_settings.rs index 1b5992462..00be0476a 100644 --- a/crates/milli/src/update/test_settings.rs +++ b/crates/milli/src/update/test_settings.rs @@ -1,6 +1,6 @@ use big_s::S; use heed::types::Bytes; -use maplit::{btreemap, btreeset, hashset}; +use maplit::{btreemap, btreeset}; use meili_snap::snapshot; use super::*; @@ -210,7 +210,7 @@ fn set_filterable_fields() { // Set the filterable fields to be the age. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("age") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("age"))]); }) .unwrap(); @@ -225,8 +225,6 @@ fn set_filterable_fields() { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("age") }); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. let fidmap = index.fields_ids_map(&rtxn).unwrap(); @@ -268,15 +266,13 @@ fn set_filterable_fields() { // Set the filterable fields to be the age and the name. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("age"), S("name") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("age")), + FilterableAttributesRule::Field(S("name")), + ]); }) .unwrap(); - // Check that the displayed fields are correctly set. - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("age"), S("name") }); - let rtxn = index.read_txn().unwrap(); // Only count the field_id 2 and level 0 facet values. let count = index @@ -300,15 +296,10 @@ fn set_filterable_fields() { // Remove the age from the filterable fields. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("name") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("name"))]); }) .unwrap(); - // Check that the displayed fields are correctly set. - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("name") }); - let rtxn = index.read_txn().unwrap(); // Only count the field_id 2 and level 0 facet values. let count = index @@ -637,7 +628,10 @@ fn setting_searchable_recomputes_other_settings() { index .update_settings(|settings| { settings.set_displayed_fields(vec!["hello".to_string()]); - settings.set_filterable_fields(hashset! { S("age"), S("toto") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("age")), + FilterableAttributesRule::Field(S("toto")), + ]); settings.set_criteria(vec![Criterion::Asc(S("toto"))]); }) .unwrap(); @@ -754,7 +748,7 @@ fn setting_impact_relevancy() { // Set the genres setting index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("genres") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("genres"))]); }) .unwrap(); diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 4d8bf324c..c5a61da9f 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -1,13 +1,12 @@ use big_s::S; use bumpalo::Bump; use heed::EnvOpenOptions; -use maplit::hashset; use milli::documents::mmap_from_objects; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; -use milli::{FacetDistribution, Index, Object, OrderBy}; +use milli::{FacetDistribution, FilterableAttributesRule, Index, Object, OrderBy}; use serde_json::{from_value, json}; #[test] @@ -21,10 +20,10 @@ fn test_facet_distribution_with_no_facet_values() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_filterable_fields(hashset! { - S("genres"), - S("tags"), - }); + builder.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("genres")), + FilterableAttributesRule::Field(S("tags")), + ]); builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 337a4c88c..72b124219 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -11,7 +11,9 @@ use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; -use milli::{AscDesc, Criterion, DocumentId, Index, Member, TermsMatchingStrategy}; +use milli::{ + AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, TermsMatchingStrategy, +}; use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; @@ -42,14 +44,14 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.to_vec()); - builder.set_filterable_fields(hashset! { - S("tag"), - S("asc_desc_rank"), - S("_geo"), - S("opt1"), - S("opt1.opt2"), - S("tag_in") - }); + builder.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("tag")), + FilterableAttributesRule::Field(S("asc_desc_rank")), + FilterableAttributesRule::Field(S("_geo")), + FilterableAttributesRule::Field(S("opt1")), + FilterableAttributesRule::Field(S("opt1.opt2")), + FilterableAttributesRule::Field(S("tag_in")), + ]); builder.set_sortable_fields(hashset! { S("tag"), S("asc_desc_rank"),