This commit is contained in:
ManyTheFish 2025-01-16 10:47:40 +01:00
parent e568dbbabb
commit eaf3be1702
9 changed files with 203 additions and 90 deletions

View File

@ -11,7 +11,9 @@ use fst::IntoStreamer;
use milli::index::{IndexEmbeddingConfig, PrefixSearch};
use milli::proximity::ProximityPrecision;
use milli::update::Setting;
use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET};
use milli::{
Criterion, CriterionError, FilterableAttributesSettings, Index, DEFAULT_VALUES_PER_FACET,
};
use serde::{Deserialize, Serialize, Serializer};
use utoipa::ToSchema;
@ -202,8 +204,8 @@ pub struct Settings<T> {
/// Attributes to use for faceting and filtering. See [Filtering and Faceted Search](https://www.meilisearch.com/docs/learn/filtering_and_sorting/search_with_facet_filters).
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsFilterableAttributes>)]
#[schema(value_type = Option<Vec<String>>, example = json!(["release_date", "genre"]))]
pub filterable_attributes: Setting<BTreeSet<String>>,
#[schema(value_type = Option<Vec<FilterableAttributesSettings>>, example = json!(["release_date", "genre"]))]
pub filterable_attributes: Setting<Vec<FilterableAttributesSettings>>,
/// Attributes to use when sorting search results.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsSortableAttributes>)]

View File

@ -0,0 +1,58 @@
use deserr::Deserr;
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Deserr, ToSchema)]
#[repr(transparent)]
#[serde(transparent)]
pub struct AttributePatterns {
#[schema(value_type = Vec<String>)]
pub patterns: Vec<String>,
}
impl From<Vec<String>> for AttributePatterns {
fn from(patterns: Vec<String>) -> Self {
Self { patterns }
}
}
impl AttributePatterns {
pub fn match_str(&self, str: &str) -> bool {
self.patterns.iter().any(|pattern| match_pattern(pattern, str))
}
}
fn match_pattern(pattern: &str, str: &str) -> bool {
if pattern == "*" {
true
} else if pattern.starts_with('*') && pattern.ends_with('*') {
str.contains(&pattern[1..pattern.len() - 1])
} else if let Some(pattern) = pattern.strip_prefix('*') {
str.ends_with(pattern)
} else if let Some(pattern) = pattern.strip_suffix('*') {
str.starts_with(pattern)
} else {
pattern == str
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_match_pattern() {
assert!(match_pattern("*", "test"));
assert!(match_pattern("test*", "test"));
assert!(match_pattern("test*", "testa"));
assert!(match_pattern("*test", "test"));
assert!(match_pattern("*test", "atest"));
assert!(match_pattern("*test*", "test"));
assert!(match_pattern("*test*", "atesta"));
assert!(match_pattern("*test*", "atest"));
assert!(match_pattern("*test*", "testa"));
assert!(!match_pattern("test*test", "test"));
assert!(!match_pattern("*test", "testa"));
assert!(!match_pattern("test*", "atest"));
}
}

View File

@ -5,14 +5,14 @@ use charabia::Language;
use heed::RoTxn;
use super::FieldsIdsMap;
use crate::{FieldId, Index, LocalizedAttributesRule, Result};
use crate::{FieldId, FilterableAttributesSettings, Index, LocalizedAttributesRule, Result};
#[derive(Debug, Clone, Copy)]
pub struct Metadata {
pub searchable: bool,
pub filterable: bool,
pub sortable: bool,
localized_attributes_rule_id: Option<NonZeroU16>,
filterable_attributes_rule_id: Option<NonZeroU16>,
}
#[derive(Debug, Clone)]
@ -111,7 +111,7 @@ impl Metadata {
#[derive(Debug, Clone)]
pub struct MetadataBuilder {
searchable_attributes: Vec<String>,
filterable_attributes: HashSet<String>,
filterable_attributes: Vec<FilterableAttributesSettings>,
sortable_attributes: HashSet<String>,
localized_attributes: Option<Vec<LocalizedAttributesRule>>,
}
@ -134,7 +134,7 @@ impl MetadataBuilder {
pub fn new(
searchable_attributes: Vec<String>,
filterable_attributes: HashSet<String>,
filterable_attributes: Vec<FilterableAttributesSettings>,
sortable_attributes: HashSet<String>,
localized_attributes: Option<Vec<LocalizedAttributesRule>>,
) -> Self {
@ -152,8 +152,6 @@ impl MetadataBuilder {
.iter()
.any(|attribute| attribute == "*" || attribute == field);
let filterable = self.filterable_attributes.contains(field);
let sortable = self.sortable_attributes.contains(field);
let localized_attributes_rule_id = self
@ -164,7 +162,24 @@ impl MetadataBuilder {
// saturating_add(1): make `id` `NonZero`
.map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap());
Metadata { searchable, filterable, sortable, localized_attributes_rule_id }
let filterable_attributes_rule_id = self
.filterable_attributes
.iter()
.position(|attribute| match attribute {
FilterableAttributesSettings::Field(field_name) => field_name == field,
FilterableAttributesSettings::Pattern(patterns) => {
patterns.patterns.match_str(field)
}
})
// saturating_add(1): make `id` `NonZero`
.map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap());
Metadata {
searchable,
sortable,
localized_attributes_rule_id,
filterable_attributes_rule_id,
}
}
pub fn searchable_attributes(&self) -> &[String] {
@ -175,7 +190,7 @@ impl MetadataBuilder {
&self.sortable_attributes
}
pub fn filterable_attributes(&self) -> &HashSet<String> {
pub fn filterable_attributes(&self) -> &[FilterableAttributesSettings] {
&self.filterable_attributes
}

View File

@ -0,0 +1,75 @@
use deserr::{DeserializeError, Deserr, ValuePointerRef};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use utoipa::ToSchema;
use crate::{
constants::RESERVED_GEO_FIELD_NAME, is_faceted_by, AttributePatterns, FieldId, FieldsIdsMap,
};
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, ToSchema)]
#[serde(untagged)]
pub enum FilterableAttributesSettings {
Field(String),
Pattern(FilterableAttributesPatterns),
}
impl FilterableAttributesSettings {
pub fn match_str(&self, field: &str) -> bool {
match self {
FilterableAttributesSettings::Field(field_name) => is_faceted_by(field, field_name),
FilterableAttributesSettings::Pattern(patterns) => patterns.patterns.match_str(field),
}
}
pub fn has_geo(&self) -> bool {
/// TODO: This is a temporary solution to check if the geo field is activated.
matches!(self, FilterableAttributesSettings::Field(field_name) if field_name == RESERVED_GEO_FIELD_NAME)
}
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(rename_all = camelCase, deny_unknown_fields)]
pub struct FilterableAttributesPatterns {
pub patterns: AttributePatterns,
pub features: Option<FilterableAttributesFeatures>,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(rename_all = camelCase, deny_unknown_fields)]
pub struct FilterableAttributesFeatures {
facet_search: Option<String>,
filter: Option<String>,
}
impl<E: DeserializeError> Deserr<E> for FilterableAttributesSettings {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: ValuePointerRef,
) -> Result<Self, E> {
if value.kind() == deserr::ValueKind::Map {
Ok(Self::Pattern(FilterableAttributesPatterns::deserialize_from_value(
value, location,
)?))
} else {
Ok(Self::Field(String::deserialize_from_value(value, location)?))
}
}
}
pub fn matching_field_ids(
filterable_attributes: &[FilterableAttributesSettings],
fields_ids_map: &FieldsIdsMap,
) -> HashSet<FieldId> {
let mut result = HashSet::new();
for (field_id, field_name) in fields_ids_map.iter() {
for filterable_attribute in filterable_attributes {
if filterable_attribute.match_str(field_name) {
result.insert(field_id);
}
}
}
result
}

View File

@ -10,10 +10,11 @@ use roaring::RoaringBitmap;
use rstar::RTree;
use serde::{Deserialize, Serialize};
use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::documents::PrimaryKey;
use crate::error::{InternalError, UserError};
use crate::fields_ids_map::FieldsIdsMap;
use crate::filterable_fields::matching_field_ids;
use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FieldIdCodec, OrderedF64Codec,
@ -25,8 +26,9 @@ use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, ObkvCodec, Result, RoaringBitmapCodec,
RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
FieldidsWeightsMap, FilterableAttributesSettings, GeoPoint, LocalizedAttributesRule, ObkvCodec,
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32,
BEU64,
};
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@ -787,7 +789,7 @@ impl Index {
pub(crate) fn put_filterable_fields(
&self,
wtxn: &mut RwTxn<'_>,
fields: &HashSet<String>,
fields: &Vec<FilterableAttributesSettings>,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<_>>().put(
wtxn,
@ -802,7 +804,10 @@ impl Index {
}
/// Returns the filterable fields names.
pub fn filterable_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result<HashSet<String>> {
pub fn filterable_fields(
&self,
rtxn: &RoTxn<'_>,
) -> heed::Result<Vec<FilterableAttributesSettings>> {
Ok(self
.main
.remap_types::<Str, SerdeJson<_>>()
@ -815,14 +820,9 @@ impl Index {
let fields = self.filterable_fields(rtxn)?;
let fields_ids_map = self.fields_ids_map(rtxn)?;
let mut fields_ids = HashSet::new();
for name in fields {
if let Some(field_id) = fields_ids_map.id(&name) {
fields_ids.insert(field_id);
}
}
let matching_field_ids = matching_field_ids(&fields, &fields_ids_map);
Ok(fields_ids)
Ok(matching_field_ids)
}
/* sortable fields */
@ -876,6 +876,13 @@ impl Index {
)
}
/// Returns true if the geo feature is activated.
pub fn is_geo_activated(&self, rtxn: &RoTxn<'_>) -> Result<bool> {
let geo_filter = self.filterable_fields(rtxn)?.iter().any(|field| field.has_geo());
let geo_sortable = self.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME);
Ok(geo_filter || geo_sortable)
}
/// Returns the faceted fields names.
pub fn faceted_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result<HashSet<String>> {
Ok(self
@ -906,7 +913,8 @@ impl Index {
///
/// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields.
pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn<'_>) -> Result<HashSet<String>> {
let filterable_fields = self.filterable_fields(rtxn)?;
let fields_ids_map = self.fields_ids_map(rtxn)?;
let filterable_fields = self.filterable_fields_ids(rtxn)?;
let sortable_fields = self.sortable_fields(rtxn)?;
let distinct_field = self.distinct_field(rtxn)?;
let asc_desc_fields =
@ -915,7 +923,14 @@ impl Index {
_otherwise => None,
});
let mut faceted_fields = filterable_fields;
let mut faceted_fields: HashSet<_> = filterable_fields
.into_iter()
.filter_map(|field_id| {
let field_name = fields_ids_map.name(field_id);
debug_assert!(field_name.is_some(), "field name not found for {field_id}");
field_name.map(|field| field.to_string())
})
.collect();
faceted_fields.extend(sortable_fields);
faceted_fields.extend(asc_desc_fields);
if let Some(field) = distinct_field {
@ -925,21 +940,6 @@ impl Index {
Ok(faceted_fields)
}
/// Identical to `user_defined_faceted_fields`, but returns ids instead.
pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result<HashSet<FieldId>> {
let fields = self.user_defined_faceted_fields(rtxn)?;
let fields_ids_map = self.fields_ids_map(rtxn)?;
let mut fields_ids = HashSet::new();
for name in fields {
if let Some(field_id) = fields_ids_map.id(&name) {
fields_ids.insert(field_id);
}
}
Ok(fields_ids)
}
/* faceted documents ids */
/// Retrieve all the documents which contain this field id set as null

View File

@ -9,11 +9,13 @@ pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
pub mod documents;
mod asc_desc;
mod attribute_patterns;
mod criterion;
mod error;
mod external_documents_ids;
pub mod facet;
mod fields_ids_map;
mod filterable_fields;
pub mod heed_codec;
pub mod index;
mod localized_attributes_rules;
@ -51,6 +53,7 @@ pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbor
pub use {charabia as tokenizer, heed, rhai};
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::attribute_patterns::AttributePatterns;
pub use self::criterion::{default_criteria, Criterion, CriterionError};
pub use self::error::{
Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
@ -58,6 +61,9 @@ pub use self::error::{
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fieldids_weights_map::FieldidsWeightsMap;
pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
pub use self::filterable_fields::{
FilterableAttributesFeatures, FilterableAttributesPatterns, FilterableAttributesSettings,
};
pub use self::heed_codec::{
BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,

View File

@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
use crate::fields_ids_map::FieldsIdsMap;
use crate::FieldId;
use crate::{AttributePatterns, FieldId};
/// A rule that defines which locales are supported for a given attribute.
///
@ -17,18 +17,18 @@ use crate::FieldId;
/// The pattern `*attribute_name*` matches any attribute name that contains `attribute_name`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)]
pub struct LocalizedAttributesRule {
pub attribute_patterns: Vec<String>,
pub attribute_patterns: AttributePatterns,
#[schema(value_type = Vec<String>)]
pub locales: Vec<Language>,
}
impl LocalizedAttributesRule {
pub fn new(attribute_patterns: Vec<String>, locales: Vec<Language>) -> Self {
Self { attribute_patterns, locales }
Self { attribute_patterns: AttributePatterns::from(attribute_patterns), locales }
}
pub fn match_str(&self, str: &str) -> bool {
self.attribute_patterns.iter().any(|pattern| match_pattern(pattern.as_str(), str))
self.attribute_patterns.match_str(str)
}
pub fn locales(&self) -> &[Language] {
@ -36,20 +36,6 @@ impl LocalizedAttributesRule {
}
}
fn match_pattern(pattern: &str, str: &str) -> bool {
if pattern == "*" {
true
} else if pattern.starts_with('*') && pattern.ends_with('*') {
str.contains(&pattern[1..pattern.len() - 1])
} else if let Some(pattern) = pattern.strip_prefix('*') {
str.ends_with(pattern)
} else if let Some(pattern) = pattern.strip_suffix('*') {
str.starts_with(pattern)
} else {
pattern == str
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocalizedFieldIds {
field_id_to_locales: HashMap<FieldId, Vec<Language>>,
@ -65,7 +51,7 @@ impl LocalizedFieldIds {
if let Some(rules) = rules {
let fields = fields_ids.filter_map(|field_id| {
fields_ids_map.name(field_id).map(|field_name| (field_id, field_name))
fields_ids_map.name(field_id).map(|field_name: &str| (field_id, field_name))
});
for (field_id, field_name) in fields {
@ -108,24 +94,3 @@ impl LocalizedFieldIds {
locales
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_match_pattern() {
assert!(match_pattern("*", "test"));
assert!(match_pattern("test*", "test"));
assert!(match_pattern("test*", "testa"));
assert!(match_pattern("*test", "test"));
assert!(match_pattern("*test", "atest"));
assert!(match_pattern("*test*", "test"));
assert!(match_pattern("*test*", "atesta"));
assert!(match_pattern("*test*", "atest"));
assert!(match_pattern("*test*", "testa"));
assert!(!match_pattern("test*test", "test"));
assert!(!match_pattern("*test", "testa"));
assert!(!match_pattern("test*", "atest"));
}
}

View File

@ -95,12 +95,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
// If the settings specifies that a _geo field must be used therefore we must check the
// validity of it in all the documents of this batch and this is when we return `Some`.
let geo_field_id = match documents_batch_index.id(RESERVED_GEO_FIELD_NAME) {
Some(geo_field_id)
if index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME)
|| index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME) =>
{
Some(geo_field_id)
}
Some(geo_field_id) if index.is_geo_activated(rtxn)? => Some(geo_field_id),
_otherwise => None,
};

View File

@ -9,7 +9,6 @@ use heed::RoTxn;
use serde_json::value::RawValue;
use serde_json::Value;
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::error::GeoError;
use crate::update::new::document::Document;
use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor};
@ -29,9 +28,7 @@ impl GeoExtractor {
index: &Index,
grenad_parameters: GrenadParameters,
) -> Result<Option<Self>> {
let is_sortable = index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME);
let is_filterable = index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME);
if is_sortable || is_filterable {
if index.is_geo_activated(rtxn)? {
Ok(Some(GeoExtractor { grenad_parameters }))
} else {
Ok(None)