diff --git a/Cargo.lock b/Cargo.lock index ccf79f9a2..effdfe9a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2731,6 +2731,7 @@ dependencies = [ "grenad", "heed", "hnsw", + "indexmap", "insta", "itertools", "json-depth-checker", diff --git a/dump/src/lib.rs b/dump/src/lib.rs index 5ae9ddfed..1e21eed05 100644 --- a/dump/src/lib.rs +++ b/dump/src/lib.rs @@ -208,12 +208,13 @@ pub(crate) mod test { use std::str::FromStr; use big_s::S; - use maplit::btreeset; + use maplit::{btreemap, btreeset}; + use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::{Action, Key}; + use meilisearch_types::milli; use meilisearch_types::milli::update::Setting; - use meilisearch_types::milli::{self}; - use meilisearch_types::settings::{Checked, Settings}; + use meilisearch_types::settings::{Checked, FacetingSettings, Settings}; use meilisearch_types::tasks::{Details, Status}; use serde_json::{json, Map, Value}; use time::macros::datetime; @@ -263,7 +264,12 @@ pub(crate) mod test { synonyms: Setting::NotSet, distinct_attribute: Setting::NotSet, typo_tolerance: Setting::NotSet, - faceting: Setting::NotSet, + faceting: Setting::Set(FacetingSettings { + max_values_per_facet: Setting::Set(111), + sort_facet_values_by: Setting::Set( + btreemap! { S("age") => FacetValuesSort::Count }, + ), + }), pagination: Setting::NotSet, _kind: std::marker::PhantomData, }; diff --git a/dump/src/reader/compat/v5_to_v6.rs b/dump/src/reader/compat/v5_to_v6.rs index b4c851d28..ef5588d8f 100644 --- a/dump/src/reader/compat/v5_to_v6.rs +++ b/dump/src/reader/compat/v5_to_v6.rs @@ -362,6 +362,7 @@ impl From> for v6::Settings { faceting: match settings.faceting { v5::Setting::Set(faceting) => v6::Setting::Set(v6::FacetingSettings { max_values_per_facet: faceting.max_values_per_facet.into(), + sort_facet_values_by: v6::Setting::NotSet, }), v5::Setting::Reset => v6::Setting::Reset, v5::Setting::NotSet => v6::Setting::NotSet, diff --git a/meilisearch-types/src/facet_values_sort.rs b/meilisearch-types/src/facet_values_sort.rs new file mode 100644 index 000000000..278061f19 --- /dev/null +++ b/meilisearch-types/src/facet_values_sort.rs @@ -0,0 +1,33 @@ +use deserr::Deserr; +use milli::OrderBy; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Deserr)] +#[serde(rename_all = "camelCase")] +#[deserr(rename_all = camelCase)] +pub enum FacetValuesSort { + /// Facet values are sorted in alphabetical order, ascending from A to Z. + #[default] + Alpha, + /// Facet values are sorted by decreasing count. + /// The count is the number of records containing this facet value in the results of the query. + Count, +} + +impl From for OrderBy { + fn from(val: FacetValuesSort) -> Self { + match val { + FacetValuesSort::Alpha => OrderBy::Lexicographic, + FacetValuesSort::Count => OrderBy::Count, + } + } +} + +impl From for FacetValuesSort { + fn from(val: OrderBy) -> Self { + match val { + OrderBy::Lexicographic => FacetValuesSort::Alpha, + OrderBy::Count => FacetValuesSort::Count, + } + } +} diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs index dbdec14fc..b0762563a 100644 --- a/meilisearch-types/src/lib.rs +++ b/meilisearch-types/src/lib.rs @@ -2,6 +2,7 @@ pub mod compression; pub mod deserr; pub mod document_formats; pub mod error; +pub mod facet_values_sort; pub mod features; pub mod index_uid; pub mod index_uid_pattern; diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 0fc7ea6e2..a5fb10074 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -14,8 +14,9 @@ use serde::{Deserialize, Serialize, Serializer}; use crate::deserr::DeserrJsonError; use crate::error::deserr_codes::*; +use crate::facet_values_sort::FacetValuesSort; -/// The maximimum number of results that the engine +/// The maximum number of results that the engine /// will be able to return in one search call. pub const DEFAULT_PAGINATION_MAX_TOTAL_HITS: usize = 1000; @@ -102,6 +103,9 @@ pub struct FacetingSettings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] pub max_values_per_facet: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + pub sort_facet_values_by: Setting>, } #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] @@ -398,13 +402,25 @@ pub fn apply_settings_to_builder( Setting::NotSet => (), } - match settings.faceting { - Setting::Set(ref value) => match value.max_values_per_facet { - Setting::Set(val) => builder.set_max_values_per_facet(val), - Setting::Reset => builder.reset_max_values_per_facet(), - Setting::NotSet => (), - }, - Setting::Reset => builder.reset_max_values_per_facet(), + match &settings.faceting { + Setting::Set(FacetingSettings { max_values_per_facet, sort_facet_values_by }) => { + match max_values_per_facet { + Setting::Set(val) => builder.set_max_values_per_facet(*val), + Setting::Reset => builder.reset_max_values_per_facet(), + Setting::NotSet => (), + } + match sort_facet_values_by { + Setting::Set(val) => builder.set_sort_facet_values_by( + val.iter().map(|(name, order)| (name.clone(), (*order).into())).collect(), + ), + Setting::Reset => builder.reset_sort_facet_values_by(), + Setting::NotSet => (), + } + } + Setting::Reset => { + builder.reset_max_values_per_facet(); + builder.reset_sort_facet_values_by(); + } Setting::NotSet => (), } @@ -476,6 +492,13 @@ pub fn settings( max_values_per_facet: Setting::Set( index.max_values_per_facet(rtxn)?.unwrap_or(DEFAULT_VALUES_PER_FACET), ), + sort_facet_values_by: Setting::Set( + index + .sort_facet_values_by(rtxn)? + .into_iter() + .map(|(name, sort)| (name, sort.into())) + .collect(), + ), }; let pagination = PaginationSettings { diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index d90dd24dd..7d6601ac5 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -14,14 +14,27 @@ default-run = "meilisearch" [dependencies] actix-cors = "0.6.4" -actix-http = { version = "3.3.1", default-features = false, features = ["compress-brotli", "compress-gzip", "rustls"] } -actix-web = { version = "4.3.1", default-features = false, features = ["macros", "compress-brotli", "compress-gzip", "cookies", "rustls"] } +actix-http = { version = "3.3.1", default-features = false, features = [ + "compress-brotli", + "compress-gzip", + "rustls", +] } +actix-web = { version = "4.3.1", default-features = false, features = [ + "macros", + "compress-brotli", + "compress-gzip", + "cookies", + "rustls", +] } actix-web-static-files = { git = "https://github.com/kilork/actix-web-static-files.git", rev = "2d3b6160", optional = true } anyhow = { version = "1.0.70", features = ["backtrace"] } async-stream = "0.3.5" async-trait = "0.1.68" bstr = "1.4.0" -byte-unit = { version = "4.0.19", default-features = false, features = ["std", "serde"] } +byte-unit = { version = "4.0.19", default-features = false, features = [ + "std", + "serde", +] } bytes = "1.4.0" clap = { version = "4.2.1", features = ["derive", "env"] } crossbeam-channel = "0.5.8" @@ -57,7 +70,10 @@ prometheus = { version = "0.13.3", features = ["process"] } rand = "0.8.5" rayon = "1.7.0" regex = "1.7.3" -reqwest = { version = "0.11.16", features = ["rustls-tls", "json"], default-features = false } +reqwest = { version = "0.11.16", features = [ + "rustls-tls", + "json", +], default-features = false } rustls = "0.20.8" rustls-pemfile = "1.0.2" segment = { version = "0.2.2", optional = true } @@ -71,7 +87,12 @@ sysinfo = "0.28.4" tar = "0.4.38" tempfile = "3.5.0" thiserror = "1.0.40" -time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] } +time = { version = "0.3.20", features = [ + "serde-well-known", + "formatting", + "parsing", + "macros", +] } tokio = { version = "1.27.0", features = ["full"] } tokio-stream = "0.1.12" toml = "0.7.3" @@ -90,7 +111,7 @@ brotli = "3.3.4" insta = "1.29.0" manifest-dir-macros = "0.1.16" maplit = "1.0.2" -meili-snap = {path = "../meili-snap"} +meili-snap = { path = "../meili-snap" } temp-env = "0.3.3" urlencoding = "2.1.2" yaup = "0.2.1" @@ -99,7 +120,10 @@ yaup = "0.2.1" anyhow = { version = "1.0.70", optional = true } cargo_toml = { version = "0.15.2", optional = true } hex = { version = "0.4.3", optional = true } -reqwest = { version = "0.11.16", features = ["blocking", "rustls-tls"], default-features = false, optional = true } +reqwest = { version = "0.11.16", features = [ + "blocking", + "rustls-tls", +], default-features = false, optional = true } sha-1 = { version = "0.10.1", optional = true } static-files = { version = "0.2.3", optional = true } tempfile = { version = "3.5.0", optional = true } @@ -109,7 +133,17 @@ zip = { version = "0.6.4", optional = true } [features] default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"] analytics = ["segment"] -mini-dashboard = ["actix-web-static-files", "static-files", "anyhow", "cargo_toml", "hex", "reqwest", "sha-1", "tempfile", "zip"] +mini-dashboard = [ + "actix-web-static-files", + "static-files", + "anyhow", + "cargo_toml", + "hex", + "reqwest", + "sha-1", + "tempfile", + "zip", +] chinese = ["meilisearch-types/chinese"] hebrew = ["meilisearch-types/hebrew"] japanese = ["meilisearch-types/japanese"] diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 4b6cde685..fb5d8ff7a 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -401,12 +401,17 @@ make_setting_route!( analytics, |setting: &Option, req: &HttpRequest| { use serde_json::json; + use meilisearch_types::facet_values_sort::FacetValuesSort; analytics.publish( "Faceting Updated".to_string(), json!({ "faceting": { "max_values_per_facet": setting.as_ref().and_then(|s| s.max_values_per_facet.set()), + "sort_facet_values_by_star_count": setting.as_ref().and_then(|s| { + s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) + }), + "sort_facet_values_by_total": setting.as_ref().and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), }, }), Some(req), @@ -545,6 +550,10 @@ pub async fn update_all( .as_ref() .set() .and_then(|s| s.max_values_per_facet.as_ref().set()), + "sort_facet_values_by": new_settings.faceting + .as_ref() + .set() + .and_then(|s| s.sort_facet_values_by.as_ref().set()), }, "pagination": { "max_total_hits": new_settings.pagination diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index bebf80084..85fef13b3 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -6,6 +6,7 @@ use std::time::Instant; use deserr::Deserr; use either::Either; use index_scheduler::RoFeatures; +use indexmap::IndexMap; use log::warn; use meilisearch_auth::IndexSearchRules; use meilisearch_types::deserr::DeserrJsonError; @@ -14,7 +15,7 @@ use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::{ - dot_product_similarity, FacetValueHit, InternalError, SearchForFacetValues, + dot_product_similarity, FacetValueHit, InternalError, OrderBy, SearchForFacetValues, }; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::{milli, Document}; @@ -226,6 +227,26 @@ impl From for TermsMatchingStrategy { } } +#[derive(Debug, Default, Clone, PartialEq, Eq, Deserr)] +#[deserr(rename_all = camelCase)] +pub enum FacetValuesSort { + /// Facet values are sorted in alphabetical order, ascending from A to Z. + #[default] + Alpha, + /// Facet values are sorted by decreasing count. + /// The count is the number of records containing this facet value in the results of the query. + Count, +} + +impl From for OrderBy { + fn from(val: FacetValuesSort) -> Self { + match val { + FacetValuesSort::Alpha => OrderBy::Lexicographic, + FacetValuesSort::Count => OrderBy::Count, + } + } +} + #[derive(Debug, Clone, Serialize, PartialEq)] pub struct SearchHit { #[serde(flatten)] @@ -253,7 +274,7 @@ pub struct SearchResult { #[serde(flatten)] pub hits_info: HitsInfo, #[serde(skip_serializing_if = "Option::is_none")] - pub facet_distribution: Option>>, + pub facet_distribution: Option>>, #[serde(skip_serializing_if = "Option::is_none")] pub facet_stats: Option>, } @@ -554,10 +575,30 @@ pub fn perform_search( .unwrap_or(DEFAULT_VALUES_PER_FACET); facet_distribution.max_values_per_facet(max_values_by_facet); + let sort_facet_values_by = + index.sort_facet_values_by(&rtxn).map_err(milli::Error::from)?; + let default_sort_facet_values_by = + sort_facet_values_by.get("*").copied().unwrap_or_default(); + if fields.iter().all(|f| f != "*") { + let fields: Vec<_> = fields + .iter() + .map(|n| { + ( + n, + sort_facet_values_by + .get(n) + .copied() + .unwrap_or(default_sort_facet_values_by), + ) + }) + .collect(); facet_distribution.facets(fields); } - let distribution = facet_distribution.candidates(candidates).execute()?; + let distribution = facet_distribution + .candidates(candidates) + .default_order_by(default_sort_facet_values_by) + .execute()?; let stats = facet_distribution.compute_stats()?; (Some(distribution), Some(stats)) } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 31fa49b60..ce225cdf7 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -36,7 +36,7 @@ async fn import_dump_v1_movie_raw() { assert_eq!(code, 200); assert_eq!( settings, - json!({"displayedAttributes": ["*"], "searchableAttributes": ["*"], "filterableAttributes": [], "sortableAttributes": [], "rankingRules": ["typo", "words", "proximity", "attribute", "exactness"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({"displayedAttributes": ["*"], "searchableAttributes": ["*"], "filterableAttributes": [], "sortableAttributes": [], "rankingRules": ["typo", "words", "proximity", "attribute", "exactness"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -128,7 +128,7 @@ async fn import_dump_v1_movie_with_settings() { assert_eq!(code, 200); assert_eq!( settings, - json!({ "displayedAttributes": ["genres", "id", "overview", "poster", "release_date", "title"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "sortableAttributes": ["genres"], "rankingRules": ["typo", "words", "proximity", "attribute", "exactness"], "stopWords": ["of", "the"], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": { "oneTypo": 5, "twoTypos": 9 }, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({ "displayedAttributes": ["genres", "id", "overview", "poster", "release_date", "title"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "sortableAttributes": ["genres"], "rankingRules": ["typo", "words", "proximity", "attribute", "exactness"], "stopWords": ["of", "the"], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": { "oneTypo": 5, "twoTypos": 9 }, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -220,7 +220,7 @@ async fn import_dump_v1_rubygems_with_settings() { assert_eq!(code, 200); assert_eq!( settings, - json!({"displayedAttributes": ["description", "id", "name", "summary", "total_downloads", "version"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "sortableAttributes": ["version"], "rankingRules": ["typo", "words", "fame:desc", "proximity", "attribute", "exactness", "total_downloads:desc"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 }}) + json!({"displayedAttributes": ["description", "id", "name", "summary", "total_downloads", "version"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "sortableAttributes": ["version"], "rankingRules": ["typo", "words", "fame:desc", "proximity", "attribute", "exactness", "total_downloads:desc"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 }}) ); let (tasks, code) = index.list_tasks().await; @@ -310,7 +310,7 @@ async fn import_dump_v2_movie_raw() { assert_eq!(code, 200); assert_eq!( settings, - json!({"displayedAttributes": ["*"], "searchableAttributes": ["*"], "filterableAttributes": [], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({"displayedAttributes": ["*"], "searchableAttributes": ["*"], "filterableAttributes": [], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -402,7 +402,7 @@ async fn import_dump_v2_movie_with_settings() { assert_eq!(code, 200); assert_eq!( settings, - json!({ "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": ["of", "the"], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": { "oneTypo": 5, "twoTypos": 9 }, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({ "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": ["of", "the"], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": { "oneTypo": 5, "twoTypos": 9 }, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -494,7 +494,7 @@ async fn import_dump_v2_rubygems_with_settings() { assert_eq!(code, 200); assert_eq!( settings, - json!({"displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "sortableAttributes": [], "rankingRules": ["typo", "words", "fame:desc", "proximity", "attribute", "exactness", "total_downloads:desc"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 }}) + json!({"displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "sortableAttributes": [], "rankingRules": ["typo", "words", "fame:desc", "proximity", "attribute", "exactness", "total_downloads:desc"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 }}) ); let (tasks, code) = index.list_tasks().await; @@ -584,7 +584,7 @@ async fn import_dump_v3_movie_raw() { assert_eq!(code, 200); assert_eq!( settings, - json!({"displayedAttributes": ["*"], "searchableAttributes": ["*"], "filterableAttributes": [], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({"displayedAttributes": ["*"], "searchableAttributes": ["*"], "filterableAttributes": [], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -676,7 +676,7 @@ async fn import_dump_v3_movie_with_settings() { assert_eq!(code, 200); assert_eq!( settings, - json!({ "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": ["of", "the"], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": { "oneTypo": 5, "twoTypos": 9 }, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({ "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": ["of", "the"], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": { "oneTypo": 5, "twoTypos": 9 }, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -768,7 +768,7 @@ async fn import_dump_v3_rubygems_with_settings() { assert_eq!(code, 200); assert_eq!( settings, - json!({"displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "sortableAttributes": [], "rankingRules": ["typo", "words", "fame:desc", "proximity", "attribute", "exactness", "total_downloads:desc"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({"displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "sortableAttributes": [], "rankingRules": ["typo", "words", "fame:desc", "proximity", "attribute", "exactness", "total_downloads:desc"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -858,7 +858,7 @@ async fn import_dump_v4_movie_raw() { assert_eq!(code, 200); assert_eq!( settings, - json!({ "displayedAttributes": ["*"], "searchableAttributes": ["*"], "filterableAttributes": [], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({ "displayedAttributes": ["*"], "searchableAttributes": ["*"], "filterableAttributes": [], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -950,7 +950,7 @@ async fn import_dump_v4_movie_with_settings() { assert_eq!(code, 200); assert_eq!( settings, - json!({ "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": ["of", "the"], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": { "oneTypo": 5, "twoTypos": 9 }, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({ "displayedAttributes": ["title", "genres", "overview", "poster", "release_date"], "searchableAttributes": ["title", "overview"], "filterableAttributes": ["genres"], "sortableAttributes": [], "rankingRules": ["words", "typo", "proximity", "attribute", "exactness"], "stopWords": ["of", "the"], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": { "oneTypo": 5, "twoTypos": 9 }, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; @@ -1042,7 +1042,7 @@ async fn import_dump_v4_rubygems_with_settings() { assert_eq!(code, 200); assert_eq!( settings, - json!({ "displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "sortableAttributes": [], "rankingRules": ["typo", "words", "fame:desc", "proximity", "attribute", "exactness", "total_downloads:desc"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100 }, "pagination": { "maxTotalHits": 1000 } }) + json!({ "displayedAttributes": ["name", "summary", "description", "version", "total_downloads"], "searchableAttributes": ["name", "summary"], "filterableAttributes": ["version"], "sortableAttributes": [], "rankingRules": ["typo", "words", "fame:desc", "proximity", "attribute", "exactness", "total_downloads:desc"], "stopWords": [], "synonyms": {}, "distinctAttribute": null, "typoTolerance": {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [] }, "faceting": { "maxValuesPerFacet": 100, "sortFacetValuesBy": { "*": "alpha" } }, "pagination": { "maxTotalHits": 1000 } }) ); let (tasks, code) = index.list_tasks().await; diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 88e395b57..d39cbd96e 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -21,6 +21,9 @@ static DEFAULT_SETTINGS_VALUES: Lazy> = Lazy::new(| "faceting", json!({ "maxValuesPerFacet": json!(100), + "sortFacetValuesBy": { + "*": "alpha" + } }), ); map.insert( @@ -63,6 +66,9 @@ async fn get_settings() { settings["faceting"], json!({ "maxValuesPerFacet": 100, + "sortFacetValuesBy": { + "*": "alpha" + } }) ); assert_eq!( diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 08f0c2645..aa4b98ec2 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -34,6 +34,7 @@ heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-f "sync-read-txn", ] } hnsw = { version = "0.11.0", features = ["serde1"] } +indexmap = { version = "1.9.3", features = ["serde"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memmap2 = "0.5.10" diff --git a/milli/src/index.rs b/milli/src/index.rs index 5c32f75f5..392ed1705 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -26,7 +26,8 @@ use crate::readable_slices::ReadableSlices; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, - Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, + OrderBy, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, + BEU32, }; /// The HNSW data-structure that we serialize, fill and search in. @@ -71,6 +72,7 @@ pub mod main_key { pub const EXACT_WORDS: &str = "exact-words"; pub const EXACT_ATTRIBUTES: &str = "exact-attributes"; pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet"; + pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by"; pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; } @@ -1298,6 +1300,31 @@ impl Index { self.main.delete::<_, Str>(txn, main_key::MAX_VALUES_PER_FACET) } + pub fn sort_facet_values_by(&self, txn: &RoTxn) -> heed::Result> { + let mut orders = self + .main + .get::<_, Str, SerdeJson>>( + txn, + main_key::SORT_FACET_VALUES_BY, + )? + .unwrap_or_default(); + // Insert the default ordering if it is not already overwritten by the user. + orders.entry("*".to_string()).or_insert(OrderBy::Lexicographic); + Ok(orders) + } + + pub(crate) fn put_sort_facet_values_by( + &self, + txn: &mut RwTxn, + val: &HashMap, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(txn, main_key::SORT_FACET_VALUES_BY, &val) + } + + pub(crate) fn delete_sort_facet_values_by(&self, txn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(txn, main_key::SORT_FACET_VALUES_BY) + } + pub fn pagination_max_total_hits(&self, txn: &RoTxn) -> heed::Result> { self.main.get::<_, Str, OwnedType>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 4360eb38e..55b283931 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -58,7 +58,7 @@ pub use self::heed_codec::{ pub use self::index::Index; pub use self::search::{ FacetDistribution, FacetValueHit, Filter, FormatOptions, MatchBounds, MatcherBuilder, - MatchingWords, Search, SearchForFacetValues, SearchResult, TermsMatchingStrategy, + MatchingWords, OrderBy, Search, SearchForFacetValues, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index f5f32fecf..acf117ef6 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,19 +1,22 @@ -use std::collections::{BTreeMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::ControlFlow; use std::{fmt, mem}; use heed::types::ByteSlice; use heed::BytesDecode; +use indexmap::IndexMap; use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; use crate::error::UserError; use crate::facet::FacetType; use crate::heed_codec::facet::{ - FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, - OrderedF64Codec, + FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec, }; use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; -use crate::search::facet::facet_distribution_iter; +use crate::search::facet::facet_distribution_iter::{ + count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution, +}; use crate::{FieldId, Index, Result}; /// The default number of values by facets that will @@ -24,10 +27,21 @@ pub const DEFAULT_VALUES_PER_FACET: usize = 100; /// the system to choose between one algorithm or another. const CANDIDATES_THRESHOLD: u64 = 3000; +/// How should we fetch the facets? +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum OrderBy { + /// By lexicographic order... + #[default] + Lexicographic, + /// Or by number of docids in common? + Count, +} + pub struct FacetDistribution<'a> { - facets: Option>, + facets: Option>, candidates: Option, max_values_per_facet: usize, + default_order_by: OrderBy, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -38,13 +52,22 @@ impl<'a> FacetDistribution<'a> { facets: None, candidates: None, max_values_per_facet: DEFAULT_VALUES_PER_FACET, + default_order_by: OrderBy::default(), rtxn, index, } } - pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { - self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect()); + pub fn facets, A: AsRef>( + &mut self, + names_ordered_by: I, + ) -> &mut Self { + self.facets = Some( + names_ordered_by + .into_iter() + .map(|(name, order_by)| (name.as_ref().to_string(), order_by)) + .collect(), + ); self } @@ -53,6 +76,11 @@ impl<'a> FacetDistribution<'a> { self } + pub fn default_order_by(&mut self, order_by: OrderBy) -> &mut Self { + self.default_order_by = order_by; + self + } + pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self { self.candidates = Some(candidates); self @@ -65,7 +93,7 @@ impl<'a> FacetDistribution<'a> { field_id: FieldId, facet_type: FacetType, candidates: &RoaringBitmap, - distribution: &mut BTreeMap, + distribution: &mut IndexMap, ) -> heed::Result<()> { match facet_type { FacetType::Number => { @@ -134,9 +162,15 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, candidates: &RoaringBitmap, - distribution: &mut BTreeMap, + order_by: OrderBy, + distribution: &mut IndexMap, ) -> heed::Result<()> { - facet_distribution_iter::iterate_over_facet_distribution( + let search_function = match order_by { + OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution, + OrderBy::Count => count_iterate_over_facet_distribution, + }; + + search_function( self.rtxn, self.index .facet_id_f64_docids @@ -159,9 +193,15 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, candidates: &RoaringBitmap, - distribution: &mut BTreeMap, + order_by: OrderBy, + distribution: &mut IndexMap, ) -> heed::Result<()> { - facet_distribution_iter::iterate_over_facet_distribution( + let search_function = match order_by { + OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution, + OrderBy::Count => count_iterate_over_facet_distribution, + }; + + search_function( self.rtxn, self.index .facet_id_string_docids @@ -189,93 +229,47 @@ impl<'a> FacetDistribution<'a> { ) } - /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the - /// facet values one by one and iterate on the facet level 0 for numbers. - fn facet_values_from_raw_facet_database( + fn facet_values( &self, field_id: FieldId, - ) -> heed::Result> { - let mut distribution = BTreeMap::new(); - - let db = self.index.facet_id_f64_docids; - let mut prefix = vec![]; - prefix.extend_from_slice(&field_id.to_be_bytes()); - prefix.push(0); // read values from level 0 only - - let iter = db - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); - - for result in iter { - let (key, value) = result?; - distribution.insert(key.left_bound.to_string(), value.bitmap.len()); - if distribution.len() == self.max_values_per_facet { - break; - } - } - - let iter = self - .index - .facet_id_string_docids - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); - - for result in iter { - let (key, value) = result?; - - let docid = value.bitmap.iter().next().unwrap(); - let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound); - let original_string = - self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned(); - - distribution.insert(original_string, value.bitmap.len()); - if distribution.len() == self.max_values_per_facet { - break; - } - } - - Ok(distribution) - } - - fn facet_values(&self, field_id: FieldId) -> heed::Result> { + order_by: OrderBy, + ) -> heed::Result> { use FacetType::{Number, String}; - match self.candidates { - Some(ref candidates) => { + let mut distribution = IndexMap::new(); + match (order_by, &self.candidates) { + (OrderBy::Lexicographic, Some(cnd)) if cnd.len() <= CANDIDATES_THRESHOLD => { // Classic search, candidates were specified, we must return facet values only related // to those candidates. We also enter here for facet strings for performance reasons. - let mut distribution = BTreeMap::new(); - if candidates.len() <= CANDIDATES_THRESHOLD { - self.facet_distribution_from_documents( - field_id, - Number, - candidates, - &mut distribution, - )?; - self.facet_distribution_from_documents( - field_id, - String, - candidates, - &mut distribution, - )?; - } else { - self.facet_numbers_distribution_from_facet_levels( - field_id, - candidates, - &mut distribution, - )?; - self.facet_strings_distribution_from_facet_levels( - field_id, - candidates, - &mut distribution, - )?; - } - Ok(distribution) + self.facet_distribution_from_documents(field_id, Number, cnd, &mut distribution)?; + self.facet_distribution_from_documents(field_id, String, cnd, &mut distribution)?; } - None => self.facet_values_from_raw_facet_database(field_id), - } + _ => { + let universe; + let candidates = match &self.candidates { + Some(cnd) => cnd, + None => { + universe = self.index.documents_ids(self.rtxn)?; + &universe + } + }; + + self.facet_numbers_distribution_from_facet_levels( + field_id, + candidates, + order_by, + &mut distribution, + )?; + self.facet_strings_distribution_from_facet_levels( + field_id, + candidates, + order_by, + &mut distribution, + )?; + } + }; + + Ok(distribution) } pub fn compute_stats(&self) -> Result> { @@ -291,6 +285,7 @@ impl<'a> FacetDistribution<'a> { Some(facets) => { let invalid_fields: HashSet<_> = facets .iter() + .map(|(name, _)| name) .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) .collect(); if !invalid_fields.is_empty() { @@ -300,7 +295,7 @@ impl<'a> FacetDistribution<'a> { } .into()); } else { - facets.clone() + facets.iter().map(|(name, _)| name).cloned().collect() } } None => filterable_fields, @@ -337,7 +332,7 @@ impl<'a> FacetDistribution<'a> { Ok(distribution) } - pub fn execute(&self) -> Result>> { + pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let filterable_fields = self.index.filterable_fields(self.rtxn)?; @@ -345,6 +340,7 @@ impl<'a> FacetDistribution<'a> { Some(ref facets) => { let invalid_fields: HashSet<_> = facets .iter() + .map(|(name, _)| name) .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) .collect(); if !invalid_fields.is_empty() { @@ -354,7 +350,7 @@ impl<'a> FacetDistribution<'a> { } .into()); } else { - facets.clone() + facets.iter().map(|(name, _)| name).cloned().collect() } } None => filterable_fields, @@ -363,7 +359,12 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { if crate::is_faceted(name, &fields) { - let values = self.facet_values(fid)?; + let order_by = self + .facets + .as_ref() + .and_then(|facets| facets.get(name).copied()) + .unwrap_or(self.default_order_by); + let values = self.facet_values(fid, order_by)?; distribution.insert(name.to_string(), values); } } @@ -374,25 +375,34 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { facets, candidates, max_values_per_facet, rtxn: _, index: _ } = - self; + let FacetDistribution { + facets, + candidates, + max_values_per_facet, + default_order_by, + rtxn: _, + index: _, + } = self; f.debug_struct("FacetDistribution") .field("facets", facets) .field("candidates", candidates) .field("max_values_per_facet", max_values_per_facet) + .field("default_order_by", default_order_by) .finish() } } #[cfg(test)] mod tests { + use std::iter; + use big_s::S; use maplit::hashset; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; - use crate::{milli_snap, FacetDistribution}; + use crate::{milli_snap, FacetDistribution, OrderBy}; #[test] fn few_candidates_few_facet_values() { @@ -417,14 +427,14 @@ mod tests { let txn = index.read_txn().unwrap(); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .execute() .unwrap(); milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates([0, 1, 2].iter().copied().collect()) .execute() .unwrap(); @@ -432,7 +442,7 @@ mod tests { milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates([1, 2].iter().copied().collect()) .execute() .unwrap(); @@ -443,7 +453,7 @@ mod tests { milli_snap!(format!("{map:?}"), @r###"{"colour": {" blue": 1, "RED": 1}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates([2].iter().copied().collect()) .execute() .unwrap(); @@ -451,13 +461,22 @@ mod tests { milli_snap!(format!("{map:?}"), @r###"{"colour": {"RED": 1}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates([0, 1, 2].iter().copied().collect()) .max_values_per_facet(1) .execute() .unwrap(); milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(iter::once(("colour", OrderBy::Count))) + .candidates([0, 1, 2].iter().copied().collect()) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2}}"###); } #[test] @@ -489,14 +508,14 @@ mod tests { let txn = index.read_txn().unwrap(); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .execute() .unwrap(); milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .max_values_per_facet(1) .execute() .unwrap(); @@ -504,7 +523,7 @@ mod tests { milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..10_000).collect()) .execute() .unwrap(); @@ -512,7 +531,7 @@ mod tests { milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..5_000).collect()) .execute() .unwrap(); @@ -520,7 +539,7 @@ mod tests { milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..5_000).collect()) .execute() .unwrap(); @@ -528,13 +547,22 @@ mod tests { milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..5_000).collect()) .max_values_per_facet(1) .execute() .unwrap(); milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(iter::once(("colour", OrderBy::Count))) + .candidates((0..5_000).collect()) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Red": 3000}}"###); } #[test] @@ -566,14 +594,14 @@ mod tests { let txn = index.read_txn().unwrap(); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .execute() .unwrap(); milli_snap!(format!("{map:?}"), "no_candidates", @"ac9229ed5964d893af96a7076e2f8af5"); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .max_values_per_facet(2) .execute() .unwrap(); @@ -581,7 +609,7 @@ mod tests { milli_snap!(format!("{map:?}"), "no_candidates_with_max_2", @r###"{"colour": {"0": 10, "1": 10}}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..10_000).collect()) .execute() .unwrap(); @@ -589,7 +617,7 @@ mod tests { milli_snap!(format!("{map:?}"), "candidates_0_10_000", @"ac9229ed5964d893af96a7076e2f8af5"); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..5_000).collect()) .execute() .unwrap(); @@ -626,14 +654,14 @@ mod tests { let txn = index.read_txn().unwrap(); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .compute_stats() .unwrap(); milli_snap!(format!("{map:?}"), "no_candidates", @"{}"); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..1000).collect()) .compute_stats() .unwrap(); @@ -641,7 +669,7 @@ mod tests { milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 999.0)}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((217..777).collect()) .compute_stats() .unwrap(); @@ -678,14 +706,14 @@ mod tests { let txn = index.read_txn().unwrap(); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .compute_stats() .unwrap(); milli_snap!(format!("{map:?}"), "no_candidates", @"{}"); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..1000).collect()) .compute_stats() .unwrap(); @@ -693,7 +721,7 @@ mod tests { milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 1999.0)}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((217..777).collect()) .compute_stats() .unwrap(); @@ -730,14 +758,14 @@ mod tests { let txn = index.read_txn().unwrap(); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .compute_stats() .unwrap(); milli_snap!(format!("{map:?}"), "no_candidates", @"{}"); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..1000).collect()) .compute_stats() .unwrap(); @@ -745,7 +773,7 @@ mod tests { milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 999.0)}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((217..777).collect()) .compute_stats() .unwrap(); @@ -786,14 +814,14 @@ mod tests { let txn = index.read_txn().unwrap(); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .compute_stats() .unwrap(); milli_snap!(format!("{map:?}"), "no_candidates", @"{}"); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((0..1000).collect()) .compute_stats() .unwrap(); @@ -801,7 +829,7 @@ mod tests { milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 1998.0)}"###); let map = FacetDistribution::new(&txn, &index) - .facets(std::iter::once("colour")) + .facets(iter::once(("colour", OrderBy::default()))) .candidates((217..777).collect()) .compute_stats() .unwrap(); diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index d355b981a..722a30e6d 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,3 +1,5 @@ +use std::cmp::Reverse; +use std::collections::BinaryHeap; use std::ops::ControlFlow; use heed::Result; @@ -19,7 +21,7 @@ use crate::DocumentId; /// /// The return value of the closure is a `ControlFlow<()>` which indicates whether we should /// keep iterating over the different facet values or stop. -pub fn iterate_over_facet_distribution<'t, CB>( +pub fn lexicographically_iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, field_id: u16, @@ -29,7 +31,7 @@ pub fn iterate_over_facet_distribution<'t, CB>( where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { - let mut fd = FacetDistribution { rtxn, db, field_id, callback }; + let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback }; let highest_level = get_highest_level( rtxn, db.remap_key_type::>(), @@ -44,7 +46,102 @@ where } } -struct FacetDistribution<'t, CB> +pub fn count_iterate_over_facet_distribution<'t, CB>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: &RoaringBitmap, + mut callback: CB, +) -> Result<()> +where + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, +{ + /// # Important + /// The order of the fields determines the order in which the facet values will be returned. + /// This struct is inserted in a BinaryHeap and popped later on. + #[derive(Debug, PartialOrd, Ord, PartialEq, Eq)] + struct LevelEntry<'t> { + /// The number of candidates in this entry. + count: u64, + /// The key level of the entry. + level: Reverse, + /// The left bound key. + left_bound: &'t [u8], + /// The number of keys we must look for after `left_bound`. + group_size: u8, + /// Any docid in the set of matching documents. Used to find the original facet string. + any_docid: u32, + } + + // Represents the list of keys that we must explore. + let mut heap = BinaryHeap::new(); + let highest_level = get_highest_level( + rtxn, + db.remap_key_type::>(), + field_id, + )?; + + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + // We first fill the heap with values from the highest level + let starting_key = + FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; + for el in db.range(rtxn, &(&starting_key..))?.take(usize::MAX) { + let (key, value) = el?; + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != field_id { + break; + } + let intersection = value.bitmap & candidates; + let count = intersection.len(); + if count != 0 { + heap.push(LevelEntry { + count, + level: Reverse(key.level), + left_bound: key.left_bound, + group_size: value.size, + any_docid: intersection.min().unwrap(), + }); + } + } + + while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop() + { + if let Reverse(0) = level { + match (callback)(left_bound, count, any_docid)? { + ControlFlow::Continue(_) => (), + ControlFlow::Break(_) => return Ok(()), + } + } else { + let starting_key = FacetGroupKey { field_id, level: level.0 - 1, left_bound }; + for el in db.range(rtxn, &(&starting_key..))?.take(group_size as usize) { + let (key, value) = el?; + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != field_id { + break; + } + let intersection = value.bitmap & candidates; + let count = intersection.len(); + if count != 0 { + heap.push(LevelEntry { + count, + level: Reverse(key.level), + left_bound: key.left_bound, + group_size: value.size, + any_docid: intersection.min().unwrap(), + }); + } + } + } + } + } + + Ok(()) +} + +/// Iterate over the facets values by lexicographic order. +struct LexicographicFacetDistribution<'t, CB> where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { @@ -54,7 +151,7 @@ where callback: CB, } -impl<'t, CB> FacetDistribution<'t, CB> +impl<'t, CB> LexicographicFacetDistribution<'t, CB> where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { @@ -86,6 +183,7 @@ where } Ok(ControlFlow::Continue(())) } + fn iterate( &mut self, candidates: &RoaringBitmap, @@ -98,10 +196,10 @@ where } let starting_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound }; - let iter = self.db.range(self.rtxn, &(&starting_key..)).unwrap().take(group_size); + let iter = self.db.range(self.rtxn, &(&starting_key..))?.take(group_size); for el in iter { - let (key, value) = el.unwrap(); + let (key, value) = el?; // The range is unbounded on the right and the group size for the highest level is MAX, // so we need to check that we are not iterating over the next field id if key.field_id != self.field_id { @@ -116,7 +214,7 @@ where value.size as usize, )?; match cf { - ControlFlow::Continue(_) => {} + ControlFlow::Continue(_) => (), ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } @@ -132,7 +230,7 @@ mod tests { use heed::BytesDecode; use roaring::RoaringBitmap; - use super::iterate_over_facet_distribution; + use super::lexicographically_iterate_over_facet_distribution; use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; @@ -144,7 +242,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).collect::(); let mut results = String::new(); - iterate_over_facet_distribution( + lexicographically_iterate_over_facet_distribution( &txn, index.content, 0, @@ -161,6 +259,7 @@ mod tests { txn.commit().unwrap(); } } + #[test] fn filter_distribution_all_stop_early() { let indexes = [get_simple_index(), get_random_looking_index()]; @@ -169,7 +268,7 @@ mod tests { let candidates = (0..=255).collect::(); let mut results = String::new(); let mut nbr_facets = 0; - iterate_over_facet_distribution( + lexicographically_iterate_over_facet_distribution( &txn, index.content, 0, diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 51f1bf005..ebc9e1da0 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -4,7 +4,7 @@ use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, RoTxn}; use roaring::RoaringBitmap; -pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; +pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; use crate::heed_codec::ByteSliceRefCodec; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index e05562f8e..65e78caa9 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -7,7 +7,7 @@ use log::error; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; +pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; use self::new::PartialSearchResult; use crate::error::UserError; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 3e271924b..aa69abca1 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -14,7 +14,7 @@ use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; -use crate::{FieldsIdsMap, Index, Result}; +use crate::{FieldsIdsMap, Index, OrderBy, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -122,6 +122,7 @@ pub struct Settings<'a, 't, 'u, 'i> { /// Attributes on which typo tolerance is disabled. exact_attributes: Setting>, max_values_per_facet: Setting, + sort_facet_values_by: Setting>, pagination_max_total_hits: Setting, } @@ -149,6 +150,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { min_word_len_one_typo: Setting::NotSet, exact_attributes: Setting::NotSet, max_values_per_facet: Setting::NotSet, + sort_facet_values_by: Setting::NotSet, pagination_max_total_hits: Setting::NotSet, indexer_config, } @@ -275,6 +277,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.max_values_per_facet = Setting::Reset; } + pub fn set_sort_facet_values_by(&mut self, value: HashMap) { + self.sort_facet_values_by = Setting::Set(value); + } + + pub fn reset_sort_facet_values_by(&mut self) { + self.sort_facet_values_by = Setting::Reset; + } + pub fn set_pagination_max_total_hits(&mut self, value: usize) { self.pagination_max_total_hits = Setting::Set(value); } @@ -680,6 +690,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_sort_facet_values_by(&mut self) -> Result<()> { + match self.sort_facet_values_by.as_ref() { + Setting::Set(value) => { + self.index.put_sort_facet_values_by(self.wtxn, value)?; + } + Setting::Reset => { + self.index.delete_sort_facet_values_by(self.wtxn)?; + } + Setting::NotSet => (), + } + + Ok(()) + } + fn update_pagination_max_total_hits(&mut self) -> Result<()> { match self.pagination_max_total_hits { Setting::Set(max) => { @@ -714,6 +738,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_min_typo_word_len()?; self.update_exact_words()?; self.update_max_values_per_facet()?; + self.update_sort_facet_values_by()?; self.update_pagination_max_total_hits()?; // If there is new faceted fields we indicate that we must reindex as we must @@ -1515,6 +1540,7 @@ mod tests { exact_words, exact_attributes, max_values_per_facet, + sort_facet_values_by, pagination_max_total_hits, } = settings; assert!(matches!(searchable_fields, Setting::NotSet)); @@ -1532,6 +1558,7 @@ mod tests { assert!(matches!(exact_words, Setting::NotSet)); assert!(matches!(exact_attributes, Setting::NotSet)); assert!(matches!(max_values_per_facet, Setting::NotSet)); + assert!(matches!(sort_facet_values_by, Setting::NotSet)); assert!(matches!(pagination_max_total_hits, Setting::NotSet)); }) .unwrap(); diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index e2f89f2db..03405531d 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -5,7 +5,7 @@ use heed::EnvOpenOptions; use maplit::hashset; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{FacetDistribution, Index, Object}; +use milli::{FacetDistribution, Index, Object, OrderBy}; use serde_json::Deserializer; #[test] @@ -63,12 +63,12 @@ fn test_facet_distribution_with_no_facet_values() { let txn = index.read_txn().unwrap(); let mut distrib = FacetDistribution::new(&txn, &index); - distrib.facets(vec!["genres"]); + distrib.facets(vec![("genres", OrderBy::default())]); let result = distrib.execute().unwrap(); assert_eq!(result["genres"].len(), 0); let mut distrib = FacetDistribution::new(&txn, &index); - distrib.facets(vec!["tags"]); + distrib.facets(vec![("tags", OrderBy::default())]); let result = distrib.execute().unwrap(); assert_eq!(result["tags"].len(), 2); }