diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 27de36c6d..d98e96e87 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -671,27 +671,16 @@ pub fn perform_search( let sort_facet_values_by = index.sort_facet_values_by(&rtxn).map_err(milli::Error::from)?; - let default_sort_facet_values_by = - sort_facet_values_by.get("*").copied().unwrap_or_default(); if fields.iter().all(|f| f != "*") { - let fields: Vec<_> = fields - .iter() - .map(|n| { - ( - n, - sort_facet_values_by - .get(n) - .copied() - .unwrap_or(default_sort_facet_values_by), - ) - }) - .collect(); + let fields: Vec<_> = + fields.iter().map(|n| (n, sort_facet_values_by.get(n))).collect(); facet_distribution.facets(fields); } + let distribution = facet_distribution .candidates(candidates) - .default_order_by(default_sort_facet_values_by) + .default_order_by(sort_facet_values_by.get("*")) .execute()?; let stats = facet_distribution.compute_stats()?; (Some(distribution), Some(stats)) diff --git a/meilisearch/tests/search/facet_search.rs b/meilisearch/tests/search/facet_search.rs index 5f9f631f9..12d2226a9 100644 --- a/meilisearch/tests/search/facet_search.rs +++ b/meilisearch/tests/search/facet_search.rs @@ -123,6 +123,28 @@ async fn simple_facet_search_with_max_values() { assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1); } +#[actix_rt::test] +async fn simple_facet_search_by_count_with_max_values() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index + .update_settings_faceting( + json!({ "maxValuesPerFacet": 1, "sortFacetValuesBy": { "*": "count" } }), + ) + .await; + index.update_settings_filterable_attributes(json!(["genres"])).await; + index.add_documents(documents, None).await; + index.wait_task(2).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1); +} + #[actix_rt::test] async fn non_filterable_facet_search_error() { let server = Server::new().await; @@ -157,3 +179,24 @@ async fn facet_search_dont_support_words() { assert_eq!(code, 200, "{}", response); assert_eq!(response["facetHits"].as_array().unwrap().len(), 0); } + +#[actix_rt::test] +async fn simple_facet_search_with_sort_by_count() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.update_settings_faceting(json!({ "sortFacetValuesBy": { "*": "count" } })).await; + index.update_settings_filterable_attributes(json!(["genres"])).await; + index.add_documents(documents, None).await; + index.wait_task(2).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + let hits = response["facetHits"].as_array().unwrap(); + assert_eq!(hits.len(), 2); + assert_eq!(hits[0], json!({ "value": "Action", "count": 3 })); + assert_eq!(hits[1], json!({ "value": "Adventure", "count": 2 })); +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 6ad39dcb1..2c3977403 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -20,13 +20,13 @@ use crate::heed_codec::facet::{ use crate::heed_codec::{ BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec, }; +use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::vector::EmbeddingConfig; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, - OrderBy, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, - BEU32, BEU64, + Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -1373,21 +1373,19 @@ impl Index { self.main.remap_key_type::().delete(txn, main_key::MAX_VALUES_PER_FACET) } - pub fn sort_facet_values_by(&self, txn: &RoTxn) -> heed::Result> { - let mut orders = self + pub fn sort_facet_values_by(&self, txn: &RoTxn) -> heed::Result { + let orders = self .main - .remap_types::>>() + .remap_types::>() .get(txn, main_key::SORT_FACET_VALUES_BY)? .unwrap_or_default(); - // Insert the default ordering if it is not already overwritten by the user. - orders.entry("*".to_string()).or_insert(OrderBy::Lexicographic); Ok(orders) } pub(crate) fn put_sort_facet_values_by( &self, txn: &mut RwTxn, - val: &HashMap, + val: &OrderByMap, ) -> heed::Result<()> { self.main.remap_types::>().put(txn, main_key::SORT_FACET_VALUES_BY, &val) } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index f6b398304..5effcea3d 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -16,6 +16,7 @@ pub mod facet; mod fields_ids_map; pub mod heed_codec; pub mod index; +pub mod order_by_map; pub mod prompt; pub mod proximity; pub mod score_details; @@ -56,10 +57,10 @@ pub use self::heed_codec::{ UncheckedU8StrStrCodec, }; pub use self::index::Index; +pub use self::search::facet::{FacetValueHit, SearchForFacetValues}; pub use self::search::{ - FacetDistribution, FacetValueHit, Filter, FormatOptions, MatchBounds, MatcherBuilder, - MatchingWords, OrderBy, Search, SearchForFacetValues, SearchResult, TermsMatchingStrategy, - DEFAULT_VALUES_PER_FACET, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy, + Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; diff --git a/milli/src/order_by_map.rs b/milli/src/order_by_map.rs new file mode 100644 index 000000000..287e62c3a --- /dev/null +++ b/milli/src/order_by_map.rs @@ -0,0 +1,57 @@ +use std::collections::{hash_map, HashMap}; +use std::iter::FromIterator; + +use serde::{Deserialize, Deserializer, Serialize}; + +use crate::OrderBy; + +#[derive(Serialize)] +pub struct OrderByMap(HashMap); + +impl OrderByMap { + pub fn get(&self, key: impl AsRef) -> OrderBy { + self.0 + .get(key.as_ref()) + .copied() + .unwrap_or_else(|| self.0.get("*").copied().unwrap_or_default()) + } + + pub fn insert(&mut self, key: String, value: OrderBy) -> Option { + self.0.insert(key, value) + } +} + +impl Default for OrderByMap { + fn default() -> Self { + let mut map = HashMap::new(); + map.insert("*".to_string(), OrderBy::Lexicographic); + OrderByMap(map) + } +} + +impl FromIterator<(String, OrderBy)> for OrderByMap { + fn from_iter>(iter: T) -> Self { + OrderByMap(iter.into_iter().collect()) + } +} + +impl IntoIterator for OrderByMap { + type Item = (String, OrderBy); + type IntoIter = hash_map::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} + +impl<'de> Deserialize<'de> for OrderByMap { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let mut map = Deserialize::deserialize(deserializer).map(OrderByMap)?; + // Insert the default ordering if it is not already overwritten by the user. + map.0.entry("*".to_string()).or_insert(OrderBy::default()); + Ok(map) + } +} diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index f676ee109..34a9cdcb8 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -6,15 +6,18 @@ use roaring::RoaringBitmap; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; +pub use self::search::{FacetValueHit, SearchForFacetValues}; use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; use crate::heed_codec::BytesRefCodec; use crate::{Index, Result}; + mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; mod facet_sort_ascending; mod facet_sort_descending; mod filter; +mod search; fn facet_extreme_value<'t>( mut extreme_it: impl Iterator> + 't, diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs new file mode 100644 index 000000000..0251d6b8d --- /dev/null +++ b/milli/src/search/facet/search.rs @@ -0,0 +1,326 @@ +use std::cmp::{Ordering, Reverse}; +use std::collections::BinaryHeap; +use std::ops::ControlFlow; + +use charabia::normalizer::NormalizerOption; +use charabia::Normalize; +use fst::automaton::{Automaton, Str}; +use fst::{IntoStreamer, Streamer}; +use roaring::RoaringBitmap; +use tracing::error; + +use crate::error::UserError; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; +use crate::search::build_dfa; +use crate::{DocumentId, FieldId, OrderBy, Result, Search}; + +/// The maximum number of values per facet returned by the facet search route. +const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100; + +pub struct SearchForFacetValues<'a> { + query: Option, + facet: String, + search_query: Search<'a>, + max_values: usize, + is_hybrid: bool, +} + +impl<'a> SearchForFacetValues<'a> { + pub fn new( + facet: String, + search_query: Search<'a>, + is_hybrid: bool, + ) -> SearchForFacetValues<'a> { + SearchForFacetValues { + query: None, + facet, + search_query, + max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, + is_hybrid, + } + } + + pub fn query(&mut self, query: impl Into) -> &mut Self { + self.query = Some(query.into()); + self + } + + pub fn max_values(&mut self, max: usize) -> &mut Self { + self.max_values = max; + self + } + + fn one_original_value_of( + &self, + field_id: FieldId, + facet_str: &str, + any_docid: DocumentId, + ) -> Result> { + let index = self.search_query.index; + let rtxn = self.search_query.rtxn; + let key: (FieldId, _, &str) = (field_id, any_docid, facet_str); + Ok(index.field_id_docid_facet_strings.get(rtxn, &key)?.map(|v| v.to_owned())) + } + + pub fn execute(&self) -> Result> { + let index = self.search_query.index; + let rtxn = self.search_query.rtxn; + + let filterable_fields = index.filterable_fields(rtxn)?; + if !filterable_fields.contains(&self.facet) { + let (valid_fields, hidden_fields) = + index.remove_hidden_fields(rtxn, filterable_fields)?; + + return Err(UserError::InvalidFacetSearchFacetName { + field: self.facet.clone(), + valid_fields, + hidden_fields, + } + .into()); + } + + let fields_ids_map = index.fields_ids_map(rtxn)?; + let fid = match fields_ids_map.id(&self.facet) { + Some(fid) => fid, + // we return an empty list of results when the attribute has been + // set as filterable but no document contains this field (yet). + None => return Ok(Vec::new()), + }; + + let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? { + Some(fst) => fst, + None => return Ok(Vec::new()), + }; + + let search_candidates = self + .search_query + .execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?; + + let mut results = match index.sort_facet_values_by(rtxn)?.get(&self.facet) { + OrderBy::Lexicographic => ValuesCollection::by_lexicographic(self.max_values), + OrderBy::Count => ValuesCollection::by_count(self.max_values), + }; + + match self.query.as_ref() { + Some(query) => { + let options = NormalizerOption { lossy: true, ..Default::default() }; + let query = query.normalize(&options); + let query = query.as_ref(); + + let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; + let field_authorizes_typos = + !self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid); + + if authorize_typos && field_authorizes_typos { + let exact_words_fst = self.search_query.index.exact_words(rtxn)?; + if exact_words_fst.map_or(false, |fst| fst.contains(query)) { + if fst.contains(query) { + self.fetch_original_facets_using_normalized( + fid, + query, + query, + &search_candidates, + &mut results, + )?; + } + } else { + let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?; + let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?; + + let is_prefix = true; + let automaton = if query.len() < one_typo as usize { + build_dfa(query, 0, is_prefix) + } else if query.len() < two_typos as usize { + build_dfa(query, 1, is_prefix) + } else { + build_dfa(query, 2, is_prefix) + }; + + let mut stream = fst.search(automaton).into_stream(); + while let Some(facet_value) = stream.next() { + let value = std::str::from_utf8(facet_value)?; + if self + .fetch_original_facets_using_normalized( + fid, + value, + query, + &search_candidates, + &mut results, + )? + .is_break() + { + break; + } + } + } + } else { + let automaton = Str::new(query).starts_with(); + let mut stream = fst.search(automaton).into_stream(); + while let Some(facet_value) = stream.next() { + let value = std::str::from_utf8(facet_value)?; + if self + .fetch_original_facets_using_normalized( + fid, + value, + query, + &search_candidates, + &mut results, + )? + .is_break() + { + break; + } + } + } + } + None => { + let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" }; + for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { + let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = + result?; + let count = search_candidates.intersection_len(&bitmap); + if count != 0 { + let value = self + .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? + .unwrap_or_else(|| left_bound.to_string()); + if results.insert(FacetValueHit { value, count }).is_break() { + break; + } + } + } + } + } + + Ok(results.into_sorted_vec()) + } + + fn fetch_original_facets_using_normalized( + &self, + fid: FieldId, + value: &str, + query: &str, + search_candidates: &RoaringBitmap, + results: &mut ValuesCollection, + ) -> Result> { + let index = self.search_query.index; + let rtxn = self.search_query.rtxn; + + let database = index.facet_id_normalized_string_strings; + let key = (fid, value); + let original_strings = match database.get(rtxn, &key)? { + Some(original_strings) => original_strings, + None => { + error!("the facet value is missing from the facet database: {key:?}"); + return Ok(ControlFlow::Continue(())); + } + }; + for original in original_strings { + let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() }; + let docids = match index.facet_id_string_docids.get(rtxn, &key)? { + Some(FacetGroupValue { bitmap, .. }) => bitmap, + None => { + error!("the facet value is missing from the facet database: {key:?}"); + return Ok(ControlFlow::Continue(())); + } + }; + let count = search_candidates.intersection_len(&docids); + if count != 0 { + let value = self + .one_original_value_of(fid, &original, docids.min().unwrap())? + .unwrap_or_else(|| query.to_string()); + if results.insert(FacetValueHit { value, count }).is_break() { + break; + } + } + } + + Ok(ControlFlow::Continue(())) + } +} + +#[derive(Debug, Clone, serde::Serialize, PartialEq)] +pub struct FacetValueHit { + /// The original facet value + pub value: String, + /// The number of documents associated to this facet + pub count: u64, +} + +impl PartialOrd for FacetValueHit { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for FacetValueHit { + fn cmp(&self, other: &Self) -> Ordering { + self.count.cmp(&other.count).then_with(|| self.value.cmp(&other.value)) + } +} + +impl Eq for FacetValueHit {} + +/// A wrapper type that collects the best facet values by +/// lexicographic or number of associated values. +enum ValuesCollection { + /// Keeps the top values according to the lexicographic order. + Lexicographic { max: usize, content: Vec }, + /// Keeps the top values according to the number of values associated to them. + /// + /// Note that it is a max heap and we need to move the smallest counts + /// at the top to be able to pop them when we reach the max_values limit. + Count { max: usize, content: BinaryHeap> }, +} + +impl ValuesCollection { + pub fn by_lexicographic(max: usize) -> Self { + ValuesCollection::Lexicographic { max, content: Vec::new() } + } + + pub fn by_count(max: usize) -> Self { + ValuesCollection::Count { max, content: BinaryHeap::new() } + } + + pub fn insert(&mut self, value: FacetValueHit) -> ControlFlow<()> { + match self { + ValuesCollection::Lexicographic { max, content } => { + if content.len() < *max { + content.push(value); + if content.len() < *max { + return ControlFlow::Continue(()); + } + } + ControlFlow::Break(()) + } + ValuesCollection::Count { max, content } => { + if content.len() == *max { + // Peeking gives us the worst value in the list as + // this is a max-heap and we reversed it. + let Some(mut peek) = content.peek_mut() else { return ControlFlow::Break(()) }; + if peek.0.count <= value.count { + // Replace the current worst value in the heap + // with the new one we received that is better. + *peek = Reverse(value); + } + } else { + content.push(Reverse(value)); + } + ControlFlow::Continue(()) + } + } + } + + /// Returns the list of facet values in descending order of, either, + /// count or lexicographic order of the value depending on the type. + pub fn into_sorted_vec(self) -> Vec { + match self { + ValuesCollection::Lexicographic { content, .. } => content.into_iter().collect(), + ValuesCollection::Count { content, .. } => { + // Convert the heap into a vec of hits by removing the Reverse wrapper. + // Hits are already in the right order as they were reversed and there + // are output in ascending order. + content.into_sorted_vec().into_iter().map(|Reverse(hit)| hit).collect() + } + } + } +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index e411bd032..dc8354486 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,25 +1,17 @@ use std::fmt; -use std::ops::ControlFlow; -use charabia::normalizer::NormalizerOption; -use charabia::Normalize; -use fst::automaton::{Automaton, Str}; -use fst::{IntoStreamer, Streamer}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use tracing::error; pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult}; -use crate::error::UserError; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::vector::DistributionShift; use crate::{ - execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, - Result, SearchContext, + execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, + SearchContext, }; // Building these factories is not free. @@ -27,9 +19,6 @@ static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); -/// The maximum number of values per facet returned by the facet search route. -const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100; - pub mod facet; mod fst_utils; pub mod hybrid; @@ -302,240 +291,6 @@ pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { } } -pub struct SearchForFacetValues<'a> { - query: Option, - facet: String, - search_query: Search<'a>, - max_values: usize, - is_hybrid: bool, -} - -impl<'a> SearchForFacetValues<'a> { - pub fn new( - facet: String, - search_query: Search<'a>, - is_hybrid: bool, - ) -> SearchForFacetValues<'a> { - SearchForFacetValues { - query: None, - facet, - search_query, - max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, - is_hybrid, - } - } - - pub fn query(&mut self, query: impl Into) -> &mut Self { - self.query = Some(query.into()); - self - } - - pub fn max_values(&mut self, max: usize) -> &mut Self { - self.max_values = max; - self - } - - fn one_original_value_of( - &self, - field_id: FieldId, - facet_str: &str, - any_docid: DocumentId, - ) -> Result> { - let index = self.search_query.index; - let rtxn = self.search_query.rtxn; - let key: (FieldId, _, &str) = (field_id, any_docid, facet_str); - Ok(index.field_id_docid_facet_strings.get(rtxn, &key)?.map(|v| v.to_owned())) - } - - pub fn execute(&self) -> Result> { - let index = self.search_query.index; - let rtxn = self.search_query.rtxn; - - let filterable_fields = index.filterable_fields(rtxn)?; - if !filterable_fields.contains(&self.facet) { - let (valid_fields, hidden_fields) = - index.remove_hidden_fields(rtxn, filterable_fields)?; - - return Err(UserError::InvalidFacetSearchFacetName { - field: self.facet.clone(), - valid_fields, - hidden_fields, - } - .into()); - } - - let fields_ids_map = index.fields_ids_map(rtxn)?; - let fid = match fields_ids_map.id(&self.facet) { - Some(fid) => fid, - // we return an empty list of results when the attribute has been - // set as filterable but no document contains this field (yet). - None => return Ok(Vec::new()), - }; - - let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? { - Some(fst) => fst, - None => return Ok(vec![]), - }; - - let search_candidates = self - .search_query - .execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?; - - match self.query.as_ref() { - Some(query) => { - let options = NormalizerOption { lossy: true, ..Default::default() }; - let query = query.normalize(&options); - let query = query.as_ref(); - - let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; - let field_authorizes_typos = - !self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid); - - if authorize_typos && field_authorizes_typos { - let exact_words_fst = self.search_query.index.exact_words(rtxn)?; - if exact_words_fst.map_or(false, |fst| fst.contains(query)) { - let mut results = vec![]; - if fst.contains(query) { - self.fetch_original_facets_using_normalized( - fid, - query, - query, - &search_candidates, - &mut results, - )?; - } - Ok(results) - } else { - let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?; - let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?; - - let is_prefix = true; - let automaton = if query.len() < one_typo as usize { - build_dfa(query, 0, is_prefix) - } else if query.len() < two_typos as usize { - build_dfa(query, 1, is_prefix) - } else { - build_dfa(query, 2, is_prefix) - }; - - let mut stream = fst.search(automaton).into_stream(); - let mut results = vec![]; - while let Some(facet_value) = stream.next() { - let value = std::str::from_utf8(facet_value)?; - if self - .fetch_original_facets_using_normalized( - fid, - value, - query, - &search_candidates, - &mut results, - )? - .is_break() - { - break; - } - } - - Ok(results) - } - } else { - let automaton = Str::new(query).starts_with(); - let mut stream = fst.search(automaton).into_stream(); - let mut results = vec![]; - while let Some(facet_value) = stream.next() { - let value = std::str::from_utf8(facet_value)?; - if self - .fetch_original_facets_using_normalized( - fid, - value, - query, - &search_candidates, - &mut results, - )? - .is_break() - { - break; - } - } - - Ok(results) - } - } - None => { - let mut results = vec![]; - let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" }; - for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? { - let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) = - result?; - let count = search_candidates.intersection_len(&bitmap); - if count != 0 { - let value = self - .one_original_value_of(fid, left_bound, bitmap.min().unwrap())? - .unwrap_or_else(|| left_bound.to_string()); - results.push(FacetValueHit { value, count }); - } - if results.len() >= self.max_values { - break; - } - } - Ok(results) - } - } - } - - fn fetch_original_facets_using_normalized( - &self, - fid: FieldId, - value: &str, - query: &str, - search_candidates: &RoaringBitmap, - results: &mut Vec, - ) -> Result> { - let index = self.search_query.index; - let rtxn = self.search_query.rtxn; - - let database = index.facet_id_normalized_string_strings; - let key = (fid, value); - let original_strings = match database.get(rtxn, &key)? { - Some(original_strings) => original_strings, - None => { - error!("the facet value is missing from the facet database: {key:?}"); - return Ok(ControlFlow::Continue(())); - } - }; - for original in original_strings { - let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() }; - let docids = match index.facet_id_string_docids.get(rtxn, &key)? { - Some(FacetGroupValue { bitmap, .. }) => bitmap, - None => { - error!("the facet value is missing from the facet database: {key:?}"); - return Ok(ControlFlow::Continue(())); - } - }; - let count = search_candidates.intersection_len(&docids); - if count != 0 { - let value = self - .one_original_value_of(fid, &original, docids.min().unwrap())? - .unwrap_or_else(|| query.to_string()); - results.push(FacetValueHit { value, count }); - } - if results.len() >= self.max_values { - return Ok(ControlFlow::Break(())); - } - } - - Ok(ControlFlow::Continue(())) - } -} - -#[derive(Debug, Clone, serde::Serialize, PartialEq)] -pub struct FacetValueHit { - /// The original facet value - pub value: String, - /// The number of documents associated to this facet - pub count: u64, -} - #[cfg(test)] mod test { #[allow(unused_imports)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ee2f58a01..63b45e3aa 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -14,12 +14,13 @@ use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; +use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; -use crate::{FieldsIdsMap, Index, OrderBy, Result}; +use crate::{FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -145,7 +146,7 @@ pub struct Settings<'a, 't, 'i> { /// Attributes on which typo tolerance is disabled. exact_attributes: Setting>, max_values_per_facet: Setting, - sort_facet_values_by: Setting>, + sort_facet_values_by: Setting, pagination_max_total_hits: Setting, proximity_precision: Setting, embedder_settings: Setting>>, @@ -340,7 +341,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.max_values_per_facet = Setting::Reset; } - pub fn set_sort_facet_values_by(&mut self, value: HashMap) { + pub fn set_sort_facet_values_by(&mut self, value: OrderByMap) { self.sort_facet_values_by = Setting::Set(value); }