From cd7c6e19ed64c37f173ae4cf5daa0b110628f866 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 18 May 2022 14:51:00 +0200 Subject: [PATCH] Reintroduce the max values by facet limit --- milli/src/search/facet/facet_distribution.rs | 48 ++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index ddbcb2b68..23b0b1df9 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; -use std::{fmt, mem}; +use std::{cmp, fmt, mem}; use heed::types::ByteSlice; use roaring::RoaringBitmap; @@ -13,6 +13,14 @@ use crate::heed_codec::facet::{ use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; use crate::{FieldId, Index, Result}; +/// The default number of values by facets that will +/// be fetched from the key-value store. +const DEFAULT_VALUES_BY_FACET: usize = 1000; + +/// The hard limit in the number of values by facets that will be fetched from +/// the key-value store. Searching for more values could slow down the engine. +const MAX_VALUES_BY_FACET: usize = 10000; + /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. const CANDIDATES_THRESHOLD: u64 = 3000; @@ -20,13 +28,20 @@ const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option>, candidates: Option, + max_values_by_facet: usize, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } impl<'a> FacetDistribution<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { - FacetDistribution { facets: None, candidates: None, rtxn, index } + FacetDistribution { + facets: None, + candidates: None, + max_values_by_facet: DEFAULT_VALUES_BY_FACET, + rtxn, + index, + } } pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { @@ -34,6 +49,11 @@ impl<'a> FacetDistribution<'a> { self } + pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self { + self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET); + self + } + pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self { self.candidates = Some(candidates); self @@ -52,6 +72,7 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); @@ -64,6 +85,10 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, _, value), ()) = result?; *distribution.entry(value.to_string()).or_insert(0) += 1; + + if distribution.len() - distribution_prelength == self.max_values_by_facet { + break; + } } } } @@ -86,6 +111,10 @@ impl<'a> FacetDistribution<'a> { .entry(normalized_value) .or_insert_with(|| (original_value, 0)); *count += 1; + + if normalized_distribution.len() == self.max_values_by_facet { + break; + } } } @@ -116,6 +145,9 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(value.to_string(), docids.len()); } + if distribution.len() == self.max_values_by_facet { + break; + } } Ok(()) @@ -136,6 +168,9 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(original.to_string(), docids.len()); } + if distribution.len() == self.max_values_by_facet { + break; + } } Ok(()) @@ -155,6 +190,9 @@ impl<'a> FacetDistribution<'a> { for result in range { let ((_, _, value, _), docids) = result?; distribution.insert(value.to_string(), docids.len()); + if distribution.len() == self.max_values_by_facet { + break; + } } let iter = self @@ -168,6 +206,9 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, normalized_value), (original_value, docids)) = result?; normalized_distribution.insert(normalized_value, (original_value, docids.len())); + if normalized_distribution.len() == self.max_values_by_facet { + break; + } } let iter = normalized_distribution @@ -253,11 +294,12 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { facets, candidates, rtxn: _, index: _ } = self; + let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; f.debug_struct("FacetDistribution") .field("facets", facets) .field("candidates", candidates) + .field("max_values_by_facet", max_values_by_facet) .finish() } }