From d71bc1e69fc68c8cf4b5a3abb503fda7c8afdf1b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Jul 2022 16:52:45 +0200 Subject: [PATCH] Compute an exact count when using distinct --- milli/src/search/criteria/initial.rs | 29 ++++++++++++++++----- milli/src/search/criteria/mod.rs | 15 +++++++---- milli/src/search/criteria/typo.rs | 24 +++++++++++++---- milli/src/search/distinct/facet_distinct.rs | 1 + milli/src/search/mod.rs | 28 ++++++++++++++------ 5 files changed, 72 insertions(+), 25 deletions(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 2aabe9b13..bae77fda0 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -3,32 +3,35 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, Context}; use crate::search::query_tree::Operation; +use crate::search::Distinct; use crate::Result; -pub struct Initial<'t> { +pub struct Initial<'t, D> { ctx: &'t dyn Context<'t>, answer: Option, exhaustive_number_hits: bool, + distinct: Option, } -impl<'t> Initial<'t> { +impl<'t, D> Initial<'t, D> { pub fn new( ctx: &'t dyn Context<'t>, query_tree: Option, filtered_candidates: Option, exhaustive_number_hits: bool, - ) -> Initial { + distinct: Option, + ) -> Initial { let answer = CriterionResult { query_tree, candidates: None, filtered_candidates, bucket_candidates: None, }; - Initial { ctx, answer: Some(answer), exhaustive_number_hits } + Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct } } } -impl Criterion for Initial<'_> { +impl Criterion for Initial<'_, D> { #[logging_timer::time("Initial::{}")] fn next(&mut self, params: &mut CriterionParameters) -> Result> { self.answer @@ -41,8 +44,20 @@ impl Criterion for Initial<'_> { &mut params.wdcache, )?; - answer.candidates = Some(candidates.clone()); - answer.bucket_candidates = Some(candidates); + let bucket_candidates = match &mut self.distinct { + // may be really time consuming + Some(distinct) => { + let mut bucket_candidates = RoaringBitmap::new(); + for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { + bucket_candidates.insert(c?); + } + bucket_candidates + } + None => candidates.clone(), + }; + + answer.candidates = Some(candidates); + answer.bucket_candidates = Some(bucket_candidates); } Ok(answer) }) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 6c4fa51d3..866eaefde 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -13,7 +13,7 @@ use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::search::criteria::geo::Geo; -use crate::search::{word_derivations, WordDerivationsCache}; +use crate::search::{word_derivations, Distinct, WordDerivationsCache}; use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; mod asc_desc; @@ -226,21 +226,26 @@ impl<'t> CriteriaBuilder<'t> { Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) } - pub fn build( + pub fn build( &'t self, query_tree: Option, primitive_query: Option>, filtered_candidates: Option, sort_criteria: Option>, exhaustive_number_hits: bool, + distinct: Option, ) -> Result> { use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); - let mut criterion = - Box::new(Initial::new(self, query_tree, filtered_candidates, exhaustive_number_hits)) - as Box; + let mut criterion = Box::new(Initial::new( + self, + query_tree, + filtered_candidates, + exhaustive_number_hits, + distinct, + )) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { Name::Words => Box::new(Words::new(self, criterion)), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index f1537ed48..605089fae 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -348,6 +348,7 @@ mod test { use super::super::initial::Initial; use super::super::test::TestContext; use super::*; + use crate::search::NoopDistinct; fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String { let mut result = String::new(); @@ -368,7 +369,8 @@ mod test { excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, query_tree, facet_candidates, false); + let parent = + Initial::::new(&context, query_tree, facet_candidates, false, None); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -405,7 +407,8 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, Some(query_tree), facet_candidates, false); + let parent = + Initial::::new(&context, Some(query_tree), facet_candidates, false, None); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -439,7 +442,13 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, query_tree, Some(facet_candidates.clone()), false); + let parent = Initial::::new( + &context, + query_tree, + Some(facet_candidates.clone()), + false, + None, + ); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -476,8 +485,13 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = - Initial::new(&context, Some(query_tree), Some(facet_candidates.clone()), false); + let parent = Initial::::new( + &context, + Some(query_tree), + Some(facet_candidates.clone()), + false, + None, + ); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 4436d4cda..33e7b4975 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -21,6 +21,7 @@ const DOCID_SIZE: usize = size_of::(); /// care to keep the document we are currently on, and remove it from the excluded list. The next /// iterations will never contain any occurence of a document with the same distinct value as a /// document from previous iterations. +#[derive(Clone)] pub struct FacetDistinct<'a> { distinct: FieldId, index: &'a Index, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 6f1e1b34c..270beb52a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -191,21 +191,33 @@ impl<'a> Search<'a> { } let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build( - query_tree, - primitive_query, - filtered_candidates, - self.sort_criteria.clone(), - self.exhaustive_number_hits, - )?; match self.index.distinct_field(self.rtxn)? { - None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria), + None => { + let criteria = criteria_builder.build::( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + self.exhaustive_number_hits, + None, + )?; + self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria) + } Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; match field_ids_map.id(name) { Some(fid) => { let distinct = FacetDistinct::new(fid, self.index, self.rtxn); + + let criteria = criteria_builder.build( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + self.exhaustive_number_hits, + Some(distinct.clone()), + )?; self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) } None => Ok(SearchResult::default()),