Compute an exact count when using distinct

This commit is contained in:
ManyTheFish 2022-07-18 16:52:45 +02:00
parent a396806343
commit d71bc1e69f
5 changed files with 72 additions and 25 deletions

View File

@ -3,32 +3,35 @@ use roaring::RoaringBitmap;
use super::{Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::{resolve_query_tree, Context};
use crate::search::query_tree::Operation;
use crate::search::Distinct;
use crate::Result;
/// State held by the `Initial` criterion.
///
/// Every field is filled in directly by `Initial::new`.
// NOTE(review): the two header lines below appear to be the pre-change and post-change
// versions of the same declaration as rendered by the diff view (the second one adds the
// `D` type parameter used by the `distinct` field) — confirm against the real file.
pub struct Initial<'t> {
pub struct Initial<'t, D> {
/// Search context, stored as received by `new`.
ctx: &'t dyn Context<'t>,
/// Result prepared by `new`: it carries the query tree and the filtered candidates, while
/// its `candidates` and `bucket_candidates` start out as `None`.
answer: Option<CriterionResult>,
/// Flag stored as received by `new`; how it is consumed is not visible in this excerpt.
exhaustive_number_hits: bool,
/// Optional distinct rule. When it is `Some`, the `Criterion::next` implementation runs it
/// over the resolved candidates to build `bucket_candidates`; when it is `None`, the
/// candidates are used as the bucket unchanged.
distinct: Option<D>,
}
// NOTE(review): this span interleaves the pre-change and post-change lines of the diff
// (two `impl` headers, two return types, two struct literals) — presumably only the
// variant mentioning `D` / `distinct` survives in the real file; confirm.
impl<'t> Initial<'t> {
impl<'t, D> Initial<'t, D> {
/// Builds an `Initial` from the search inputs.
///
/// * `ctx` - search context, stored as-is.
/// * `query_tree` - optional query tree, moved into the prepared answer.
/// * `filtered_candidates` - optional candidate set, moved into the prepared answer.
/// * `exhaustive_number_hits` - flag stored as-is on the returned value.
/// * `distinct` - optional distinct rule, stored as-is on the returned value.
///
/// The prepared answer is wrapped in `Some`, with both `candidates` and
/// `bucket_candidates` left as `None`.
pub fn new(
ctx: &'t dyn Context<'t>,
query_tree: Option<Operation>,
filtered_candidates: Option<RoaringBitmap>,
exhaustive_number_hits: bool,
) -> Initial {
distinct: Option<D>,
) -> Initial<D> {
// Nothing is resolved here: the candidate sets stay empty in the prepared answer.
let answer = CriterionResult {
query_tree,
candidates: None,
filtered_candidates,
bucket_candidates: None,
};
Initial { ctx, answer: Some(answer), exhaustive_number_hits }
Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct }
}
}
impl Criterion for Initial<'_> {
impl<D: Distinct> Criterion for Initial<'_, D> {
#[logging_timer::time("Initial::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
self.answer
@ -41,8 +44,20 @@ impl Criterion for Initial<'_> {
&mut params.wdcache,
)?;
answer.candidates = Some(candidates.clone());
answer.bucket_candidates = Some(candidates);
let bucket_candidates = match &mut self.distinct {
// Walking the whole distinct iterator over every candidate may be really time consuming.
Some(distinct) => {
let mut bucket_candidates = RoaringBitmap::new();
for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) {
bucket_candidates.insert(c?);
}
bucket_candidates
}
None => candidates.clone(),
};
answer.candidates = Some(candidates);
answer.bucket_candidates = Some(bucket_candidates);
}
Ok(answer)
})

View File

@ -13,7 +13,7 @@ use self::typo::Typo;
use self::words::Words;
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
use crate::search::criteria::geo::Geo;
use crate::search::{word_derivations, WordDerivationsCache};
use crate::search::{word_derivations, Distinct, WordDerivationsCache};
use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result};
mod asc_desc;
@ -226,21 +226,26 @@ impl<'t> CriteriaBuilder<'t> {
Ok(Self { rtxn, index, words_fst, words_prefixes_fst })
}
pub fn build(
pub fn build<D: 't + Distinct>(
&'t self,
query_tree: Option<Operation>,
primitive_query: Option<Vec<PrimitiveQueryPart>>,
filtered_candidates: Option<RoaringBitmap>,
sort_criteria: Option<Vec<AscDescName>>,
exhaustive_number_hits: bool,
distinct: Option<D>,
) -> Result<Final<'t>> {
use crate::criterion::Criterion as Name;
let primitive_query = primitive_query.unwrap_or_default();
let mut criterion =
Box::new(Initial::new(self, query_tree, filtered_candidates, exhaustive_number_hits))
as Box<dyn Criterion>;
let mut criterion = Box::new(Initial::new(
self,
query_tree,
filtered_candidates,
exhaustive_number_hits,
distinct,
)) as Box<dyn Criterion>;
for name in self.index.criteria(&self.rtxn)? {
criterion = match name {
Name::Words => Box::new(Words::new(self, criterion)),

View File

@ -348,6 +348,7 @@ mod test {
use super::super::initial::Initial;
use super::super::test::TestContext;
use super::*;
use crate::search::NoopDistinct;
fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String {
let mut result = String::new();
@ -368,7 +369,8 @@ mod test {
excluded_candidates: &RoaringBitmap::new(),
};
let parent = Initial::new(&context, query_tree, facet_candidates, false);
let parent =
Initial::<NoopDistinct>::new(&context, query_tree, facet_candidates, false, None);
let criteria = Typo::new(&context, Box::new(parent));
let result = display_criteria(criteria, criterion_parameters);
@ -405,7 +407,8 @@ mod test {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent = Initial::new(&context, Some(query_tree), facet_candidates, false);
let parent =
Initial::<NoopDistinct>::new(&context, Some(query_tree), facet_candidates, false, None);
let criteria = Typo::new(&context, Box::new(parent));
let result = display_criteria(criteria, criterion_parameters);
@ -439,7 +442,13 @@ mod test {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent = Initial::new(&context, query_tree, Some(facet_candidates.clone()), false);
let parent = Initial::<NoopDistinct>::new(
&context,
query_tree,
Some(facet_candidates.clone()),
false,
None,
);
let criteria = Typo::new(&context, Box::new(parent));
let result = display_criteria(criteria, criterion_parameters);
@ -476,8 +485,13 @@ mod test {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent =
Initial::new(&context, Some(query_tree), Some(facet_candidates.clone()), false);
let parent = Initial::<NoopDistinct>::new(
&context,
Some(query_tree),
Some(facet_candidates.clone()),
false,
None,
);
let criteria = Typo::new(&context, Box::new(parent));
let result = display_criteria(criteria, criterion_parameters);

View File

@ -21,6 +21,7 @@ const DOCID_SIZE: usize = size_of::<DocumentId>();
/// care to keep the document we are currently on, and remove it from the excluded list. The next
/// iterations will never contain any occurrence of a document with the same distinct value as a
/// document from previous iterations.
#[derive(Clone)]
pub struct FacetDistinct<'a> {
distinct: FieldId,
index: &'a Index,

View File

@ -191,21 +191,33 @@ impl<'a> Search<'a> {
}
let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
match self.index.distinct_field(self.rtxn)? {
None => {
let criteria = criteria_builder.build::<NoopDistinct>(
query_tree,
primitive_query,
filtered_candidates,
self.sort_criteria.clone(),
self.exhaustive_number_hits,
None,
)?;
self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria)
}
Some(name) => {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
match field_ids_map.id(name) {
Some(fid) => {
let distinct = FacetDistinct::new(fid, self.index, self.rtxn);
let criteria = criteria_builder.build(
query_tree,
primitive_query,
filtered_candidates,
self.sort_criteria.clone(),
self.exhaustive_number_hits,
Some(distinct.clone()),
)?;
match self.index.distinct_field(self.rtxn)? {
None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria),
Some(name) => {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
match field_ids_map.id(name) {
Some(fid) => {
let distinct = FacetDistinct::new(fid, self.index, self.rtxn);
self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria)
}
None => Ok(SearchResult::default()),