190: Make bucket candidates optionals r=Kerollmops a=LegendreM

Before the bucket candidates were the result of the facet filters or result of the query tree.
They will now be only the result of the query tree, making the number of candidates more consistent between the same request with or without facet filters.

Fix some clippy warnings.

Fix #186 

Co-authored-by: many <maxime@meilisearch.com>
This commit is contained in:
bors[bot] 2021-05-24 11:19:32 +00:00 committed by GitHub
commit 49bee2ebc5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 363 additions and 393 deletions

View File

@ -93,33 +93,25 @@ impl<'t> Criterion for AscDesc<'t> {
match self.candidates.next().transpose()? {
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
let candidates_is_some = candidates.is_some();
Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => {
self.query_tree = query_tree;
let candidates = match (&self.query_tree, candidates) {
(_, Some(mut candidates)) => {
candidates.intersect_with(&self.faceted_candidates);
candidates
},
let mut candidates = match (&self.query_tree, candidates) {
(_, Some(candidates)) => candidates & &self.faceted_candidates,
(Some(qt), None) => {
let context = CriteriaBuilder::new(&self.rtxn, &self.index)?;
let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), params.wdcache)?;
candidates -= params.excluded_candidates;
candidates.intersect_with(&self.faceted_candidates);
candidates
let candidates = resolve_query_tree(&context, qt, params.wdcache)?;
candidates & &self.faceted_candidates
},
(None, None) => take(&mut self.faceted_candidates),
};
// If our parent returns candidates it means that the bucket
// candidates were already computed before and we can use them.
//
// If not, we must use the just computed candidates as our bucket
// candidates.
if candidates_is_some {
self.bucket_candidates.union_with(&bucket_candidates);
} else {
self.bucket_candidates.union_with(&candidates);
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
match bucket_candidates {
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
None => self.bucket_candidates |= &candidates,
}
if candidates.is_empty() {
@ -143,7 +135,8 @@ impl<'t> Criterion for AscDesc<'t> {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: Some(candidates),
bucket_candidates: take(&mut self.bucket_candidates),
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
}
@ -222,7 +215,7 @@ fn iterative_facet_ordered_iter<'t>(
docids_values.push((docid, OrderedFloat(value)));
}
}
docids_values.sort_unstable_by_key(|(_, v)| v.clone());
docids_values.sort_unstable_by_key(|(_, v)| *v);
let iter = docids_values.into_iter();
let iter = if ascending {
Box::new(iter) as Box<dyn Iterator<Item = _>>
@ -233,7 +226,7 @@ fn iterative_facet_ordered_iter<'t>(
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore
// required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499
let vec: Vec<_> = iter.group_by(|(_, v)| v.clone())
let vec: Vec<_> = iter.group_by(|(_, v)| *v)
.into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect();

View File

@ -24,13 +24,13 @@ const LEVEL_EXPONENTIATION_BASE: u32 = 4;
/// the system to choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 1000;
type FlattenedQueryTree = Vec<Vec<Vec<Query>>>;
pub struct Attribute<'t> {
ctx: &'t dyn Context<'t>,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>,
bucket_candidates: RoaringBitmap,
parent: Box<dyn Criterion + 't>,
flattened_query_tree: Option<Vec<Vec<Vec<Query>>>>,
current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
}
@ -38,11 +38,9 @@ impl<'t> Attribute<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Attribute {
ctx,
query_tree: None,
candidates: None,
state: None,
bucket_candidates: RoaringBitmap::new(),
parent,
flattened_query_tree: None,
current_buckets: None,
}
}
@ -52,29 +50,26 @@ impl<'t> Criterion for Attribute<'t> {
#[logging_timer::time("Attribute::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop.
if let Some(candidates) = self.candidates.as_mut() {
*candidates -= params.excluded_candidates;
if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
*allowed_candidates -= params.excluded_candidates;
}
loop {
match (&self.query_tree, &mut self.candidates) {
(_, Some(candidates)) if candidates.is_empty() => {
match self.state.take() {
Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
query_tree: Some(query_tree),
candidates: Some(RoaringBitmap::new()),
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
(Some(qt), Some(candidates)) => {
let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| {
flatten_query_tree(&qt)
});
let found_candidates = if candidates.len() < CANDIDATES_THRESHOLD {
Some((query_tree, flattened_query_tree, mut allowed_candidates)) => {
let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD {
let current_buckets = match self.current_buckets.as_mut() {
Some(current_buckets) => current_buckets,
None => {
let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?;
let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?;
self.current_buckets.get_or_insert(new_buckets.into_iter())
},
};
@ -83,62 +78,68 @@ impl<'t> Criterion for Attribute<'t> {
Some((_score, candidates)) => candidates,
None => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
query_tree: Some(query_tree),
candidates: Some(RoaringBitmap::new()),
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
}
} else {
match set_compute_candidates(self.ctx, flattened_query_tree, candidates, params.wdcache)? {
match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? {
Some(candidates) => candidates,
None => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
query_tree: Some(query_tree),
candidates: Some(RoaringBitmap::new()),
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
}
};
candidates.difference_with(&found_candidates);
allowed_candidates -= &found_candidates;
self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
query_tree: Some(query_tree),
candidates: Some(found_candidates),
bucket_candidates: take(&mut self.bucket_candidates),
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
(Some(qt), None) => {
let mut query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), params.wdcache)?;
query_tree_candidates -= params.excluded_candidates;
self.bucket_candidates |= &query_tree_candidates;
self.candidates = Some(query_tree_candidates);
},
(None, Some(_)) => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(None, None) => {
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
};
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
let flattened_query_tree = flatten_query_tree(&query_tree);
match bucket_candidates {
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
None => self.bucket_candidates |= &candidates,
}
self.state = Some((query_tree, flattened_query_tree, candidates));
self.current_buckets = None;
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_tree = query_tree;
self.candidates = candidates;
self.bucket_candidates |= bucket_candidates;
self.flattened_query_tree = None;
self.current_buckets = None;
},
None => return Ok(None),
}
},
@ -164,7 +165,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> {
fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> {
match ctx.word_position_last_level(&word, in_prefix_cache)? {
Some(level) => {
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32);
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32);
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?;
Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None }))
},
@ -173,8 +174,8 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> {
}
fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> {
let level = level.min(&self.level).clone();
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32);
let level = *level.min(&self.level);
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level) as u32);
let word = self.word.clone();
let in_prefix_cache = self.in_prefix_cache;
let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?;
@ -223,7 +224,7 @@ struct QueryLevelIterator<'t, 'q> {
}
impl<'t, 'q> QueryLevelIterator<'t, 'q> {
fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec<Query>, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<Self>> {
fn new(ctx: &'t dyn Context<'t>, queries: &'q [Query], wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<Self>> {
let mut inner = Vec::with_capacity(queries.len());
for query in queries {
match &query.kind {
@ -253,7 +254,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
}
}
let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone());
let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level);
match highest {
Some(level) => Ok(Some(Self {
parent: None,
@ -296,7 +297,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
let u8_level = Into::<u8>::into(level);
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
for wli in self.inner.iter_mut() {
let wli_u8_level = Into::<u8>::into(wli.level.clone());
let wli_u8_level = Into::<u8>::into(wli.level);
let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32);
for _ in 0..accumulated_count {
if let Some((next_left, _, next_docids)) = wli.next()? {
@ -373,8 +374,8 @@ fn interval_to_skip(
already_skiped: usize,
allowed_candidates: &RoaringBitmap,
) -> usize {
parent_accumulator.into_iter()
.zip(current_accumulator.into_iter())
parent_accumulator.iter()
.zip(current_accumulator.iter())
.skip(already_skiped)
.take_while(|(parent, current)| {
let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
@ -419,7 +420,7 @@ impl<'t, 'q> Branch<'t, 'q> {
/// update inner interval in order to be ranked by the binary_heap without computing it,
/// the next() method should be called when the real interval is needed.
fn lazy_next(&mut self) {
let u8_level = Into::<u8>::into(self.tree_level.clone());
let u8_level = Into::<u8>::into(self.tree_level);
let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32);
let (left, right, _) = self.last_result;
@ -467,7 +468,7 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {}
fn initialize_query_level_iterators<'t, 'q>(
ctx: &'t dyn Context<'t>,
branches: &'q Vec<Vec<Vec<Query>>>,
branches: &'q FlattenedQueryTree,
allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<BinaryHeap<Branch<'t, 'q>>> {
@ -517,7 +518,7 @@ fn initialize_query_level_iterators<'t, 'q>(
fn set_compute_candidates<'t>(
ctx: &'t dyn Context<'t>,
branches: &Vec<Vec<Vec<Query>>>,
branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Option<RoaringBitmap>>
@ -570,11 +571,11 @@ fn set_compute_candidates<'t>(
fn linear_compute_candidates(
ctx: &dyn Context,
branches: &Vec<Vec<Vec<Query>>>,
branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap,
) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>>
{
fn compute_candidate_rank(branches: &Vec<Vec<Vec<Query>>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
let mut min_rank = u64::max_value();
for branch in branches {
@ -659,10 +660,10 @@ fn linear_compute_candidates(
}
// TODO can we keep refs of Query
fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> {
fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
use crate::search::criteria::Operation::{And, Or, Consecutive};
fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Vec<Query>>> {
fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
match tail.split_first() {
Some((thead, tail)) => {
let tail = and_recurse(thead, tail);
@ -680,7 +681,7 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> {
}
}
fn recurse(op: &Operation) -> Vec<Vec<Vec<Query>>> {
fn recurse(op: &Operation) -> FlattenedQueryTree {
match op {
And(ops) | Consecutive(ops) => {
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
@ -688,7 +689,7 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> {
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
} else {
ops.into_iter().map(recurse).flatten().collect()
ops.iter().map(recurse).flatten().collect()
},
Operation::Query(query) => vec![vec![vec![query.clone()]]],
}

View File

@ -1,4 +1,4 @@
use std::{collections::HashMap, mem};
use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
@ -60,30 +60,41 @@ impl<'t> Criterion for Exactness<'t> {
self.query_tree = None;
},
Some(state) => {
let (candidates, state) = resolve_state(self.ctx, mem::take(state), &self.query)?;
let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?;
self.state = state;
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: Some(candidates),
bucket_candidates: mem::take(&mut self.bucket_candidates),
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => {
let candidates = match candidates {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, &mut HashMap::new(), params.wdcache)?,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
};
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
match bucket_candidates {
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
None => self.bucket_candidates |= &candidates,
}
self.state = Some(State::new(candidates));
self.query_tree = Some(query_tree);
self.bucket_candidates |= bucket_candidates;
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree,
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},

View File

@ -1,5 +1,3 @@
use std::collections::HashMap;
use log::debug;
use roaring::RoaringBitmap;
@ -32,34 +30,33 @@ impl<'t> Final<'t> {
#[logging_timer::time("Final::{}")]
pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> anyhow::Result<Option<FinalResult>> {
loop {
debug!("Final iteration");
let mut criterion_parameters = CriterionParameters {
wdcache: &mut self.wdcache,
// returned_candidates is merged with excluded_candidates to avoid duplicas
excluded_candidates: &(&self.returned_candidates | excluded_candidates),
};
debug!("Final iteration");
let excluded_candidates = &self.returned_candidates | excluded_candidates;
let mut criterion_parameters = CriterionParameters {
wdcache: &mut self.wdcache,
// returned_candidates is merged with excluded_candidates to avoid duplicas
excluded_candidates: &excluded_candidates,
};
match self.parent.next(&mut criterion_parameters)? {
Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => {
let candidates = match candidates {
Some(candidates) => candidates,
None => {
let candidates = match query_tree.as_ref() {
Some(qt) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?,
None => self.ctx.documents_ids()?,
};
bucket_candidates |= &candidates;
candidates
}
};
match self.parent.next(&mut criterion_parameters)? {
Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => {
let mut candidates = match (candidates, query_tree.as_ref()) {
(Some(candidates), _) => candidates,
(None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates,
(None, None) => self.ctx.documents_ids()? - excluded_candidates,
};
self.returned_candidates |= &candidates;
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates }));
},
None => return Ok(None),
}
let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone());
self.returned_candidates |= &candidates;
Ok(Some(FinalResult { query_tree, candidates, bucket_candidates }))
},
None => Ok(None),
}
}
}

View File

@ -9,11 +9,12 @@ pub struct Initial {
}
impl Initial {
pub fn new(query_tree: Option<Operation>, mut candidates: Option<RoaringBitmap>) -> Initial {
pub fn new(query_tree: Option<Operation>, filtered_candidates: Option<RoaringBitmap>) -> Initial {
let answer = CriterionResult {
query_tree,
candidates: candidates.clone(),
bucket_candidates: candidates.take().unwrap_or_default(),
candidates: None,
filtered_candidates,
bucket_candidates: None,
};
Initial { answer: Some(answer) }
}

View File

@ -38,8 +38,10 @@ pub struct CriterionResult {
/// The candidates that this criterion is allowed to return subsets of,
/// if None, it is up to the child to compute the candidates itself.
candidates: Option<RoaringBitmap>,
/// The candidates, coming from facet filters, that this criterion is allowed to return subsets of.
filtered_candidates: Option<RoaringBitmap>,
/// Candidates that comes from the current bucket of the initial criterion.
bucket_candidates: RoaringBitmap,
bucket_candidates: Option<RoaringBitmap>,
}
#[derive(Debug, PartialEq)]
@ -57,15 +59,6 @@ enum Candidates {
Forbidden(RoaringBitmap)
}
impl Candidates {
fn into_inner(self) -> RoaringBitmap {
match self {
Self::Allowed(inner) => inner,
Self::Forbidden(inner) => inner,
}
}
}
impl Default for Candidates {
fn default() -> Self {
Self::Forbidden(RoaringBitmap::new())
@ -236,14 +229,12 @@ impl<'t> CriteriaBuilder<'t> {
pub fn resolve_query_tree<'t>(
ctx: &'t dyn Context,
query_tree: &Operation,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap>
{
fn resolve_operation<'t>(
ctx: &'t dyn Context,
query_tree: &Operation,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap>
{
@ -252,7 +243,7 @@ pub fn resolve_query_tree<'t>(
match query_tree {
And(ops) => {
let mut ops = ops.iter().map(|op| {
resolve_operation(ctx, op, cache, wdcache)
resolve_operation(ctx, op, wdcache)
}).collect::<anyhow::Result<Vec<_>>>()?;
ops.sort_unstable_by_key(|cds| cds.len());
@ -296,7 +287,7 @@ pub fn resolve_query_tree<'t>(
Or(_, ops) => {
let mut candidates = RoaringBitmap::new();
for op in ops {
let docids = resolve_operation(ctx, op, cache, wdcache)?;
let docids = resolve_operation(ctx, op, wdcache)?;
candidates.union_with(&docids);
}
Ok(candidates)
@ -305,7 +296,7 @@ pub fn resolve_query_tree<'t>(
}
}
resolve_operation(ctx, query_tree, cache, wdcache)
resolve_operation(ctx, query_tree, wdcache)
}

View File

@ -30,8 +30,8 @@ const PROXIMITY_THRESHOLD: u8 = 0;
pub struct Proximity<'t> {
ctx: &'t dyn Context<'t>,
/// ((max_proximity, query_tree), allowed_candidates)
state: Option<(Option<(usize, Operation)>, RoaringBitmap)>,
/// (max_proximity, query_tree, allowed_candidates)
state: Option<(u8, Operation, RoaringBitmap)>,
proximity: u8,
bucket_candidates: RoaringBitmap,
parent: Box<dyn Criterion + 't>,
@ -57,114 +57,96 @@ impl<'t> Criterion for Proximity<'t> {
#[logging_timer::time("Proximity::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop.
if let Some((_, candidates)) = self.state.as_mut() {
*candidates -= params.excluded_candidates;
if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
*allowed_candidates -= params.excluded_candidates;
}
loop {
debug!("Proximity at iteration {} (max prox {:?}) ({:?})",
self.proximity,
self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)),
self.state.as_ref().map(|(_, cd)| cd),
self.state.as_ref().map(|(mp, _, _)| mp),
self.state.as_ref().map(|(_, _, cd)| cd),
);
match &mut self.state {
Some((_, candidates)) if candidates.is_empty() => {
Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => {
self.state = None; // reset state
},
Some((Some((max_prox, query_tree)), candidates)) => {
if self.proximity as usize > *max_prox {
self.state = None; // reset state
} else {
let mut new_candidates = if candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD {
if let Some(cache) = self.plane_sweep_cache.as_mut() {
match cache.next() {
Some((p, candidates)) => {
self.proximity = p;
candidates
},
None => {
self.state = None; // reset state
continue
},
}
} else {
let cache = resolve_plane_sweep_candidates(
self.ctx,
query_tree,
candidates,
params.wdcache,
)?;
self.plane_sweep_cache = Some(cache.into_iter());
continue
Some((_, query_tree, allowed_candidates)) => {
let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD {
if let Some(cache) = self.plane_sweep_cache.as_mut() {
match cache.next() {
Some((p, candidates)) => {
self.proximity = p;
candidates
},
None => {
self.state = None; // reset state
continue
},
}
} else { // use set theory based algorithm
resolve_candidates(
self.ctx,
&query_tree,
self.proximity,
&mut self.candidates_cache,
params.wdcache,
)?
};
} else {
let cache = resolve_plane_sweep_candidates(
self.ctx,
query_tree,
allowed_candidates,
params.wdcache,
)?;
self.plane_sweep_cache = Some(cache.into_iter());
new_candidates.intersect_with(&candidates);
candidates.difference_with(&new_candidates);
self.proximity += 1;
continue
}
} else { // use set theory based algorithm
resolve_candidates(
self.ctx,
&query_tree,
self.proximity,
&mut self.candidates_cache,
params.wdcache,
)?
};
new_candidates &= &*allowed_candidates;
*allowed_candidates -= &new_candidates;
self.proximity += 1;
return Ok(Some(CriterionResult {
query_tree: Some(query_tree.clone()),
candidates: Some(new_candidates),
bucket_candidates: take(&mut self.bucket_candidates),
}));
}
},
Some((None, candidates)) => {
let candidates = take(candidates);
self.state = None; // reset state
return Ok(Some(CriterionResult {
query_tree: None,
candidates: Some(candidates.clone()),
bucket_candidates: candidates,
query_tree: Some(query_tree.clone()),
candidates: Some(new_candidates),
filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)),
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
let candidates_is_some = candidates.is_some();
let candidates = match (&query_tree, candidates) {
(_, Some(candidates)) => candidates,
(Some(qt), None) => {
let candidates = resolve_query_tree(self.ctx, qt, &mut HashMap::new(), params.wdcache)?;
candidates - params.excluded_candidates
},
(None, None) => RoaringBitmap::new(),
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
};
// If our parent returns candidates it means that the bucket
// candidates were already computed before and we can use them.
//
// If not, we must use the just computed candidates as our bucket
// candidates.
if candidates_is_some {
self.bucket_candidates.union_with(&bucket_candidates);
} else {
self.bucket_candidates.union_with(&candidates);
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
let query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
self.state = Some((query_tree, candidates));
match bucket_candidates {
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
None => self.bucket_candidates |= &candidates,
}
let maximum_proximity = maximum_proximity(&query_tree);
self.state = Some((maximum_proximity as u8, query_tree, candidates));
self.proximity = 0;
self.plane_sweep_cache = None;
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
None => return Ok(None),
}
},

View File

@ -13,15 +13,19 @@ use super::{
CriterionParameters,
CriterionResult,
query_docids,
query_pair_proximity_docids
query_pair_proximity_docids,
resolve_query_tree,
};
/// Maximum number of typo for a word of any length.
const MAX_TYPOS_PER_WORD: u8 = 2;
pub struct Typo<'t> {
ctx: &'t dyn Context<'t>,
query_tree: Option<(usize, Operation)>,
number_typos: u8,
candidates: Candidates,
bucket_candidates: RoaringBitmap,
/// (max_typos, query_tree, candidates)
state: Option<(u8, Operation, Candidates)>,
typos: u8,
bucket_candidates: Option<RoaringBitmap>,
parent: Box<dyn Criterion + 't>,
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
}
@ -30,10 +34,9 @@ impl<'t> Typo<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Typo {
ctx,
query_tree: None,
number_typos: 0,
candidates: Candidates::default(),
bucket_candidates: RoaringBitmap::new(),
state: None,
typos: 0,
bucket_candidates: None,
parent,
candidates_cache: HashMap::new(),
}
@ -45,114 +48,107 @@ impl<'t> Criterion for Typo<'t> {
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
use Candidates::{Allowed, Forbidden};
// remove excluded candidates when next is called, instead of doing it in the loop.
match &mut self.candidates {
Allowed(candidates) => *candidates -= params.excluded_candidates,
Forbidden(candidates) => *candidates |= params.excluded_candidates,
match self.state.as_mut() {
Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates,
Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates,
None => (),
}
loop {
debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates);
debug!("Typo at iteration {} (max typos {:?}) ({:?})",
self.typos,
self.state.as_ref().map(|(mt, _, _)| mt),
self.state.as_ref().map(|(_, _, cd)| cd),
);
match self.state.as_mut() {
Some((max_typos, _, _)) if self.typos > *max_typos => {
self.state = None; // reset state
},
Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => {
self.state = None; // reset state
},
Some((_, query_tree, candidates_authorization)) => {
let fst = self.ctx.words_fst();
let new_query_tree = match self.typos {
typos if typos < MAX_TYPOS_PER_WORD => {
alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?
},
MAX_TYPOS_PER_WORD => {
// When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible,
// we keep the altered query tree
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?;
// we compute the allowed candidates
let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
// we assign the allowed candidates to the candidates authorization.
*candidates_authorization = match take(candidates_authorization) {
Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates),
Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates),
};
query_tree.clone()
},
_otherwise => query_tree.clone(),
};
let mut candidates = resolve_candidates(
self.ctx,
&new_query_tree,
self.typos,
&mut self.candidates_cache,
params.wdcache,
)?;
match candidates_authorization {
Allowed(allowed_candidates) => {
candidates &= &*allowed_candidates;
*allowed_candidates -= &candidates;
},
Forbidden(forbidden_candidates) => {
candidates -= &*forbidden_candidates;
*forbidden_candidates |= &candidates;
},
}
let bucket_candidates = match self.bucket_candidates.as_mut() {
Some(bucket_candidates) => take(bucket_candidates),
None => candidates.clone(),
};
self.typos += 1;
match (&mut self.query_tree, &mut self.candidates) {
(_, Allowed(candidates)) if candidates.is_empty() => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take().map(|(_, qt)| qt),
candidates: Some(take(&mut self.candidates).into_inner()),
bucket_candidates: take(&mut self.bucket_candidates),
query_tree: Some(new_query_tree),
candidates: Some(candidates),
filtered_candidates: None,
bucket_candidates: Some(bucket_candidates),
}));
},
(Some((max_typos, query_tree)), Allowed(candidates)) => {
if self.number_typos as usize > *max_typos {
self.query_tree = None;
self.candidates = Candidates::default();
} else {
let fst = self.ctx.words_fst();
let new_query_tree = if self.number_typos < 2 {
alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?
} else if self.number_typos == 2 {
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?;
query_tree.clone()
} else {
query_tree.clone()
};
let mut new_candidates = resolve_candidates(
self.ctx,
&new_query_tree,
self.number_typos,
&mut self.candidates_cache,
params.wdcache,
)?;
new_candidates.intersect_with(&candidates);
candidates.difference_with(&new_candidates);
self.number_typos += 1;
return Ok(Some(CriterionResult {
query_tree: Some(new_query_tree),
candidates: Some(new_candidates),
bucket_candidates: take(&mut self.bucket_candidates),
}));
}
},
(Some((max_typos, query_tree)), Forbidden(candidates)) => {
if self.number_typos as usize > *max_typos {
self.query_tree = None;
self.candidates = Candidates::default();
} else {
let fst = self.ctx.words_fst();
let new_query_tree = if self.number_typos < 2 {
alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?
} else if self.number_typos == 2 {
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?;
query_tree.clone()
} else {
query_tree.clone()
};
let mut new_candidates = resolve_candidates(
self.ctx,
&new_query_tree,
self.number_typos,
&mut self.candidates_cache,
params.wdcache,
)?;
new_candidates.difference_with(&candidates);
candidates.union_with(&new_candidates);
self.number_typos += 1;
self.bucket_candidates.union_with(&new_candidates);
return Ok(Some(CriterionResult {
query_tree: Some(new_query_tree),
candidates: Some(new_candidates),
bucket_candidates: take(&mut self.bucket_candidates),
}));
}
},
(None, Allowed(_)) => {
let candidates = take(&mut self.candidates).into_inner();
return Ok(Some(CriterionResult {
query_tree: None,
candidates: Some(candidates.clone()),
bucket_candidates: candidates,
}));
},
(None, Forbidden(_)) => {
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) {
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
(self_bc, parent_bc) => self_bc.or(parent_bc),
};
let candidates = match candidates.or(filtered_candidates) {
Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates),
None => Candidates::Forbidden(params.excluded_candidates.clone()),
};
let maximum_typos = maximum_typo(&query_tree) as u8;
self.state = Some((maximum_typos, query_tree, candidates));
self.typos = 0;
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_tree = query_tree.map(|op| (maximum_typo(&op), op));
self.number_typos = 0;
self.candidates = candidates.map_or_else(|| {
Candidates::Forbidden(params.excluded_candidates.clone())
}, Candidates::Allowed);
self.bucket_candidates.union_with(&bucket_candidates);
},
None => return Ok(None),
}
},
@ -185,7 +181,6 @@ fn alterate_query_tree(
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
},
Operation::Query(q) => {
// TODO may be optimized when number_typos == 0
if let QueryKind::Tolerant { typo, word } = &q.kind {
// if no typo is allowed we don't call word_derivations function,
// and directly create an Exact query
@ -197,7 +192,7 @@ fn alterate_query_tree(
} else {
let typo = *typo.min(&number_typos);
let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?;
let queries = words.into_iter().map(|(word, typo)| {
let queries = words.iter().map(|(word, typo)| {
Operation::Query(Query {
prefix: false,
kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() },
@ -384,7 +379,8 @@ mod test {
]),
])),
candidates: Some(candidates_1.clone()),
bucket_candidates: candidates_1,
bucket_candidates: Some(candidates_1),
filtered_candidates: None,
};
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
@ -406,7 +402,8 @@ mod test {
]),
])),
candidates: Some(candidates_2.clone()),
bucket_candidates: candidates_2,
bucket_candidates: Some(candidates_2),
filtered_candidates: None,
};
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2));
@ -427,8 +424,9 @@ mod test {
let expected = CriterionResult {
query_tree: None,
candidates: Some(facet_candidates.clone()),
bucket_candidates: facet_candidates,
candidates: None,
bucket_candidates: None,
filtered_candidates: Some(facet_candidates.clone()),
};
// first iteration, returns the facet candidates
@ -471,7 +469,8 @@ mod test {
]),
])),
candidates: Some(&candidates_1 & &facet_candidates),
bucket_candidates: facet_candidates.clone(),
bucket_candidates: Some(&candidates_1 & &facet_candidates),
filtered_candidates: None,
};
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
@ -493,7 +492,8 @@ mod test {
]),
])),
candidates: Some(&candidates_2 & &facet_candidates),
bucket_candidates: RoaringBitmap::new(),
bucket_candidates: Some(&candidates_2 & &facet_candidates),
filtered_candidates: None,
};
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2));

View File

@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::mem::take;
use log::debug;
@ -11,9 +10,9 @@ pub struct Words<'t> {
ctx: &'t dyn Context<'t>,
query_trees: Vec<Operation>,
candidates: Option<RoaringBitmap>,
bucket_candidates: RoaringBitmap,
bucket_candidates: Option<RoaringBitmap>,
filtered_candidates: Option<RoaringBitmap>,
parent: Box<dyn Criterion + 't>,
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
}
impl<'t> Words<'t> {
@ -22,9 +21,9 @@ impl<'t> Words<'t> {
ctx,
query_trees: Vec::default(),
candidates: None,
bucket_candidates: RoaringBitmap::new(),
bucket_candidates: None,
parent,
candidates_cache: HashMap::default(),
filtered_candidates: None,
}
}
}
@ -40,55 +39,50 @@ impl<'t> Criterion for Words<'t> {
loop {
debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
match (self.query_trees.pop(), &mut self.candidates) {
(query_tree, Some(candidates)) if candidates.is_empty() => {
self.query_trees = Vec::new();
return Ok(Some(CriterionResult {
query_tree,
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(Some(qt), Some(candidates)) => {
let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, params.wdcache)?;
found_candidates.intersect_with(&candidates);
candidates.difference_with(&found_candidates);
match self.query_trees.pop() {
Some(query_tree) => {
let candidates = match self.candidates.as_mut() {
Some(allowed_candidates) => {
let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
candidates &= &*allowed_candidates;
*allowed_candidates -= &candidates;
Some(candidates)
},
None => None,
};
let bucket_candidates = match self.bucket_candidates.as_mut() {
Some(bucket_candidates) => Some(take(bucket_candidates)),
None => None,
};
return Ok(Some(CriterionResult {
query_tree: Some(qt),
candidates: Some(found_candidates),
bucket_candidates: take(&mut self.bucket_candidates),
query_tree: Some(query_tree),
candidates,
filtered_candidates: self.filtered_candidates.clone(),
bucket_candidates,
}));
},
(Some(qt), None) => {
return Ok(Some(CriterionResult {
query_tree: Some(qt),
candidates: None,
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(None, Some(_)) => {
let candidates = self.candidates.take();
return Ok(Some(CriterionResult {
query_tree: None,
candidates: candidates.clone(),
bucket_candidates: candidates.unwrap_or_default(),
}));
},
(None, None) => {
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => {
self.query_trees = explode_query_tree(query_tree);
self.candidates = candidates;
self.filtered_candidates = filtered_candidates;
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) {
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
(self_bc, parent_bc) => self_bc.or(parent_bc),
};
},
Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
candidates,
filtered_candidates,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default();
self.candidates = candidates;
self.bucket_candidates.union_with(&bucket_candidates);
},
None => return Ok(None),
}
},

View File

@ -241,7 +241,7 @@ impl<'a> QueryTreeBuilder<'a> {
}
/// Split the word depending on the frequency of subwords in the database documents.
fn split_best_frequency<'a>(ctx: &impl Context, word: &'a str) -> heed::Result<Option<Operation>> {
fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<Operation>> {
let chars = word.char_indices().skip(1);
let mut best = None;
@ -438,14 +438,14 @@ fn create_query_tree(
let start = number_phrases + (number_phrases == 0) as usize;
for len in start..=query.len() {
let mut word_count = len - number_phrases;
let query: Vec<_> = query.iter().filter_map(|p| {
let query: Vec<_> = query.iter().filter(|p| {
if p.is_phrase() {
Some(p)
true
} else if word_count != 0 {
word_count -= 1;
Some(p)
true
} else {
None
false
}
})
.cloned()