From e923d51b8f928e6bd1d5ccab765eb8b2c5f81ecf Mon Sep 17 00:00:00 2001 From: many Date: Wed, 5 May 2021 20:46:56 +0200 Subject: [PATCH] Make bucket candidates optionals --- milli/src/search/criteria/asc_desc.rs | 17 +- milli/src/search/criteria/attribute.rs | 111 ++++++------- milli/src/search/criteria/exactness.rs | 15 +- milli/src/search/criteria/final.rs | 20 +-- milli/src/search/criteria/initial.rs | 4 +- milli/src/search/criteria/mod.rs | 19 +-- milli/src/search/criteria/proximity.rs | 146 +++++++---------- milli/src/search/criteria/typo.rs | 210 ++++++++++++------------- milli/src/search/criteria/words.rs | 80 +++++----- 9 files changed, 276 insertions(+), 346 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 54cbb0fae..7b619f26a 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -94,7 +94,6 @@ impl<'t> Criterion for AscDesc<'t> { None => { match self.parent.next(params)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates_is_some = candidates.is_some(); self.query_tree = query_tree; let candidates = match (&self.query_tree, candidates) { (_, Some(mut candidates)) => { @@ -103,7 +102,7 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), params.wdcache)?; + let mut candidates = resolve_query_tree(&context, qt, params.wdcache)?; candidates -= params.excluded_candidates; candidates.intersect_with(&self.faceted_candidates); candidates @@ -111,15 +110,9 @@ impl<'t> Criterion for AscDesc<'t> { (None, None) => take(&mut self.faceted_candidates), }; - // If our parent returns candidates it means that the bucket - // candidates were already computed before and we can use them. - // - // If not, we must use the just computed candidates as our bucket - // candidates. - if candidates_is_some { - self.bucket_candidates.union_with(&bucket_candidates); - } else { - self.bucket_candidates.union_with(&candidates); + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, } if candidates.is_empty() { @@ -143,7 +136,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates: take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5993f03bd..fc7050a7f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -24,13 +24,12 @@ const LEVEL_EXPONENTIATION_BASE: u32 = 4; /// the system to choose between one algorithm or another. const CANDIDATES_THRESHOLD: u64 = 1000; +type FlattenedQueryTree = Vec>>; pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, - query_tree: Option, - candidates: Option, + state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, bucket_candidates: RoaringBitmap, parent: Box, - flattened_query_tree: Option>>>, current_buckets: Option>, } @@ -38,11 +37,9 @@ impl<'t> Attribute<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Attribute { ctx, - query_tree: None, - candidates: None, + state: None, bucket_candidates: RoaringBitmap::new(), parent, - flattened_query_tree: None, current_buckets: None, } } @@ -52,29 +49,25 @@ impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { // remove excluded candidates when next is called, instead of doing it in the loop. - if let Some(candidates) = self.candidates.as_mut() { - *candidates -= params.excluded_candidates; + if let Some((_, _, allowed_candidates)) = self.state.as_mut() { + *allowed_candidates -= params.excluded_candidates; } loop { - match (&self.query_tree, &mut self.candidates) { - (_, Some(candidates)) if candidates.is_empty() => { + match self.state.take() { + Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, - (Some(qt), Some(candidates)) => { - let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| { - flatten_query_tree(&qt) - }); - - let found_candidates = if candidates.len() < CANDIDATES_THRESHOLD { + Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { + let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?; self.current_buckets.get_or_insert(new_buckets.into_iter()) }, }; @@ -83,62 +76,60 @@ impl<'t> Criterion for Attribute<'t> { Some((_score, candidates)) => candidates, None => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } } else { - match set_compute_candidates(self.ctx, flattened_query_tree, candidates, params.wdcache)? { + match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? { Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } }; - candidates.difference_with(&found_candidates); + allowed_candidates -= &found_candidates; + + self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), + query_tree: Some(query_tree), candidates: Some(found_candidates), - bucket_candidates: take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, - (Some(qt), None) => { - let mut query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), params.wdcache)?; - query_tree_candidates -= params.excluded_candidates; - self.bucket_candidates |= &query_tree_candidates; - self.candidates = Some(query_tree_candidates); - }, - (None, Some(_)) => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (None, None) => { + None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + let candidates = match candidates { + Some(candidates) => candidates, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, + }; + + let flattened_query_tree = flatten_query_tree(&query_tree); + + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + self.state = Some((query_tree, flattened_query_tree, candidates)); + self.current_buckets = None; + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, - candidates: None, + candidates, bucket_candidates, })); }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - self.candidates = candidates; - self.bucket_candidates |= bucket_candidates; - self.flattened_query_tree = None; - self.current_buckets = None; - }, None => return Ok(None), } }, @@ -467,7 +458,7 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, - branches: &'q Vec>>, + branches: &'q FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result>> { @@ -517,7 +508,7 @@ fn initialize_query_level_iterators<'t, 'q>( fn set_compute_candidates<'t>( ctx: &'t dyn Context<'t>, - branches: &Vec>>, + branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> @@ -570,11 +561,11 @@ fn set_compute_candidates<'t>( fn linear_compute_candidates( ctx: &dyn Context, - branches: &Vec>>, + branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, ) -> anyhow::Result> { - fn compute_candidate_rank(branches: &Vec>>, words_positions: HashMap) -> u64 { + fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { @@ -659,10 +650,10 @@ fn linear_compute_candidates( } // TODO can we keep refs of Query -fn flatten_query_tree(query_tree: &Operation) -> Vec>> { +fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { use crate::search::criteria::Operation::{And, Or, Consecutive}; - fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec>> { + fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { match tail.split_first() { Some((thead, tail)) => { let tail = and_recurse(thead, tail); @@ -680,7 +671,7 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec>> { } } - fn recurse(op: &Operation) -> Vec>> { + fn recurse(op: &Operation) -> FlattenedQueryTree { match op { And(ops) | Consecutive(ops) => { ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index c004f4a51..2fd216053 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, mem}; +use std::mem::take; use log::debug; use roaring::RoaringBitmap; @@ -60,13 +60,13 @@ impl<'t> Criterion for Exactness<'t> { self.query_tree = None; }, Some(state) => { - let (candidates, state) = resolve_state(self.ctx, mem::take(state), &self.query)?; + let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; self.state = state; return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates: mem::take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { @@ -74,11 +74,16 @@ impl<'t> Criterion for Exactness<'t> { Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { let candidates = match candidates { Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, &mut HashMap::new(), params.wdcache)?, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)?, }; + + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + self.state = Some(State::new(candidates)); self.query_tree = Some(query_tree); - self.bucket_candidates |= bucket_candidates; }, Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 707195ba7..0dbf3ee1a 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -1,5 +1,3 @@ -use std::collections::HashMap; - use log::debug; use roaring::RoaringBitmap; @@ -41,19 +39,15 @@ impl<'t> Final<'t> { }; match self.parent.next(&mut criterion_parameters)? { - Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { - let candidates = match candidates { - Some(candidates) => candidates, - None => { - let candidates = match query_tree.as_ref() { - Some(qt) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - bucket_candidates |= &candidates; - candidates - } + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (candidates, query_tree.as_ref()) { + (Some(candidates), _) => candidates, + (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, }; + let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); + self.returned_candidates |= &candidates; return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 10858dd99..eb2bf0b95 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -12,8 +12,8 @@ impl Initial { pub fn new(query_tree: Option, mut candidates: Option) -> Initial { let answer = CriterionResult { query_tree, - candidates: candidates.clone(), - bucket_candidates: candidates.take().unwrap_or_default(), + candidates: candidates.take(), + bucket_candidates: None, }; Initial { answer: Some(answer) } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 76e263036..f9fca2624 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -39,7 +39,7 @@ pub struct CriterionResult { /// if None, it is up to the child to compute the candidates itself. candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. - bucket_candidates: RoaringBitmap, + bucket_candidates: Option, } #[derive(Debug, PartialEq)] @@ -57,15 +57,6 @@ enum Candidates { Forbidden(RoaringBitmap) } -impl Candidates { - fn into_inner(self) -> RoaringBitmap { - match self { - Self::Allowed(inner) => inner, - Self::Forbidden(inner) => inner, - } - } -} - impl Default for Candidates { fn default() -> Self { Self::Forbidden(RoaringBitmap::new()) @@ -236,14 +227,12 @@ impl<'t> CriteriaBuilder<'t> { pub fn resolve_query_tree<'t>( ctx: &'t dyn Context, query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { @@ -252,7 +241,7 @@ pub fn resolve_query_tree<'t>( match query_tree { And(ops) => { let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, cache, wdcache) + resolve_operation(ctx, op, wdcache) }).collect::>>()?; ops.sort_unstable_by_key(|cds| cds.len()); @@ -296,7 +285,7 @@ pub fn resolve_query_tree<'t>( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(ctx, op, cache, wdcache)?; + let docids = resolve_operation(ctx, op, wdcache)?; candidates.union_with(&docids); } Ok(candidates) @@ -305,7 +294,7 @@ pub fn resolve_query_tree<'t>( } } - resolve_operation(ctx, query_tree, cache, wdcache) + resolve_operation(ctx, query_tree, wdcache) } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 08fba1447..e50d3941d 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -30,8 +30,8 @@ const PROXIMITY_THRESHOLD: u8 = 0; pub struct Proximity<'t> { ctx: &'t dyn Context<'t>, - /// ((max_proximity, query_tree), allowed_candidates) - state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, + /// (max_proximity, query_tree, allowed_candidates) + state: Option<(u8, Operation, RoaringBitmap)>, proximity: u8, bucket_candidates: RoaringBitmap, parent: Box, @@ -57,114 +57,90 @@ impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { // remove excluded candidates when next is called, instead of doing it in the loop. - if let Some((_, candidates)) = self.state.as_mut() { - *candidates -= params.excluded_candidates; + if let Some((_, _, allowed_candidates)) = self.state.as_mut() { + *allowed_candidates -= params.excluded_candidates; } loop { debug!("Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, - self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), - self.state.as_ref().map(|(_, cd)| cd), + self.state.as_ref().map(|(mp, _, _)| mp), + self.state.as_ref().map(|(_, _, cd)| cd), ); match &mut self.state { - Some((_, candidates)) if candidates.is_empty() => { + Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => { self.state = None; // reset state }, - Some((Some((max_prox, query_tree)), candidates)) => { - if self.proximity as usize > *max_prox { - self.state = None; // reset state - } else { - let mut new_candidates = if candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { - if let Some(cache) = self.plane_sweep_cache.as_mut() { - match cache.next() { - Some((p, candidates)) => { - self.proximity = p; - candidates - }, - None => { - self.state = None; // reset state - continue - }, - } - } else { - let cache = resolve_plane_sweep_candidates( - self.ctx, - query_tree, - candidates, - params.wdcache, - )?; - self.plane_sweep_cache = Some(cache.into_iter()); - - continue + Some((_, query_tree, allowed_candidates)) => { + let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { + if let Some(cache) = self.plane_sweep_cache.as_mut() { + match cache.next() { + Some((p, candidates)) => { + self.proximity = p; + candidates + }, + None => { + self.state = None; // reset state + continue + }, } - } else { // use set theory based algorithm - resolve_candidates( - self.ctx, - &query_tree, - self.proximity, - &mut self.candidates_cache, - params.wdcache, - )? - }; + } else { + let cache = resolve_plane_sweep_candidates( + self.ctx, + query_tree, + allowed_candidates, + params.wdcache, + )?; + self.plane_sweep_cache = Some(cache.into_iter()); - new_candidates.intersect_with(&candidates); - candidates.difference_with(&new_candidates); - self.proximity += 1; + continue + } + } else { // use set theory based algorithm + resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + params.wdcache, + )? + }; + + new_candidates &= &*allowed_candidates; + *allowed_candidates -= &new_candidates; + self.proximity += 1; - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - Some((None, candidates)) => { - let candidates = take(candidates); - self.state = None; // reset state return Ok(Some(CriterionResult { - query_tree: None, - candidates: Some(candidates.clone()), - bucket_candidates: candidates, + query_tree: Some(query_tree.clone()), + candidates: Some(new_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates: None, - bucket_candidates, - })); - }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates_is_some = candidates.is_some(); - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let candidates = resolve_query_tree(self.ctx, qt, &mut HashMap::new(), params.wdcache)?; - candidates - params.excluded_candidates - }, - (None, None) => RoaringBitmap::new(), + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + let candidates = match candidates { + Some(candidates) => candidates, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, }; - // If our parent returns candidates it means that the bucket - // candidates were already computed before and we can use them. - // - // If not, we must use the just computed candidates as our bucket - // candidates. - if candidates_is_some { - self.bucket_candidates.union_with(&bucket_candidates); - } else { - self.bucket_candidates.union_with(&candidates); + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, } - let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); - self.state = Some((query_tree, candidates)); + let maximum_proximity = maximum_proximity(&query_tree); + self.state = Some((maximum_proximity as u8, query_tree, candidates)); self.proximity = 0; self.plane_sweep_cache = None; }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + bucket_candidates, + })); + }, None => return Ok(None), } }, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index f265b30ae..288a92f65 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -13,15 +13,19 @@ use super::{ CriterionParameters, CriterionResult, query_docids, - query_pair_proximity_docids + query_pair_proximity_docids, + resolve_query_tree, }; +/// Maximum number of typo for a word of any length. +const MAX_TYPOS_PER_WORD: u8 = 2; + pub struct Typo<'t> { ctx: &'t dyn Context<'t>, - query_tree: Option<(usize, Operation)>, - number_typos: u8, - candidates: Candidates, - bucket_candidates: RoaringBitmap, + /// (max_typos, query_tree, candidates) + state: Option<(u8, Operation, Candidates)>, + typos: u8, + bucket_candidates: Option, parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } @@ -30,10 +34,9 @@ impl<'t> Typo<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Typo { ctx, - query_tree: None, - number_typos: 0, - candidates: Candidates::default(), - bucket_candidates: RoaringBitmap::new(), + state: None, + typos: 0, + bucket_candidates: None, parent, candidates_cache: HashMap::new(), } @@ -45,113 +48,101 @@ impl<'t> Criterion for Typo<'t> { fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; // remove excluded candidates when next is called, instead of doing it in the loop. - match &mut self.candidates { - Allowed(candidates) => *candidates -= params.excluded_candidates, - Forbidden(candidates) => *candidates |= params.excluded_candidates, + match self.state.as_mut() { + Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates, + Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates, + None => (), } loop { - debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); + debug!("Typo at iteration {} (max typos {:?}) ({:?})", + self.typos, + self.state.as_ref().map(|(mt, _, _)| mt), + self.state.as_ref().map(|(_, _, cd)| cd), + ); - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: Some(take(&mut self.candidates).into_inner()), - bucket_candidates: take(&mut self.bucket_candidates), - })); + match self.state.as_mut() { + Some((max_typos, _, _)) if self.typos > *max_typos => { + self.state = None; // reset state }, - (Some((max_typos, query_tree)), Allowed(candidates)) => { - if self.number_typos as usize > *max_typos { - self.query_tree = None; - self.candidates = Candidates::default(); - } else { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? - } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; - query_tree.clone() - } else { - query_tree.clone() + Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { + self.state = None; // reset state + }, + Some((_, query_tree, candidates_authorization)) => { + let fst = self.ctx.words_fst(); + let new_query_tree = if self.typos < MAX_TYPOS_PER_WORD { + alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? + } else if self.typos == MAX_TYPOS_PER_WORD { + // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, + // we keep the altered query tree + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; + // we compute the allowed candidates + let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; + // we assign the allowed candidates to the candidates authorization. + *candidates_authorization = match take(candidates_authorization) { + Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), + Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), }; - - let mut new_candidates = resolve_candidates( - self.ctx, - &new_query_tree, - self.number_typos, - &mut self.candidates_cache, - params.wdcache, - )?; - new_candidates.intersect_with(&candidates); - candidates.difference_with(&new_candidates); - self.number_typos += 1; - - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - (Some((max_typos, query_tree)), Forbidden(candidates)) => { - if self.number_typos as usize > *max_typos { - self.query_tree = None; - self.candidates = Candidates::default(); + query_tree.clone() } else { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? - } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; - query_tree.clone() - } else { - query_tree.clone() - }; + query_tree.clone() + }; - let mut new_candidates = resolve_candidates( - self.ctx, - &new_query_tree, - self.number_typos, - &mut self.candidates_cache, - params.wdcache, - )?; - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.number_typos += 1; - self.bucket_candidates.union_with(&new_candidates); + let mut candidates = resolve_candidates( + self.ctx, + &new_query_tree, + self.typos, + &mut self.candidates_cache, + params.wdcache, + )?; - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); - return Ok(Some(CriterionResult { - query_tree: None, - candidates: Some(candidates.clone()), - bucket_candidates: candidates, - })); - }, - (None, Forbidden(_)) => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates: None, - bucket_candidates, - })); + match candidates_authorization { + Allowed(allowed_candidates) => { + candidates &= &*allowed_candidates; + *allowed_candidates -= &candidates; }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); - self.number_typos = 0; - self.candidates = candidates.map_or_else(|| { + Forbidden(forbidden_candidates) => { + candidates -= &*forbidden_candidates; + *forbidden_candidates |= &candidates; + }, + } + + let bucket_candidates = match self.bucket_candidates.as_mut() { + Some(bucket_candidates) => take(bucket_candidates), + None => candidates.clone(), + }; + + self.typos += 1; + + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: Some(candidates), + bucket_candidates: Some(bucket_candidates), + })); + }, + None => { + match self.parent.next(params)? { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), + (self_bc, parent_bc) => self_bc.or(parent_bc), + }; + + let candidates = candidates.map_or_else(|| { Candidates::Forbidden(params.excluded_candidates.clone()) }, Candidates::Allowed); - self.bucket_candidates.union_with(&bucket_candidates); + + let maximum_typos = maximum_typo(&query_tree) as u8; + self.state = Some((maximum_typos, query_tree, candidates)); + self.typos = 0; + + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + bucket_candidates, + })); }, None => return Ok(None), } @@ -185,7 +176,6 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) }, Operation::Query(q) => { - // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, // and directly create an Exact query @@ -384,7 +374,7 @@ mod test { ]), ])), candidates: Some(candidates_1.clone()), - bucket_candidates: candidates_1, + bucket_candidates: Some(candidates_1), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -406,7 +396,7 @@ mod test { ]), ])), candidates: Some(candidates_2.clone()), - bucket_candidates: candidates_2, + bucket_candidates: Some(candidates_2), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); @@ -428,7 +418,7 @@ mod test { let expected = CriterionResult { query_tree: None, candidates: Some(facet_candidates.clone()), - bucket_candidates: facet_candidates, + bucket_candidates: None, }; // first iteration, returns the facet candidates @@ -471,7 +461,7 @@ mod test { ]), ])), candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: facet_candidates.clone(), + bucket_candidates: Some(&candidates_1 & &facet_candidates), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -493,7 +483,7 @@ mod test { ]), ])), candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: RoaringBitmap::new(), + bucket_candidates: Some(&candidates_2 & &facet_candidates), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 23a45223a..2f7ebbfbf 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::mem::take; use log::debug; @@ -11,9 +10,9 @@ pub struct Words<'t> { ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, - bucket_candidates: RoaringBitmap, + bucket_candidates: Option, parent: Box, - candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, + compute_candidates: bool, } impl<'t> Words<'t> { @@ -22,9 +21,9 @@ impl<'t> Words<'t> { ctx, query_trees: Vec::default(), candidates: None, - bucket_candidates: RoaringBitmap::new(), + bucket_candidates: None, parent, - candidates_cache: HashMap::default(), + compute_candidates: false, } } } @@ -40,55 +39,48 @@ impl<'t> Criterion for Words<'t> { loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); - match (self.query_trees.pop(), &mut self.candidates) { - (query_tree, Some(candidates)) if candidates.is_empty() => { - self.query_trees = Vec::new(); - return Ok(Some(CriterionResult { - query_tree, - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (Some(qt), Some(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, params.wdcache)?; - found_candidates.intersect_with(&candidates); - candidates.difference_with(&found_candidates); + match self.query_trees.pop() { + Some(query_tree) => { + let candidates = match self.candidates.as_mut() { + Some(allowed_candidates) if self.compute_candidates => { + let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; + candidates &= &*allowed_candidates; + *allowed_candidates -= &candidates; + Some(candidates) + }, + candidates => candidates.cloned(), + }; + + let bucket_candidates = match self.bucket_candidates.as_mut() { + Some(bucket_candidates) => Some(take(bucket_candidates)), + None => None, + }; return Ok(Some(CriterionResult { - query_tree: Some(qt), - candidates: Some(found_candidates), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates, + bucket_candidates, })); }, - (Some(qt), None) => { - return Ok(Some(CriterionResult { - query_tree: Some(qt), - candidates: None, - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (None, Some(_)) => { - let candidates = self.candidates.take(); - return Ok(Some(CriterionResult { - query_tree: None, - candidates: candidates.clone(), - bucket_candidates: candidates.unwrap_or_default(), - })); - }, - (None, None) => { + None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + self.query_trees = explode_query_tree(query_tree); + self.candidates = candidates; + self.compute_candidates = bucket_candidates.is_some(); + + self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), + (self_bc, parent_bc) => self_bc.or(parent_bc), + }; + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, - candidates: None, + candidates, bucket_candidates, })); }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - }, None => return Ok(None), } },