From e923d51b8f928e6bd1d5ccab765eb8b2c5f81ecf Mon Sep 17 00:00:00 2001 From: many Date: Wed, 5 May 2021 20:46:56 +0200 Subject: [PATCH 1/3] Make bucket candidates optionals --- milli/src/search/criteria/asc_desc.rs | 17 +- milli/src/search/criteria/attribute.rs | 111 ++++++------- milli/src/search/criteria/exactness.rs | 15 +- milli/src/search/criteria/final.rs | 20 +-- milli/src/search/criteria/initial.rs | 4 +- milli/src/search/criteria/mod.rs | 19 +-- milli/src/search/criteria/proximity.rs | 146 +++++++---------- milli/src/search/criteria/typo.rs | 210 ++++++++++++------------- milli/src/search/criteria/words.rs | 80 +++++----- 9 files changed, 276 insertions(+), 346 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 54cbb0fae..7b619f26a 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -94,7 +94,6 @@ impl<'t> Criterion for AscDesc<'t> { None => { match self.parent.next(params)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates_is_some = candidates.is_some(); self.query_tree = query_tree; let candidates = match (&self.query_tree, candidates) { (_, Some(mut candidates)) => { @@ -103,7 +102,7 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), params.wdcache)?; + let mut candidates = resolve_query_tree(&context, qt, params.wdcache)?; candidates -= params.excluded_candidates; candidates.intersect_with(&self.faceted_candidates); candidates @@ -111,15 +110,9 @@ impl<'t> Criterion for AscDesc<'t> { (None, None) => take(&mut self.faceted_candidates), }; - // If our parent returns candidates it means that the bucket - // candidates were already computed before and we can use them. - // - // If not, we must use the just computed candidates as our bucket - // candidates. - if candidates_is_some { - self.bucket_candidates.union_with(&bucket_candidates); - } else { - self.bucket_candidates.union_with(&candidates); + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, } if candidates.is_empty() { @@ -143,7 +136,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates: take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5993f03bd..fc7050a7f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -24,13 +24,12 @@ const LEVEL_EXPONENTIATION_BASE: u32 = 4; /// the system to choose between one algorithm or another. const CANDIDATES_THRESHOLD: u64 = 1000; +type FlattenedQueryTree = Vec>>; pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, - query_tree: Option, - candidates: Option, + state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, bucket_candidates: RoaringBitmap, parent: Box, - flattened_query_tree: Option>>>, current_buckets: Option>, } @@ -38,11 +37,9 @@ impl<'t> Attribute<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Attribute { ctx, - query_tree: None, - candidates: None, + state: None, bucket_candidates: RoaringBitmap::new(), parent, - flattened_query_tree: None, current_buckets: None, } } @@ -52,29 +49,25 @@ impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { // remove excluded candidates when next is called, instead of doing it in the loop. - if let Some(candidates) = self.candidates.as_mut() { - *candidates -= params.excluded_candidates; + if let Some((_, _, allowed_candidates)) = self.state.as_mut() { + *allowed_candidates -= params.excluded_candidates; } loop { - match (&self.query_tree, &mut self.candidates) { - (_, Some(candidates)) if candidates.is_empty() => { + match self.state.take() { + Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, - (Some(qt), Some(candidates)) => { - let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| { - flatten_query_tree(&qt) - }); - - let found_candidates = if candidates.len() < CANDIDATES_THRESHOLD { + Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { + let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?; self.current_buckets.get_or_insert(new_buckets.into_iter()) }, }; @@ -83,62 +76,60 @@ impl<'t> Criterion for Attribute<'t> { Some((_score, candidates)) => candidates, None => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } } else { - match set_compute_candidates(self.ctx, flattened_query_tree, candidates, params.wdcache)? { + match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? { Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } }; - candidates.difference_with(&found_candidates); + allowed_candidates -= &found_candidates; + + self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), + query_tree: Some(query_tree), candidates: Some(found_candidates), - bucket_candidates: take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, - (Some(qt), None) => { - let mut query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), params.wdcache)?; - query_tree_candidates -= params.excluded_candidates; - self.bucket_candidates |= &query_tree_candidates; - self.candidates = Some(query_tree_candidates); - }, - (None, Some(_)) => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (None, None) => { + None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + let candidates = match candidates { + Some(candidates) => candidates, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, + }; + + let flattened_query_tree = flatten_query_tree(&query_tree); + + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + self.state = Some((query_tree, flattened_query_tree, candidates)); + self.current_buckets = None; + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, - candidates: None, + candidates, bucket_candidates, })); }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - self.candidates = candidates; - self.bucket_candidates |= bucket_candidates; - self.flattened_query_tree = None; - self.current_buckets = None; - }, None => return Ok(None), } }, @@ -467,7 +458,7 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, - branches: &'q Vec>>, + branches: &'q FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result>> { @@ -517,7 +508,7 @@ fn initialize_query_level_iterators<'t, 'q>( fn set_compute_candidates<'t>( ctx: &'t dyn Context<'t>, - branches: &Vec>>, + branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> @@ -570,11 +561,11 @@ fn set_compute_candidates<'t>( fn linear_compute_candidates( ctx: &dyn Context, - branches: &Vec>>, + branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, ) -> anyhow::Result> { - fn compute_candidate_rank(branches: &Vec>>, words_positions: HashMap) -> u64 { + fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { @@ -659,10 +650,10 @@ fn linear_compute_candidates( } // TODO can we keep refs of Query -fn flatten_query_tree(query_tree: &Operation) -> Vec>> { +fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { use crate::search::criteria::Operation::{And, Or, Consecutive}; - fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec>> { + fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { match tail.split_first() { Some((thead, tail)) => { let tail = and_recurse(thead, tail); @@ -680,7 +671,7 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec>> { } } - fn recurse(op: &Operation) -> Vec>> { + fn recurse(op: &Operation) -> FlattenedQueryTree { match op { And(ops) | Consecutive(ops) => { ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index c004f4a51..2fd216053 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, mem}; +use std::mem::take; use log::debug; use roaring::RoaringBitmap; @@ -60,13 +60,13 @@ impl<'t> Criterion for Exactness<'t> { self.query_tree = None; }, Some(state) => { - let (candidates, state) = resolve_state(self.ctx, mem::take(state), &self.query)?; + let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; self.state = state; return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates: mem::take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { @@ -74,11 +74,16 @@ impl<'t> Criterion for Exactness<'t> { Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { let candidates = match candidates { Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, &mut HashMap::new(), params.wdcache)?, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)?, }; + + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + self.state = Some(State::new(candidates)); self.query_tree = Some(query_tree); - self.bucket_candidates |= bucket_candidates; }, Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 707195ba7..0dbf3ee1a 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -1,5 +1,3 @@ -use std::collections::HashMap; - use log::debug; use roaring::RoaringBitmap; @@ -41,19 +39,15 @@ impl<'t> Final<'t> { }; match self.parent.next(&mut criterion_parameters)? { - Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { - let candidates = match candidates { - Some(candidates) => candidates, - None => { - let candidates = match query_tree.as_ref() { - Some(qt) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - bucket_candidates |= &candidates; - candidates - } + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (candidates, query_tree.as_ref()) { + (Some(candidates), _) => candidates, + (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, }; + let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); + self.returned_candidates |= &candidates; return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 10858dd99..eb2bf0b95 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -12,8 +12,8 @@ impl Initial { pub fn new(query_tree: Option, mut candidates: Option) -> Initial { let answer = CriterionResult { query_tree, - candidates: candidates.clone(), - bucket_candidates: candidates.take().unwrap_or_default(), + candidates: candidates.take(), + bucket_candidates: None, }; Initial { answer: Some(answer) } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 76e263036..f9fca2624 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -39,7 +39,7 @@ pub struct CriterionResult { /// if None, it is up to the child to compute the candidates itself. candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. - bucket_candidates: RoaringBitmap, + bucket_candidates: Option, } #[derive(Debug, PartialEq)] @@ -57,15 +57,6 @@ enum Candidates { Forbidden(RoaringBitmap) } -impl Candidates { - fn into_inner(self) -> RoaringBitmap { - match self { - Self::Allowed(inner) => inner, - Self::Forbidden(inner) => inner, - } - } -} - impl Default for Candidates { fn default() -> Self { Self::Forbidden(RoaringBitmap::new()) @@ -236,14 +227,12 @@ impl<'t> CriteriaBuilder<'t> { pub fn resolve_query_tree<'t>( ctx: &'t dyn Context, query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { @@ -252,7 +241,7 @@ pub fn resolve_query_tree<'t>( match query_tree { And(ops) => { let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, cache, wdcache) + resolve_operation(ctx, op, wdcache) }).collect::>>()?; ops.sort_unstable_by_key(|cds| cds.len()); @@ -296,7 +285,7 @@ pub fn resolve_query_tree<'t>( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(ctx, op, cache, wdcache)?; + let docids = resolve_operation(ctx, op, wdcache)?; candidates.union_with(&docids); } Ok(candidates) @@ -305,7 +294,7 @@ pub fn resolve_query_tree<'t>( } } - resolve_operation(ctx, query_tree, cache, wdcache) + resolve_operation(ctx, query_tree, wdcache) } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 08fba1447..e50d3941d 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -30,8 +30,8 @@ const PROXIMITY_THRESHOLD: u8 = 0; pub struct Proximity<'t> { ctx: &'t dyn Context<'t>, - /// ((max_proximity, query_tree), allowed_candidates) - state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, + /// (max_proximity, query_tree, allowed_candidates) + state: Option<(u8, Operation, RoaringBitmap)>, proximity: u8, bucket_candidates: RoaringBitmap, parent: Box, @@ -57,114 +57,90 @@ impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { // remove excluded candidates when next is called, instead of doing it in the loop. - if let Some((_, candidates)) = self.state.as_mut() { - *candidates -= params.excluded_candidates; + if let Some((_, _, allowed_candidates)) = self.state.as_mut() { + *allowed_candidates -= params.excluded_candidates; } loop { debug!("Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, - self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), - self.state.as_ref().map(|(_, cd)| cd), + self.state.as_ref().map(|(mp, _, _)| mp), + self.state.as_ref().map(|(_, _, cd)| cd), ); match &mut self.state { - Some((_, candidates)) if candidates.is_empty() => { + Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => { self.state = None; // reset state }, - Some((Some((max_prox, query_tree)), candidates)) => { - if self.proximity as usize > *max_prox { - self.state = None; // reset state - } else { - let mut new_candidates = if candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { - if let Some(cache) = self.plane_sweep_cache.as_mut() { - match cache.next() { - Some((p, candidates)) => { - self.proximity = p; - candidates - }, - None => { - self.state = None; // reset state - continue - }, - } - } else { - let cache = resolve_plane_sweep_candidates( - self.ctx, - query_tree, - candidates, - params.wdcache, - )?; - self.plane_sweep_cache = Some(cache.into_iter()); - - continue + Some((_, query_tree, allowed_candidates)) => { + let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { + if let Some(cache) = self.plane_sweep_cache.as_mut() { + match cache.next() { + Some((p, candidates)) => { + self.proximity = p; + candidates + }, + None => { + self.state = None; // reset state + continue + }, } - } else { // use set theory based algorithm - resolve_candidates( - self.ctx, - &query_tree, - self.proximity, - &mut self.candidates_cache, - params.wdcache, - )? - }; + } else { + let cache = resolve_plane_sweep_candidates( + self.ctx, + query_tree, + allowed_candidates, + params.wdcache, + )?; + self.plane_sweep_cache = Some(cache.into_iter()); - new_candidates.intersect_with(&candidates); - candidates.difference_with(&new_candidates); - self.proximity += 1; + continue + } + } else { // use set theory based algorithm + resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + params.wdcache, + )? + }; + + new_candidates &= &*allowed_candidates; + *allowed_candidates -= &new_candidates; + self.proximity += 1; - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - Some((None, candidates)) => { - let candidates = take(candidates); - self.state = None; // reset state return Ok(Some(CriterionResult { - query_tree: None, - candidates: Some(candidates.clone()), - bucket_candidates: candidates, + query_tree: Some(query_tree.clone()), + candidates: Some(new_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates: None, - bucket_candidates, - })); - }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates_is_some = candidates.is_some(); - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let candidates = resolve_query_tree(self.ctx, qt, &mut HashMap::new(), params.wdcache)?; - candidates - params.excluded_candidates - }, - (None, None) => RoaringBitmap::new(), + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + let candidates = match candidates { + Some(candidates) => candidates, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, }; - // If our parent returns candidates it means that the bucket - // candidates were already computed before and we can use them. - // - // If not, we must use the just computed candidates as our bucket - // candidates. - if candidates_is_some { - self.bucket_candidates.union_with(&bucket_candidates); - } else { - self.bucket_candidates.union_with(&candidates); + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, } - let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); - self.state = Some((query_tree, candidates)); + let maximum_proximity = maximum_proximity(&query_tree); + self.state = Some((maximum_proximity as u8, query_tree, candidates)); self.proximity = 0; self.plane_sweep_cache = None; }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + bucket_candidates, + })); + }, None => return Ok(None), } }, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index f265b30ae..288a92f65 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -13,15 +13,19 @@ use super::{ CriterionParameters, CriterionResult, query_docids, - query_pair_proximity_docids + query_pair_proximity_docids, + resolve_query_tree, }; +/// Maximum number of typo for a word of any length. +const MAX_TYPOS_PER_WORD: u8 = 2; + pub struct Typo<'t> { ctx: &'t dyn Context<'t>, - query_tree: Option<(usize, Operation)>, - number_typos: u8, - candidates: Candidates, - bucket_candidates: RoaringBitmap, + /// (max_typos, query_tree, candidates) + state: Option<(u8, Operation, Candidates)>, + typos: u8, + bucket_candidates: Option, parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } @@ -30,10 +34,9 @@ impl<'t> Typo<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Typo { ctx, - query_tree: None, - number_typos: 0, - candidates: Candidates::default(), - bucket_candidates: RoaringBitmap::new(), + state: None, + typos: 0, + bucket_candidates: None, parent, candidates_cache: HashMap::new(), } @@ -45,113 +48,101 @@ impl<'t> Criterion for Typo<'t> { fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; // remove excluded candidates when next is called, instead of doing it in the loop. - match &mut self.candidates { - Allowed(candidates) => *candidates -= params.excluded_candidates, - Forbidden(candidates) => *candidates |= params.excluded_candidates, + match self.state.as_mut() { + Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates, + Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates, + None => (), } loop { - debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); + debug!("Typo at iteration {} (max typos {:?}) ({:?})", + self.typos, + self.state.as_ref().map(|(mt, _, _)| mt), + self.state.as_ref().map(|(_, _, cd)| cd), + ); - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: Some(take(&mut self.candidates).into_inner()), - bucket_candidates: take(&mut self.bucket_candidates), - })); + match self.state.as_mut() { + Some((max_typos, _, _)) if self.typos > *max_typos => { + self.state = None; // reset state }, - (Some((max_typos, query_tree)), Allowed(candidates)) => { - if self.number_typos as usize > *max_typos { - self.query_tree = None; - self.candidates = Candidates::default(); - } else { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? - } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; - query_tree.clone() - } else { - query_tree.clone() + Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { + self.state = None; // reset state + }, + Some((_, query_tree, candidates_authorization)) => { + let fst = self.ctx.words_fst(); + let new_query_tree = if self.typos < MAX_TYPOS_PER_WORD { + alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? + } else if self.typos == MAX_TYPOS_PER_WORD { + // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, + // we keep the altered query tree + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; + // we compute the allowed candidates + let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; + // we assign the allowed candidates to the candidates authorization. + *candidates_authorization = match take(candidates_authorization) { + Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), + Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), }; - - let mut new_candidates = resolve_candidates( - self.ctx, - &new_query_tree, - self.number_typos, - &mut self.candidates_cache, - params.wdcache, - )?; - new_candidates.intersect_with(&candidates); - candidates.difference_with(&new_candidates); - self.number_typos += 1; - - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - (Some((max_typos, query_tree)), Forbidden(candidates)) => { - if self.number_typos as usize > *max_typos { - self.query_tree = None; - self.candidates = Candidates::default(); + query_tree.clone() } else { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? - } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; - query_tree.clone() - } else { - query_tree.clone() - }; + query_tree.clone() + }; - let mut new_candidates = resolve_candidates( - self.ctx, - &new_query_tree, - self.number_typos, - &mut self.candidates_cache, - params.wdcache, - )?; - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.number_typos += 1; - self.bucket_candidates.union_with(&new_candidates); + let mut candidates = resolve_candidates( + self.ctx, + &new_query_tree, + self.typos, + &mut self.candidates_cache, + params.wdcache, + )?; - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); - return Ok(Some(CriterionResult { - query_tree: None, - candidates: Some(candidates.clone()), - bucket_candidates: candidates, - })); - }, - (None, Forbidden(_)) => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates: None, - bucket_candidates, - })); + match candidates_authorization { + Allowed(allowed_candidates) => { + candidates &= &*allowed_candidates; + *allowed_candidates -= &candidates; }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); - self.number_typos = 0; - self.candidates = candidates.map_or_else(|| { + Forbidden(forbidden_candidates) => { + candidates -= &*forbidden_candidates; + *forbidden_candidates |= &candidates; + }, + } + + let bucket_candidates = match self.bucket_candidates.as_mut() { + Some(bucket_candidates) => take(bucket_candidates), + None => candidates.clone(), + }; + + self.typos += 1; + + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: Some(candidates), + bucket_candidates: Some(bucket_candidates), + })); + }, + None => { + match self.parent.next(params)? { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), + (self_bc, parent_bc) => self_bc.or(parent_bc), + }; + + let candidates = candidates.map_or_else(|| { Candidates::Forbidden(params.excluded_candidates.clone()) }, Candidates::Allowed); - self.bucket_candidates.union_with(&bucket_candidates); + + let maximum_typos = maximum_typo(&query_tree) as u8; + self.state = Some((maximum_typos, query_tree, candidates)); + self.typos = 0; + + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + bucket_candidates, + })); }, None => return Ok(None), } @@ -185,7 +176,6 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) }, Operation::Query(q) => { - // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, // and directly create an Exact query @@ -384,7 +374,7 @@ mod test { ]), ])), candidates: Some(candidates_1.clone()), - bucket_candidates: candidates_1, + bucket_candidates: Some(candidates_1), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -406,7 +396,7 @@ mod test { ]), ])), candidates: Some(candidates_2.clone()), - bucket_candidates: candidates_2, + bucket_candidates: Some(candidates_2), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); @@ -428,7 +418,7 @@ mod test { let expected = CriterionResult { query_tree: None, candidates: Some(facet_candidates.clone()), - bucket_candidates: facet_candidates, + bucket_candidates: None, }; // first iteration, returns the facet candidates @@ -471,7 +461,7 @@ mod test { ]), ])), candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: facet_candidates.clone(), + bucket_candidates: Some(&candidates_1 & &facet_candidates), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -493,7 +483,7 @@ mod test { ]), ])), candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: RoaringBitmap::new(), + bucket_candidates: Some(&candidates_2 & &facet_candidates), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 23a45223a..2f7ebbfbf 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::mem::take; use log::debug; @@ -11,9 +10,9 @@ pub struct Words<'t> { ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, - bucket_candidates: RoaringBitmap, + bucket_candidates: Option, parent: Box, - candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, + compute_candidates: bool, } impl<'t> Words<'t> { @@ -22,9 +21,9 @@ impl<'t> Words<'t> { ctx, query_trees: Vec::default(), candidates: None, - bucket_candidates: RoaringBitmap::new(), + bucket_candidates: None, parent, - candidates_cache: HashMap::default(), + compute_candidates: false, } } } @@ -40,55 +39,48 @@ impl<'t> Criterion for Words<'t> { loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); - match (self.query_trees.pop(), &mut self.candidates) { - (query_tree, Some(candidates)) if candidates.is_empty() => { - self.query_trees = Vec::new(); - return Ok(Some(CriterionResult { - query_tree, - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (Some(qt), Some(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, params.wdcache)?; - found_candidates.intersect_with(&candidates); - candidates.difference_with(&found_candidates); + match self.query_trees.pop() { + Some(query_tree) => { + let candidates = match self.candidates.as_mut() { + Some(allowed_candidates) if self.compute_candidates => { + let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; + candidates &= &*allowed_candidates; + *allowed_candidates -= &candidates; + Some(candidates) + }, + candidates => candidates.cloned(), + }; + + let bucket_candidates = match self.bucket_candidates.as_mut() { + Some(bucket_candidates) => Some(take(bucket_candidates)), + None => None, + }; return Ok(Some(CriterionResult { - query_tree: Some(qt), - candidates: Some(found_candidates), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates, + bucket_candidates, })); }, - (Some(qt), None) => { - return Ok(Some(CriterionResult { - query_tree: Some(qt), - candidates: None, - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (None, Some(_)) => { - let candidates = self.candidates.take(); - return Ok(Some(CriterionResult { - query_tree: None, - candidates: candidates.clone(), - bucket_candidates: candidates.unwrap_or_default(), - })); - }, - (None, None) => { + None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + self.query_trees = explode_query_tree(query_tree); + self.candidates = candidates; + self.compute_candidates = bucket_candidates.is_some(); + + self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), + (self_bc, parent_bc) => self_bc.or(parent_bc), + }; + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, - candidates: None, + candidates, bucket_candidates, })); }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - }, None => return Ok(None), } }, From efba662ca6ffc785acc5ae2ca7372016f010ea35 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 10 May 2021 10:27:18 +0200 Subject: [PATCH 2/3] Fix clippy warnings in cirteria --- milli/src/search/criteria/asc_desc.rs | 4 +-- milli/src/search/criteria/attribute.rs | 20 +++++++------- milli/src/search/criteria/final.rs | 38 ++++++++++++-------------- milli/src/search/criteria/typo.rs | 36 ++++++++++++------------ milli/src/search/query_tree.rs | 10 +++---- 5 files changed, 54 insertions(+), 54 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 7b619f26a..31edc453e 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -215,7 +215,7 @@ fn iterative_facet_ordered_iter<'t>( docids_values.push((docid, OrderedFloat(value))); } } - docids_values.sort_unstable_by_key(|(_, v)| v.clone()); + docids_values.sort_unstable_by_key(|(_, v)| *v); let iter = docids_values.into_iter(); let iter = if ascending { Box::new(iter) as Box> @@ -226,7 +226,7 @@ fn iterative_facet_ordered_iter<'t>( // The itertools GroupBy iterator doesn't provide an owned version, we are therefore // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 - let vec: Vec<_> = iter.group_by(|(_, v)| v.clone()) + let vec: Vec<_> = iter.group_by(|(_, v)| *v) .into_iter() .map(|(_, ids)| ids.map(|(id, _)| id).collect()) .collect(); diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index fc7050a7f..9d4d88d0d 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -155,7 +155,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { match ctx.word_position_last_level(&word, in_prefix_cache)? { Some(level) => { - let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level.clone()) as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) }, @@ -164,8 +164,8 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { } fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { - let level = level.min(&self.level).clone(); - let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level.clone()) as u32); + let level = *level.min(&self.level); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; @@ -214,7 +214,7 @@ struct QueryLevelIterator<'t, 'q> { } impl<'t, 'q> QueryLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn new(ctx: &'t dyn Context<'t>, queries: &'q [Query], wdcache: &mut WordDerivationsCache) -> anyhow::Result> { let mut inner = Vec::with_capacity(queries.len()); for query in queries { match &query.kind { @@ -244,7 +244,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { } } - let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); + let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level); match highest { Some(level) => Ok(Some(Self { parent: None, @@ -287,7 +287,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let u8_level = Into::::into(level); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); for wli in self.inner.iter_mut() { - let wli_u8_level = Into::::into(wli.level.clone()); + let wli_u8_level = Into::::into(wli.level); let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { if let Some((next_left, _, next_docids)) = wli.next()? { @@ -364,8 +364,8 @@ fn interval_to_skip( already_skiped: usize, allowed_candidates: &RoaringBitmap, ) -> usize { - parent_accumulator.into_iter() - .zip(current_accumulator.into_iter()) + parent_accumulator.iter() + .zip(current_accumulator.iter()) .skip(already_skiped) .take_while(|(parent, current)| { let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); @@ -410,7 +410,7 @@ impl<'t, 'q> Branch<'t, 'q> { /// update inner interval in order to be ranked by the binary_heap without computing it, /// the next() method should be called when the real interval is needed. fn lazy_next(&mut self) { - let u8_level = Into::::into(self.tree_level.clone()); + let u8_level = Into::::into(self.tree_level); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); let (left, right, _) = self.last_result; @@ -679,7 +679,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] } else { - ops.into_iter().map(recurse).flatten().collect() + ops.iter().map(recurse).flatten().collect() }, Operation::Query(query) => vec![vec![vec![query.clone()]]], } diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 0dbf3ee1a..e2fb81aaf 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -30,30 +30,28 @@ impl<'t> Final<'t> { #[logging_timer::time("Final::{}")] pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> anyhow::Result> { - loop { - debug!("Final iteration"); - let mut criterion_parameters = CriterionParameters { - wdcache: &mut self.wdcache, - // returned_candidates is merged with excluded_candidates to avoid duplicas - excluded_candidates: &(&self.returned_candidates | excluded_candidates), - }; + debug!("Final iteration"); + let mut criterion_parameters = CriterionParameters { + wdcache: &mut self.wdcache, + // returned_candidates is merged with excluded_candidates to avoid duplicas + excluded_candidates: &(&self.returned_candidates | excluded_candidates), + }; - match self.parent.next(&mut criterion_parameters)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (candidates, query_tree.as_ref()) { - (Some(candidates), _) => candidates, - (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, - (None, None) => self.ctx.documents_ids()?, - }; + match self.parent.next(&mut criterion_parameters)? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (candidates, query_tree.as_ref()) { + (Some(candidates), _) => candidates, + (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, + }; - let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); + let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); - self.returned_candidates |= &candidates; + self.returned_candidates |= &candidates; - return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); - }, - None => return Ok(None), - } + return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); + }, + None => return Ok(None), } } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 288a92f65..059d52e48 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -70,22 +70,24 @@ impl<'t> Criterion for Typo<'t> { }, Some((_, query_tree, candidates_authorization)) => { let fst = self.ctx.words_fst(); - let new_query_tree = if self.typos < MAX_TYPOS_PER_WORD { - alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? - } else if self.typos == MAX_TYPOS_PER_WORD { - // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, - // we keep the altered query tree - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; - // we compute the allowed candidates - let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; - // we assign the allowed candidates to the candidates authorization. - *candidates_authorization = match take(candidates_authorization) { - Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), - Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), - }; - query_tree.clone() - } else { - query_tree.clone() + let new_query_tree = match self.typos { + typos if typos < MAX_TYPOS_PER_WORD => { + alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? + }, + MAX_TYPOS_PER_WORD => { + // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, + // we keep the altered query tree + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; + // we compute the allowed candidates + let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; + // we assign the allowed candidates to the candidates authorization. + *candidates_authorization = match take(candidates_authorization) { + Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), + Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), + }; + query_tree.clone() + }, + _otherwise => query_tree.clone(), }; let mut candidates = resolve_candidates( @@ -187,7 +189,7 @@ fn alterate_query_tree( } else { let typo = *typo.min(&number_typos); let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; - let queries = words.into_iter().map(|(word, typo)| { + let queries = words.iter().map(|(word, typo)| { Operation::Query(Query { prefix: false, kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() }, diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index b74b8af58..4876e37c8 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -241,7 +241,7 @@ impl<'a> QueryTreeBuilder<'a> { } /// Split the word depending on the frequency of subwords in the database documents. -fn split_best_frequency<'a>(ctx: &impl Context, word: &'a str) -> heed::Result> { +fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result> { let chars = word.char_indices().skip(1); let mut best = None; @@ -438,14 +438,14 @@ fn create_query_tree( let start = number_phrases + (number_phrases == 0) as usize; for len in start..=query.len() { let mut word_count = len - number_phrases; - let query: Vec<_> = query.iter().filter_map(|p| { + let query: Vec<_> = query.iter().filter(|p| { if p.is_phrase() { - Some(p) + true } else if word_count != 0 { word_count -= 1; - Some(p) + true } else { - None + false } }) .cloned() From a3944a7083ebe3a4729051b59da03f30d2e78512 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 10 May 2021 12:33:37 +0200 Subject: [PATCH 3/3] Introduce a filtered_candidates field --- milli/src/search/criteria/asc_desc.rs | 20 ++++++++++---------- milli/src/search/criteria/attribute.rs | 16 +++++++++++++--- milli/src/search/criteria/exactness.rs | 16 +++++++++++----- milli/src/search/criteria/final.rs | 19 ++++++++++++------- milli/src/search/criteria/initial.rs | 5 +++-- milli/src/search/criteria/mod.rs | 2 ++ milli/src/search/criteria/proximity.rs | 12 +++++++++--- milli/src/search/criteria/typo.rs | 20 ++++++++++++++------ milli/src/search/criteria/words.rs | 16 +++++++++------- 9 files changed, 83 insertions(+), 43 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 31edc453e..0511ce319 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -93,23 +93,22 @@ impl<'t> Criterion for AscDesc<'t> { match self.candidates.next().transpose()? { None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { self.query_tree = query_tree; - let candidates = match (&self.query_tree, candidates) { - (_, Some(mut candidates)) => { - candidates.intersect_with(&self.faceted_candidates); - candidates - }, + let mut candidates = match (&self.query_tree, candidates) { + (_, Some(candidates)) => candidates & &self.faceted_candidates, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, params.wdcache)?; - candidates -= params.excluded_candidates; - candidates.intersect_with(&self.faceted_candidates); - candidates + let candidates = resolve_query_tree(&context, qt, params.wdcache)?; + candidates & &self.faceted_candidates }, (None, None) => take(&mut self.faceted_candidates), }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + match bucket_candidates { Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, None => self.bucket_candidates |= &candidates, @@ -136,6 +135,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 9d4d88d0d..6818e02fd 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -25,6 +25,7 @@ const LEVEL_EXPONENTIATION_BASE: u32 = 4; const CANDIDATES_THRESHOLD: u64 = 1000; type FlattenedQueryTree = Vec>>; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, @@ -59,6 +60,7 @@ impl<'t> Criterion for Attribute<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, @@ -78,6 +80,7 @@ impl<'t> Criterion for Attribute<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, @@ -89,6 +92,7 @@ impl<'t> Criterion for Attribute<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, @@ -102,17 +106,22 @@ impl<'t> Criterion for Attribute<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates: Some(found_candidates), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { - let candidates = match candidates { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { + let mut candidates = match candidates { Some(candidates) => candidates, None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + let flattened_query_tree = flatten_query_tree(&query_tree); match bucket_candidates { @@ -123,10 +132,11 @@ impl<'t> Criterion for Attribute<'t> { self.state = Some((query_tree, flattened_query_tree, candidates)); self.current_buckets = None; }, - Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 2fd216053..b1026ccc2 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -66,17 +66,22 @@ impl<'t> Criterion for Exactness<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { - let candidates = match candidates { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { + let mut candidates = match candidates { Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)?, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + match bucket_candidates { Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, None => self.bucket_candidates |= &candidates, @@ -85,10 +90,11 @@ impl<'t> Criterion for Exactness<'t> { self.state = Some(State::new(candidates)); self.query_tree = Some(query_tree); }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { - query_tree, + query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index e2fb81aaf..860362f51 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -31,27 +31,32 @@ impl<'t> Final<'t> { #[logging_timer::time("Final::{}")] pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> anyhow::Result> { debug!("Final iteration"); + let excluded_candidates = &self.returned_candidates | excluded_candidates; let mut criterion_parameters = CriterionParameters { wdcache: &mut self.wdcache, // returned_candidates is merged with excluded_candidates to avoid duplicas - excluded_candidates: &(&self.returned_candidates | excluded_candidates), + excluded_candidates: &excluded_candidates, }; match self.parent.next(&mut criterion_parameters)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (candidates, query_tree.as_ref()) { + Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { + let mut candidates = match (candidates, query_tree.as_ref()) { (Some(candidates), _) => candidates, - (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, - (None, None) => self.ctx.documents_ids()?, + (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates, + (None, None) => self.ctx.documents_ids()? - excluded_candidates, }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); self.returned_candidates |= &candidates; - return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); + Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) }, - None => return Ok(None), + None => Ok(None), } } } diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index eb2bf0b95..5d242a0eb 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -9,10 +9,11 @@ pub struct Initial { } impl Initial { - pub fn new(query_tree: Option, mut candidates: Option) -> Initial { + pub fn new(query_tree: Option, filtered_candidates: Option) -> Initial { let answer = CriterionResult { query_tree, - candidates: candidates.take(), + candidates: None, + filtered_candidates, bucket_candidates: None, }; Initial { answer: Some(answer) } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index f9fca2624..99e4a4209 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -38,6 +38,8 @@ pub struct CriterionResult { /// The candidates that this criterion is allowed to return subsets of, /// if None, it is up to the child to compute the candidates itself. candidates: Option, + /// The candidates, coming from facet filters, that this criterion is allowed to return subsets of. + filtered_candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. bucket_candidates: Option, } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index e50d3941d..bf9be9b9f 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -113,17 +113,22 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), candidates: Some(new_candidates), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { - let candidates = match candidates { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { + let mut candidates = match candidates { Some(candidates) => candidates, None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + match bucket_candidates { Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, None => self.bucket_candidates |= &candidates, @@ -134,10 +139,11 @@ impl<'t> Criterion for Proximity<'t> { self.proximity = 0; self.plane_sweep_cache = None; }, - Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 059d52e48..a844417eb 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -119,30 +119,33 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: Some(candidates), + filtered_candidates: None, bucket_candidates: Some(bucket_candidates), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; - let candidates = candidates.map_or_else(|| { - Candidates::Forbidden(params.excluded_candidates.clone()) - }, Candidates::Allowed); + let candidates = match candidates.or(filtered_candidates) { + Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates), + None => Candidates::Forbidden(params.excluded_candidates.clone()), + }; let maximum_typos = maximum_typo(&query_tree) as u8; self.state = Some((maximum_typos, query_tree, candidates)); self.typos = 0; }, - Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, @@ -377,6 +380,7 @@ mod test { ])), candidates: Some(candidates_1.clone()), bucket_candidates: Some(candidates_1), + filtered_candidates: None, }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -399,6 +403,7 @@ mod test { ])), candidates: Some(candidates_2.clone()), bucket_candidates: Some(candidates_2), + filtered_candidates: None, }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); @@ -419,8 +424,9 @@ mod test { let expected = CriterionResult { query_tree: None, - candidates: Some(facet_candidates.clone()), + candidates: None, bucket_candidates: None, + filtered_candidates: Some(facet_candidates.clone()), }; // first iteration, returns the facet candidates @@ -464,6 +470,7 @@ mod test { ])), candidates: Some(&candidates_1 & &facet_candidates), bucket_candidates: Some(&candidates_1 & &facet_candidates), + filtered_candidates: None, }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -486,6 +493,7 @@ mod test { ])), candidates: Some(&candidates_2 & &facet_candidates), bucket_candidates: Some(&candidates_2 & &facet_candidates), + filtered_candidates: None, }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 2f7ebbfbf..8730fa331 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -11,8 +11,8 @@ pub struct Words<'t> { query_trees: Vec, candidates: Option, bucket_candidates: Option, + filtered_candidates: Option, parent: Box, - compute_candidates: bool, } impl<'t> Words<'t> { @@ -23,7 +23,7 @@ impl<'t> Words<'t> { candidates: None, bucket_candidates: None, parent, - compute_candidates: false, + filtered_candidates: None, } } } @@ -42,13 +42,13 @@ impl<'t> Criterion for Words<'t> { match self.query_trees.pop() { Some(query_tree) => { let candidates = match self.candidates.as_mut() { - Some(allowed_candidates) if self.compute_candidates => { + Some(allowed_candidates) => { let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; candidates &= &*allowed_candidates; *allowed_candidates -= &candidates; Some(candidates) }, - candidates => candidates.cloned(), + None => None, }; let bucket_candidates = match self.bucket_candidates.as_mut() { @@ -59,25 +59,27 @@ impl<'t> Criterion for Words<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates, + filtered_candidates: self.filtered_candidates.clone(), bucket_candidates, })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { self.query_trees = explode_query_tree(query_tree); self.candidates = candidates; - self.compute_candidates = bucket_candidates.is_some(); + self.filtered_candidates = filtered_candidates; self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; }, - Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); },