227: Replace Consecutive by Phrase in query tree r=Kerollmops a=ManyTheFish

Replace `Consecutive` by `Phrase` in the query tree in order to remove theoretical bugs,
due to the `Consecutive` enum type.

Co-authored-by: many <maxime@meilisearch.com>
Co-authored-by: Many <legendre.maxime.isn@gmail.com>
This commit is contained in:
bors[bot] 2021-06-10 09:31:39 +00:00 committed by GitHub
commit 3e6c05fe13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 130 additions and 104 deletions

View File

@ -578,7 +578,6 @@ fn linear_compute_candidates(
fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 { fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
let mut min_rank = u64::max_value(); let mut min_rank = u64::max_value();
for branch in branches { for branch in branches {
let branch_len = branch.len(); let branch_len = branch.len();
let mut branch_rank = Vec::with_capacity(branch_len); let mut branch_rank = Vec::with_capacity(branch_len);
for derivates in branch { for derivates in branch {
@ -661,7 +660,7 @@ fn linear_compute_candidates(
// TODO can we keep refs of Query // TODO can we keep refs of Query
fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
use crate::search::criteria::Operation::{And, Or, Consecutive}; use crate::search::criteria::Operation::{And, Or, Phrase};
fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
match tail.split_first() { match tail.split_first() {
@ -683,7 +682,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
fn recurse(op: &Operation) -> FlattenedQueryTree { fn recurse(op: &Operation) -> FlattenedQueryTree {
match op { match op {
And(ops) | Consecutive(ops) => { And(ops) => {
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
}, },
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
@ -691,6 +690,12 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
} else { } else {
ops.iter().map(recurse).flatten().collect() ops.iter().map(recurse).flatten().collect()
}, },
Phrase(words) => {
let queries = words.iter().map(|word| {
vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}]
}).collect();
vec![queries]
}
Operation::Query(query) => vec![vec![vec![query.clone()]]], Operation::Query(query) => vec![vec![vec![query.clone()]]],
} }
} }

View File

@ -1,7 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::borrow::Cow; use std::borrow::Cow;
use anyhow::bail;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
@ -239,7 +238,7 @@ pub fn resolve_query_tree<'t>(
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap> ) -> anyhow::Result<RoaringBitmap>
{ {
use Operation::{And, Consecutive, Or, Query}; use Operation::{And, Phrase, Or, Query};
match query_tree { match query_tree {
And(ops) => { And(ops) => {
@ -261,26 +260,23 @@ pub fn resolve_query_tree<'t>(
} }
Ok(candidates) Ok(candidates)
}, },
Consecutive(ops) => { Phrase(words) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let mut first_loop = true; let mut first_loop = true;
for slice in ops.windows(2) { for slice in words.windows(2) {
match (&slice[0], &slice[1]) { let (left, right) = (&slice[0], &slice[1]);
(Operation::Query(left), Operation::Query(right)) => { match ctx.word_pair_proximity_docids(left, right, 1)? {
match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? { Some(pair_docids) => {
pair_docids if pair_docids.is_empty() => { if pair_docids.is_empty() {
return Ok(RoaringBitmap::new()) return Ok(RoaringBitmap::new());
}, } else if first_loop {
pair_docids if first_loop => {
candidates = pair_docids; candidates = pair_docids;
first_loop = false; first_loop = false;
}, } else {
pair_docids => { candidates &= pair_docids;
candidates.intersect_with(&pair_docids);
},
} }
}, },
_ => bail!("invalid consecutive query type"), None => return Ok(RoaringBitmap::new())
} }
} }
Ok(candidates) Ok(candidates)

View File

@ -171,12 +171,33 @@ fn resolve_candidates<'t>(
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>> ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
{ {
use Operation::{And, Consecutive, Or, Query}; use Operation::{And, Phrase, Or};
let result = match query_tree { let result = match query_tree {
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
Consecutive(ops) => if proximity == 0 { Phrase(words) => if proximity == 0 {
mdfs(ctx, ops, 0, cache, wdcache)? let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let mut candidates = None;
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
match candidates.as_mut() {
Some(candidates) => *candidates &= pair_docids,
None => candidates = Some(pair_docids),
}
},
None => {
candidates = None;
break;
}
}
}
match (most_left, most_right, candidates) {
(Some(l), Some(r), Some(c)) => vec![(l, r, c)],
_otherwise => Default::default(),
}
} else { } else {
Default::default() Default::default()
}, },
@ -188,7 +209,7 @@ fn resolve_candidates<'t>(
} }
output output
}, },
Query(q) => if proximity == 0 { Operation::Query(q) => if proximity == 0 {
let candidates = query_docids(ctx, q, wdcache)?; let candidates = query_docids(ctx, q, wdcache)?;
vec![(q.clone(), q.clone(), candidates)] vec![(q.clone(), q.clone(), candidates)]
} else { } else {
@ -306,14 +327,9 @@ fn resolve_plane_sweep_candidates(
) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>> ) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>>
{ {
/// FIXME may be buggy with query like "new new york" /// FIXME may be buggy with query like "new new york"
fn plane_sweep<'a>( fn plane_sweep(
ctx: &dyn Context, groups_positions: Vec<Vec<(Position, u8, Position)>>,
operations: &'a [Operation],
docid: DocumentId,
consecutive: bool, consecutive: bool,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>> ) -> anyhow::Result<Vec<(Position, u8, Position)>>
{ {
fn compute_groups_proximity( fn compute_groups_proximity(
@ -362,13 +378,9 @@ fn resolve_plane_sweep_candidates(
} }
} }
let groups_len = operations.len(); let groups_len = groups_positions.len();
let mut groups_positions = Vec::with_capacity(groups_len);
for operation in operations { let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions.into_iter());
}
// Pop top elements of each list. // Pop top elements of each list.
let mut current = Vec::with_capacity(groups_len); let mut current = Vec::with_capacity(groups_len);
@ -441,15 +453,32 @@ fn resolve_plane_sweep_candidates(
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>> ) -> anyhow::Result<Vec<(Position, u8, Position)>>
{ {
use Operation::{And, Consecutive, Or}; use Operation::{And, Phrase, Or};
if let Some(result) = rocache.get(query_tree) { if let Some(result) = rocache.get(query_tree) {
return Ok(result.clone()); return Ok(result.clone());
} }
let result = match query_tree { let result = match query_tree {
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?, And(ops) => {
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?, let mut groups_positions = Vec::with_capacity(ops.len());
for operation in ops {
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions);
}
plane_sweep(groups_positions, false)?
},
Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len());
for word in words {
let positions = match words_positions.get(word) {
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
None => vec![],
};
groups_positions.push(positions);
}
plane_sweep(groups_positions, true)?
},
Or(_, ops) => { Or(_, ops) => {
let mut result = Vec::new(); let mut result = Vec::new();
for op in ops { for op in ops {

View File

@ -1,6 +1,5 @@
use std::{borrow::Cow, collections::HashMap, mem::take}; use std::{borrow::Cow, collections::HashMap, mem::take};
use anyhow::bail;
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -13,7 +12,6 @@ use super::{
CriterionParameters, CriterionParameters,
CriterionResult, CriterionResult,
query_docids, query_docids,
query_pair_proximity_docids,
resolve_query_tree, resolve_query_tree,
}; };
@ -174,12 +172,14 @@ fn alterate_query_tree(
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<()> ) -> anyhow::Result<()>
{ {
use Operation::{And, Consecutive, Or}; use Operation::{And, Phrase, Or};
match operation { match operation {
And(ops) | Consecutive(ops) | Or(_, ops) => { And(ops) | Or(_, ops) => {
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
}, },
// Because Phrases don't allow typos, no alteration can be done.
Phrase(_words) => return Ok(()),
Operation::Query(q) => { Operation::Query(q) => {
if let QueryKind::Tolerant { typo, word } = &q.kind { if let QueryKind::Tolerant { typo, word } = &q.kind {
// if no typo is allowed we don't call word_derivations function, // if no typo is allowed we don't call word_derivations function,
@ -228,32 +228,29 @@ fn resolve_candidates<'t>(
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap> ) -> anyhow::Result<RoaringBitmap>
{ {
use Operation::{And, Consecutive, Or, Query}; use Operation::{And, Phrase, Or, Query};
match query_tree { match query_tree {
And(ops) => { And(ops) => {
mdfs(ctx, ops, number_typos, cache, wdcache) mdfs(ctx, ops, number_typos, cache, wdcache)
}, },
Consecutive(ops) => { Phrase(words) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let mut first_loop = true; let mut first_loop = true;
for slice in ops.windows(2) { for slice in words.windows(2) {
match (&slice[0], &slice[1]) { let (left, right) = (&slice[0], &slice[1]);
(Operation::Query(left), Operation::Query(right)) => { match ctx.word_pair_proximity_docids(left, right, 1)? {
match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? { Some(pair_docids) => {
pair_docids if pair_docids.is_empty() => { if pair_docids.is_empty() {
return Ok(RoaringBitmap::new()) return Ok(RoaringBitmap::new());
}, } else if first_loop {
pair_docids if first_loop => {
candidates = pair_docids; candidates = pair_docids;
first_loop = false; first_loop = false;
}, } else {
pair_docids => { candidates &= pair_docids;
candidates.intersect_with(&pair_docids);
},
} }
}, },
_ => bail!("invalid consecutive query type"), None => return Ok(RoaringBitmap::new())
} }
} }
Ok(candidates) Ok(candidates)

View File

@ -52,13 +52,18 @@ impl MatchingWords {
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
match tree { match tree {
Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { Operation::Or(_, ops) | Operation::And(ops) => {
ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
}, },
Operation::Query(Query { prefix, kind }) => { Operation::Query(Query { prefix, kind }) => {
let typo = if kind.is_exact() { 0 } else { kind.typo() }; let typo = if kind.is_exact() { 0 } else { kind.typo() };
out.insert((kind.word(), typo, *prefix)); out.insert((kind.word(), typo, *prefix));
}, },
Operation::Phrase(words) => {
for word in words {
out.insert((word, 0, false));
}
}
} }
} }

View File

@ -15,7 +15,8 @@ type IsPrefix = bool;
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation { pub enum Operation {
And(Vec<Operation>), And(Vec<Operation>),
Consecutive(Vec<Operation>), // serie of consecutive non prefix and exact words
Phrase(Vec<String>),
Or(IsOptionalWord, Vec<Operation>), Or(IsOptionalWord, Vec<Operation>),
Query(Query), Query(Query),
} }
@ -28,9 +29,8 @@ impl fmt::Debug for Operation {
writeln!(f, "{:1$}AND", "", depth * 2)?; writeln!(f, "{:1$}AND", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
}, },
Operation::Consecutive(children) => { Operation::Phrase(children) => {
writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?; writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
}, },
Operation::Or(true, children) => { Operation::Or(true, children) => {
writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
@ -49,14 +49,6 @@ impl fmt::Debug for Operation {
} }
impl Operation { impl Operation {
fn phrase(words: Vec<String>) -> Operation {
Operation::consecutive(
words.into_iter().map(|s| {
Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) })
}).collect()
)
}
fn and(mut ops: Vec<Self>) -> Self { fn and(mut ops: Vec<Self>) -> Self {
if ops.len() == 1 { if ops.len() == 1 {
ops.pop().unwrap() ops.pop().unwrap()
@ -73,11 +65,11 @@ impl Operation {
} }
} }
fn consecutive(mut ops: Vec<Self>) -> Self { fn phrase(mut words: Vec<String>) -> Self {
if ops.len() == 1 { if words.len() == 1 {
ops.pop().unwrap() Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) })
} else { } else {
Self::Consecutive(ops) Self::Phrase(words)
} }
} }
@ -256,10 +248,10 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
} }
} }
Ok(best.map(|(_, left, right)| Operation::Consecutive( Ok(best.map(|(_, left, right)| Operation::Phrase(
vec![ vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(left.to_string()) }), left.to_string(),
Operation::Query(Query { prefix: false, kind: QueryKind::exact(right.to_string()) }) right.to_string()
] ]
))) )))
} }
@ -494,24 +486,26 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
/// Returns the maximum number of typos that this Operation allows. /// Returns the maximum number of typos that this Operation allows.
pub fn maximum_typo(operation: &Operation) -> usize { pub fn maximum_typo(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Consecutive}; use Operation::{Or, And, Query, Phrase};
match operation { match operation {
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::<usize>(), And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
Query(q) => q.kind.typo() as usize, Query(q) => q.kind.typo() as usize,
// no typo allowed in phrases
Phrase(_) => 0,
} }
} }
/// Returns the maximum proximity that this Operation allows. /// Returns the maximum proximity that this Operation allows.
pub fn maximum_proximity(operation: &Operation) -> usize { pub fn maximum_proximity(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Consecutive}; use Operation::{Or, And, Query, Phrase};
match operation { match operation {
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
And(ops) => { And(ops) => {
ops.iter().map(maximum_proximity).sum::<usize>() ops.iter().map(maximum_proximity).sum::<usize>()
+ ops.len().saturating_sub(1) * 7 + ops.len().saturating_sub(1) * 7
}, },
Query(_) | Consecutive(_) => 0, Query(_) | Phrase(_) => 0,
} }
} }
@ -765,9 +759,9 @@ mod test {
let expected = Operation::Or(false, vec![ let expected = Operation::Or(false, vec![
Operation::And(vec![ Operation::And(vec![
Operation::Or(false, vec![ Operation::Or(false, vec![
Operation::Consecutive(vec![ Operation::Phrase(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }), "word".to_string(),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), "split".to_string(),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }),
]), ]),
@ -789,9 +783,9 @@ mod test {
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::And(vec![ let expected = Operation::And(vec![
Operation::Consecutive(vec![ Operation::Phrase(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), "hey".to_string(),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), "friends".to_string(),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
]); ]);
@ -809,13 +803,13 @@ mod test {
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::And(vec![ let expected = Operation::And(vec![
Operation::Consecutive(vec![ Operation::Phrase(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), "hey".to_string(),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), "friends".to_string(),
]), ]),
Operation::Consecutive(vec![ Operation::Phrase(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), "wooop".to_string(),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), "wooop".to_string(),
]), ]),
]); ]);
@ -870,9 +864,9 @@ mod test {
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::Consecutive(vec![ let expected = Operation::Phrase(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), "hey".to_string(),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), "my".to_string(),
]); ]);
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
@ -940,9 +934,9 @@ mod test {
let tokens = result.tokens(); let tokens = result.tokens();
let expected = Operation::And(vec![ let expected = Operation::And(vec![
Operation::Consecutive(vec![ Operation::Phrase(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), "hey".to_string(),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), "my".to_string(),
]), ]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
]); ]);