Replace Consecutive by Phrase in query tree

Replace Consecutive by Phrase in the query tree in order to remove theoretical bugs caused by the Consecutive enum type.
many 2021-06-09 17:28:12 +02:00
parent bc02031793
commit e923a3ed6a
6 changed files with 130 additions and 104 deletions
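
The heart of the change is the `Operation` enum itself (last diff below): the `Consecutive` variant, which wrapped arbitrary sub-operations, is replaced by a `Phrase` variant that holds the phrase words directly, so a phrase can no longer contain prefix, tolerant, or nested operations. A rough before/after summary of the enum, condensed from that diff (not a compilable pair, since both versions share one name):

    // Before: a phrase is a series of sub-operations, which in theory could
    // hold anything (prefixes, typos, nested Or/And), hence the theoretical bugs.
    pub enum Operation {
        And(Vec<Operation>),
        Consecutive(Vec<Operation>),
        Or(IsOptionalWord, Vec<Operation>),
        Query(Query),
    }

    // After: a phrase is just the list of its words, implicitly exact and non-prefix.
    pub enum Operation {
        And(Vec<Operation>),
        // serie of consecutive non prefix and exact words
        Phrase(Vec<String>),
        Or(IsOptionalWord, Vec<Operation>),
        Query(Query),
    }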

View File

@@ -578,7 +578,6 @@ fn linear_compute_candidates(
 fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
     let mut min_rank = u64::max_value();
     for branch in branches {
         let branch_len = branch.len();
         let mut branch_rank = Vec::with_capacity(branch_len);
         for derivates in branch {
@@ -661,7 +660,7 @@ fn linear_compute_candidates(
 // TODO can we keep refs of Query
 fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
-    use crate::search::criteria::Operation::{And, Or, Consecutive};
+    use crate::search::criteria::Operation::{And, Or, Phrase};
     fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
         match tail.split_first() {
@@ -683,7 +682,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
     fn recurse(op: &Operation) -> FlattenedQueryTree {
         match op {
-            And(ops) | Consecutive(ops) => {
+            And(ops) => {
                 ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
             },
             Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
@@ -691,6 +690,12 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
             } else {
                 ops.iter().map(recurse).flatten().collect()
             },
+            Phrase(words) => {
+                let queries = words.iter().map(|word| {
+                    vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}]
+                }).collect();
+                vec![queries]
+            }
             Operation::Query(query) => vec![vec![vec![query.clone()]]],
         }
     }
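
In `flatten_query_tree` above, a `Phrase` now flattens into a single branch with one group of exact, non-prefix queries per word. A small illustration of the shape the new arm produces, assuming the `Query`, `QueryKind`, and `FlattenedQueryTree` definitions from this file; the example phrase is made up:

    // For the phrase ["hey", "friends"], the Phrase arm builds one Vec<Query>
    // per word, every query exact and non-prefix, all grouped into one branch.
    let words = vec!["hey".to_string(), "friends".to_string()];
    let queries: Vec<Vec<Query>> = words.iter().map(|word| {
        vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]
    }).collect();
    let flattened: FlattenedQueryTree = vec![queries];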

View File

@@ -1,7 +1,6 @@
 use std::collections::HashMap;
 use std::borrow::Cow;
-use anyhow::bail;
 use roaring::RoaringBitmap;
 use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
@@ -239,7 +238,7 @@ pub fn resolve_query_tree<'t>(
     wdcache: &mut WordDerivationsCache,
 ) -> anyhow::Result<RoaringBitmap>
 {
-    use Operation::{And, Consecutive, Or, Query};
+    use Operation::{And, Phrase, Or, Query};
     match query_tree {
         And(ops) => {
@@ -261,26 +260,23 @@
             }
             Ok(candidates)
         },
-        Consecutive(ops) => {
+        Phrase(words) => {
             let mut candidates = RoaringBitmap::new();
             let mut first_loop = true;
-            for slice in ops.windows(2) {
-                match (&slice[0], &slice[1]) {
-                    (Operation::Query(left), Operation::Query(right)) => {
-                        match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
-                            pair_docids if pair_docids.is_empty() => {
-                                return Ok(RoaringBitmap::new())
-                            },
-                            pair_docids if first_loop => {
-                                candidates = pair_docids;
-                                first_loop = false;
-                            },
-                            pair_docids => {
-                                candidates.intersect_with(&pair_docids);
-                            },
-                        }
-                    },
-                    _ => bail!("invalid consecutive query type"),
+            for slice in words.windows(2) {
+                let (left, right) = (&slice[0], &slice[1]);
+                match ctx.word_pair_proximity_docids(left, right, 1)? {
+                    Some(pair_docids) => {
+                        if pair_docids.is_empty() {
+                            return Ok(RoaringBitmap::new());
+                        } else if first_loop {
+                            candidates = pair_docids;
+                            first_loop = false;
+                        } else {
+                            candidates &= pair_docids;
+                        }
+                    },
+                    None => return Ok(RoaringBitmap::new())
                 }
             }
             Ok(candidates)
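
Resolving a `Phrase` therefore reduces to intersecting, over every adjacent pair of phrase words, the set of documents where the pair occurs at proximity 1; an unindexed pair short-circuits to an empty result. A standalone sketch of that logic, with a hypothetical `pair_docids` lookup standing in for `Context::word_pair_proximity_docids`:

    use roaring::RoaringBitmap;

    // Hypothetical lookup: docids where `left` is followed by `right` at proximity 1.
    fn pair_docids(left: &str, right: &str) -> anyhow::Result<Option<RoaringBitmap>> {
        Ok(Some(RoaringBitmap::new())) // stub; the real data comes from the index
    }

    // Documents matching the whole phrase: the intersection over all adjacent pairs.
    fn resolve_phrase(words: &[String]) -> anyhow::Result<RoaringBitmap> {
        let mut candidates: Option<RoaringBitmap> = None;
        for slice in words.windows(2) {
            match pair_docids(&slice[0], &slice[1])? {
                // A pair never seen at proximity 1 means no document matches the phrase.
                None => return Ok(RoaringBitmap::new()),
                Some(docids) => match candidates.as_mut() {
                    Some(candidates) => *candidates &= docids,
                    None => candidates = Some(docids),
                },
            }
        }
        Ok(candidates.unwrap_or_else(RoaringBitmap::new))
    }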

View File

@@ -171,12 +171,33 @@ fn resolve_candidates<'t>(
     wdcache: &mut WordDerivationsCache,
 ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
 {
-    use Operation::{And, Consecutive, Or, Query};
+    use Operation::{And, Phrase, Or};
     let result = match query_tree {
         And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
-        Consecutive(ops) => if proximity == 0 {
-            mdfs(ctx, ops, 0, cache, wdcache)?
+        Phrase(words) => if proximity == 0 {
+            let most_left = words.first().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())});
+            let most_right = words.last().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())});
+            let mut candidates = None;
+            for slice in words.windows(2) {
+                let (left, right) = (&slice[0], &slice[1]);
+                match ctx.word_pair_proximity_docids(left, right, 1)? {
+                    Some(pair_docids) => {
+                        match candidates.as_mut() {
+                            Some(candidates) => *candidates &= pair_docids,
+                            None => candidates = Some(pair_docids),
+                        }
+                    },
+                    None => {
+                        candidates = None;
+                        break;
+                    }
+                }
+            }
+            match (most_left, most_right, candidates) {
+                (Some(l), Some(r), Some(c)) => vec![(l, r, c)],
+                _otherwise => Default::default(),
+            }
         } else {
             Default::default()
         },
@@ -188,7 +209,7 @@ fn resolve_candidates<'t>(
             }
             output
         },
-        Query(q) => if proximity == 0 {
+        Operation::Query(q) => if proximity == 0 {
             let candidates = query_docids(ctx, q, wdcache)?;
             vec![(q.clone(), q.clone(), candidates)]
         } else {
@@ -306,14 +327,9 @@ fn resolve_plane_sweep_candidates(
 ) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>>
 {
     /// FIXME may be buggy with query like "new new york"
-    fn plane_sweep<'a>(
-        ctx: &dyn Context,
-        operations: &'a [Operation],
-        docid: DocumentId,
+    fn plane_sweep(
+        groups_positions: Vec<Vec<(Position, u8, Position)>>,
         consecutive: bool,
-        rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
-        words_positions: &HashMap<String, RoaringBitmap>,
-        wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<Vec<(Position, u8, Position)>>
     {
         fn compute_groups_proximity(
@@ -362,13 +378,9 @@ fn resolve_plane_sweep_candidates(
             }
         }
-        let groups_len = operations.len();
-        let mut groups_positions = Vec::with_capacity(groups_len);
-        for operation in operations {
-            let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
-            groups_positions.push(positions.into_iter());
-        }
+        let groups_len = groups_positions.len();
+        let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
         // Pop top elements of each list.
         let mut current = Vec::with_capacity(groups_len);
@@ -441,15 +453,32 @@ fn resolve_plane_sweep_candidates(
         wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<Vec<(Position, u8, Position)>>
     {
-        use Operation::{And, Consecutive, Or};
+        use Operation::{And, Phrase, Or};
         if let Some(result) = rocache.get(query_tree) {
             return Ok(result.clone());
         }
         let result = match query_tree {
-            And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
-            Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
+            And(ops) => {
+                let mut groups_positions = Vec::with_capacity(ops.len());
+                for operation in ops {
+                    let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
+                    groups_positions.push(positions);
+                }
+                plane_sweep(groups_positions, false)?
+            },
+            Phrase(words) => {
+                let mut groups_positions = Vec::with_capacity(words.len());
+                for word in words {
+                    let positions = match words_positions.get(word) {
+                        Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
+                        None => vec![],
+                    };
+                    groups_positions.push(positions);
+                }
+                plane_sweep(groups_positions, true)?
+            },
             Or(_, ops) => {
                 let mut result = Vec::new();
                 for op in ops {
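
`plane_sweep` no longer receives the query operations, the document id, and the caches; callers now pass the already-resolved position groups, and a `Phrase` contributes one group per word in which every position becomes a zero-width `(position, 0, position)` triple. A rough sketch of how those groups could be built from a document's word positions, assuming `Position` is the crate's `u32` alias and `words_positions` maps each word to its positions in the current document:

    use std::collections::HashMap;
    use roaring::RoaringBitmap;

    type Position = u32; // assumption: matches the crate's Position alias

    // One positions group per phrase word, ready to be passed to plane_sweep(groups, true).
    fn phrase_groups(
        words: &[String],
        words_positions: &HashMap<String, RoaringBitmap>,
    ) -> Vec<Vec<(Position, u8, Position)>> {
        words.iter().map(|word| match words_positions.get(word) {
            // A single word covers exactly one position, at distance 0 from itself.
            Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
            // Word absent from this document: an empty group, so the sweep finds nothing.
            None => vec![],
        }).collect()
    }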

View File

@@ -1,6 +1,5 @@
 use std::{borrow::Cow, collections::HashMap, mem::take};
-use anyhow::bail;
 use log::debug;
 use roaring::RoaringBitmap;
@@ -13,7 +12,6 @@ use super::{
     CriterionParameters,
     CriterionResult,
     query_docids,
-    query_pair_proximity_docids,
     resolve_query_tree,
 };
@@ -174,12 +172,14 @@ fn alterate_query_tree(
         wdcache: &mut WordDerivationsCache,
     ) -> anyhow::Result<()>
     {
-        use Operation::{And, Consecutive, Or};
+        use Operation::{And, Phrase, Or};
         match operation {
-            And(ops) | Consecutive(ops) | Or(_, ops) => {
+            And(ops) | Or(_, ops) => {
                 ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
             },
+            // Because Phrases don't allow typos, no alteration can be done.
+            Phrase(_words) => return Ok(()),
             Operation::Query(q) => {
                 if let QueryKind::Tolerant { typo, word } = &q.kind {
                     // if no typo is allowed we don't call word_derivations function,
@@ -228,32 +228,29 @@ fn resolve_candidates<'t>(
     wdcache: &mut WordDerivationsCache,
 ) -> anyhow::Result<RoaringBitmap>
 {
-    use Operation::{And, Consecutive, Or, Query};
+    use Operation::{And, Phrase, Or, Query};
     match query_tree {
         And(ops) => {
             mdfs(ctx, ops, number_typos, cache, wdcache)
         },
-        Consecutive(ops) => {
+        Phrase(words) => {
             let mut candidates = RoaringBitmap::new();
             let mut first_loop = true;
-            for slice in ops.windows(2) {
-                match (&slice[0], &slice[1]) {
-                    (Operation::Query(left), Operation::Query(right)) => {
-                        match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
-                            pair_docids if pair_docids.is_empty() => {
-                                return Ok(RoaringBitmap::new())
-                            },
-                            pair_docids if first_loop => {
-                                candidates = pair_docids;
-                                first_loop = false;
-                            },
-                            pair_docids => {
-                                candidates.intersect_with(&pair_docids);
-                            },
-                        }
-                    },
-                    _ => bail!("invalid consecutive query type"),
+            for slice in words.windows(2) {
+                let (left, right) = (&slice[0], &slice[1]);
+                match ctx.word_pair_proximity_docids(left, right, 1)? {
+                    Some(pair_docids) => {
+                        if pair_docids.is_empty() {
+                            return Ok(RoaringBitmap::new());
+                        } else if first_loop {
+                            candidates = pair_docids;
+                            first_loop = false;
+                        } else {
+                            candidates &= pair_docids;
+                        }
+                    },
+                    None => return Ok(RoaringBitmap::new())
                 }
            }
            Ok(candidates)

View File

@@ -52,13 +52,18 @@ impl MatchingWords {
 fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
     fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
         match tree {
-            Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
+            Operation::Or(_, ops) | Operation::And(ops) => {
                 ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
             },
             Operation::Query(Query { prefix, kind }) => {
                 let typo = if kind.is_exact() { 0 } else { kind.typo() };
                 out.insert((kind.word(), typo, *prefix));
             },
+            Operation::Phrase(words) => {
+                for word in words {
+                    out.insert((word, 0, false));
+                }
+            }
         }
     }

View File

@@ -15,7 +15,8 @@ type IsPrefix = bool;
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum Operation {
     And(Vec<Operation>),
-    Consecutive(Vec<Operation>),
+    // serie of consecutive non prefix and exact words
+    Phrase(Vec<String>),
     Or(IsOptionalWord, Vec<Operation>),
     Query(Query),
 }
@@ -28,9 +29,8 @@ impl fmt::Debug for Operation {
                 writeln!(f, "{:1$}AND", "", depth * 2)?;
                 children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
             },
-            Operation::Consecutive(children) => {
-                writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?;
-                children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
+            Operation::Phrase(children) => {
+                writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
             },
             Operation::Or(true, children) => {
                 writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
@@ -49,14 +49,6 @@ impl fmt::Debug for Operation {
 }
 impl Operation {
-    fn phrase(words: Vec<String>) -> Operation {
-        Operation::consecutive(
-            words.into_iter().map(|s| {
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) })
-            }).collect()
-        )
-    }
     fn and(mut ops: Vec<Self>) -> Self {
         if ops.len() == 1 {
             ops.pop().unwrap()
@@ -73,11 +65,11 @@ impl Operation {
         }
     }
-    fn consecutive(mut ops: Vec<Self>) -> Self {
-        if ops.len() == 1 {
-            ops.pop().unwrap()
+    fn phrase(mut words: Vec<String>) -> Self {
+        if words.len() == 1 {
+            Self::Query(Query {prefix: false, kind: QueryKind::exact(words.pop().unwrap())})
         } else {
-            Self::Consecutive(ops)
+            Self::Phrase(words)
         }
     }
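
A side effect of the rewritten constructor just above: a quoted single word no longer produces a one-element phrase but degenerates into a plain exact, non-prefix query. A quick illustration (hypothetical call site, since `Operation::phrase` is private to the module):

    // Two words: stays a Phrase of exact words.
    let op = Operation::phrase(vec!["hello".to_string(), "world".to_string()]);
    assert_eq!(op, Operation::Phrase(vec!["hello".to_string(), "world".to_string()]));

    // One word: collapses into an ordinary exact, non-prefix Query.
    let op = Operation::phrase(vec!["hello".to_string()]);
    assert_eq!(op, Operation::Query(Query { prefix: false, kind: QueryKind::exact("hello".to_string()) }));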
@@ -256,10 +248,10 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
         }
     }
-    Ok(best.map(|(_, left, right)| Operation::Consecutive(
+    Ok(best.map(|(_, left, right)| Operation::Phrase(
         vec![
-            Operation::Query(Query { prefix: false, kind: QueryKind::exact(left.to_string()) }),
-            Operation::Query(Query { prefix: false, kind: QueryKind::exact(right.to_string()) })
+            left.to_string(),
+            right.to_string()
         ]
     )))
 }
@@ -494,24 +486,26 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
 /// Returns the maximum number of typos that this Operation allows.
 pub fn maximum_typo(operation: &Operation) -> usize {
-    use Operation::{Or, And, Query, Consecutive};
+    use Operation::{Or, And, Query, Phrase};
     match operation {
         Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
-        And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
+        And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
         Query(q) => q.kind.typo() as usize,
+        // no typo allowed in phrases
+        Phrase(_) => 0,
     }
 }
 /// Returns the maximum proximity that this Operation allows.
 pub fn maximum_proximity(operation: &Operation) -> usize {
-    use Operation::{Or, And, Query, Consecutive};
+    use Operation::{Or, And, Query, Phrase};
     match operation {
         Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
         And(ops) => {
             ops.iter().map(maximum_proximity).sum::<usize>()
             + ops.len().saturating_sub(1) * 7
         },
-        Query(_) | Consecutive(_) => 0,
+        Query(_) | Phrase(_) => 0,
     }
 }
@@ -765,9 +759,9 @@ mod test {
         let expected = Operation::Or(false, vec![
             Operation::And(vec![
                 Operation::Or(false, vec![
-                    Operation::Consecutive(vec![
-                        Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }),
-                        Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
+                    Operation::Phrase(vec![
+                        "word".to_string(),
+                        "split".to_string(),
                     ]),
                     Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }),
                 ]),
@@ -789,9 +783,9 @@ mod test {
         let tokens = result.tokens();
         let expected = Operation::And(vec![
-            Operation::Consecutive(vec![
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
+            Operation::Phrase(vec![
+                "hey".to_string(),
+                "friends".to_string(),
             ]),
             Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
         ]);
@@ -809,13 +803,13 @@ mod test {
         let tokens = result.tokens();
         let expected = Operation::And(vec![
-            Operation::Consecutive(vec![
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
+            Operation::Phrase(vec![
+                "hey".to_string(),
+                "friends".to_string(),
             ]),
-            Operation::Consecutive(vec![
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
+            Operation::Phrase(vec![
+                "wooop".to_string(),
+                "wooop".to_string(),
             ]),
         ]);
@@ -870,9 +864,9 @@ mod test {
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
-        let expected = Operation::Consecutive(vec![
-            Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
-            Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
+        let expected = Operation::Phrase(vec![
+            "hey".to_string(),
+            "my".to_string(),
         ]);
         let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
@@ -940,9 +934,9 @@ mod test {
         let tokens = result.tokens();
         let expected = Operation::And(vec![
-            Operation::Consecutive(vec![
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
-                Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
+            Operation::Phrase(vec![
+                "hey".to_string(),
+                "my".to_string(),
             ]),
             Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
         ]);