Replace Consecutive by Phrase in query tree

Replace Consecutive by Phrase in query tree in order to remove theorical bugs,
due of the Consecutive enum type.
This commit is contained in:
many 2021-06-09 17:28:12 +02:00
parent bc02031793
commit e923a3ed6a
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
6 changed files with 130 additions and 104 deletions

View File

@ -578,7 +578,6 @@ fn linear_compute_candidates(
fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
let mut min_rank = u64::max_value();
for branch in branches {
let branch_len = branch.len();
let mut branch_rank = Vec::with_capacity(branch_len);
for derivates in branch {
@ -661,7 +660,7 @@ fn linear_compute_candidates(
// TODO can we keep refs of Query
fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
use crate::search::criteria::Operation::{And, Or, Consecutive};
use crate::search::criteria::Operation::{And, Or, Phrase};
fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
match tail.split_first() {
@ -683,7 +682,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
fn recurse(op: &Operation) -> FlattenedQueryTree {
match op {
And(ops) | Consecutive(ops) => {
And(ops) => {
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
},
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
@ -691,6 +690,12 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
} else {
ops.iter().map(recurse).flatten().collect()
},
Phrase(words) => {
let queries = words.iter().map(|word| {
vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}]
}).collect();
vec![queries]
}
Operation::Query(query) => vec![vec![vec![query.clone()]]],
}
}

View File

@ -1,7 +1,6 @@
use std::collections::HashMap;
use std::borrow::Cow;
use anyhow::bail;
use roaring::RoaringBitmap;
use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
@ -239,7 +238,7 @@ pub fn resolve_query_tree<'t>(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap>
{
use Operation::{And, Consecutive, Or, Query};
use Operation::{And, Phrase, Or, Query};
match query_tree {
And(ops) => {
@ -261,26 +260,23 @@ pub fn resolve_query_tree<'t>(
}
Ok(candidates)
},
Consecutive(ops) => {
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
for slice in ops.windows(2) {
match (&slice[0], &slice[1]) {
(Operation::Query(left), Operation::Query(right)) => {
match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
pair_docids if pair_docids.is_empty() => {
return Ok(RoaringBitmap::new())
},
pair_docids if first_loop => {
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
if pair_docids.is_empty() {
return Ok(RoaringBitmap::new());
} else if first_loop {
candidates = pair_docids;
first_loop = false;
},
pair_docids => {
candidates.intersect_with(&pair_docids);
},
} else {
candidates &= pair_docids;
}
},
_ => bail!("invalid consecutive query type"),
None => return Ok(RoaringBitmap::new())
}
}
Ok(candidates)

View File

@ -171,12 +171,33 @@ fn resolve_candidates<'t>(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
{
use Operation::{And, Consecutive, Or, Query};
use Operation::{And, Phrase, Or};
let result = match query_tree {
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
Consecutive(ops) => if proximity == 0 {
mdfs(ctx, ops, 0, cache, wdcache)?
Phrase(words) => if proximity == 0 {
let most_left = words.first().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())});
let most_right = words.last().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())});
let mut candidates = None;
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
match candidates.as_mut() {
Some(candidates) => *candidates &= pair_docids,
None => candidates = Some(pair_docids),
}
},
None => {
candidates = None;
break;
}
}
}
match (most_left, most_right, candidates) {
(Some(l), Some(r), Some(c)) => vec![(l, r, c)],
_otherwise => Default::default(),
}
} else {
Default::default()
},
@ -188,7 +209,7 @@ fn resolve_candidates<'t>(
}
output
},
Query(q) => if proximity == 0 {
Operation::Query(q) => if proximity == 0 {
let candidates = query_docids(ctx, q, wdcache)?;
vec![(q.clone(), q.clone(), candidates)]
} else {
@ -306,14 +327,9 @@ fn resolve_plane_sweep_candidates(
) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>>
{
/// FIXME may be buggy with query like "new new york"
fn plane_sweep<'a>(
ctx: &dyn Context,
operations: &'a [Operation],
docid: DocumentId,
fn plane_sweep(
groups_positions: Vec<Vec<(Position, u8, Position)>>,
consecutive: bool,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>>
{
fn compute_groups_proximity(
@ -362,13 +378,9 @@ fn resolve_plane_sweep_candidates(
}
}
let groups_len = operations.len();
let mut groups_positions = Vec::with_capacity(groups_len);
let groups_len = groups_positions.len();
for operation in operations {
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions.into_iter());
}
let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
// Pop top elements of each list.
let mut current = Vec::with_capacity(groups_len);
@ -441,15 +453,32 @@ fn resolve_plane_sweep_candidates(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>>
{
use Operation::{And, Consecutive, Or};
use Operation::{And, Phrase, Or};
if let Some(result) = rocache.get(query_tree) {
return Ok(result.clone());
}
let result = match query_tree {
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
And(ops) => {
let mut groups_positions = Vec::with_capacity(ops.len());
for operation in ops {
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions);
}
plane_sweep(groups_positions, false)?
},
Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len());
for word in words {
let positions = match words_positions.get(word) {
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
None => vec![],
};
groups_positions.push(positions);
}
plane_sweep(groups_positions, true)?
},
Or(_, ops) => {
let mut result = Vec::new();
for op in ops {

View File

@ -1,6 +1,5 @@
use std::{borrow::Cow, collections::HashMap, mem::take};
use anyhow::bail;
use log::debug;
use roaring::RoaringBitmap;
@ -13,7 +12,6 @@ use super::{
CriterionParameters,
CriterionResult,
query_docids,
query_pair_proximity_docids,
resolve_query_tree,
};
@ -174,12 +172,14 @@ fn alterate_query_tree(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<()>
{
use Operation::{And, Consecutive, Or};
use Operation::{And, Phrase, Or};
match operation {
And(ops) | Consecutive(ops) | Or(_, ops) => {
And(ops) | Or(_, ops) => {
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
},
// Because Phrases don't allow typos, no alteration can be done.
Phrase(_words) => return Ok(()),
Operation::Query(q) => {
if let QueryKind::Tolerant { typo, word } = &q.kind {
// if no typo is allowed we don't call word_derivations function,
@ -228,32 +228,29 @@ fn resolve_candidates<'t>(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap>
{
use Operation::{And, Consecutive, Or, Query};
use Operation::{And, Phrase, Or, Query};
match query_tree {
And(ops) => {
mdfs(ctx, ops, number_typos, cache, wdcache)
},
Consecutive(ops) => {
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
for slice in ops.windows(2) {
match (&slice[0], &slice[1]) {
(Operation::Query(left), Operation::Query(right)) => {
match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
pair_docids if pair_docids.is_empty() => {
return Ok(RoaringBitmap::new())
},
pair_docids if first_loop => {
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
if pair_docids.is_empty() {
return Ok(RoaringBitmap::new());
} else if first_loop {
candidates = pair_docids;
first_loop = false;
},
pair_docids => {
candidates.intersect_with(&pair_docids);
},
} else {
candidates &= pair_docids;
}
},
_ => bail!("invalid consecutive query type"),
None => return Ok(RoaringBitmap::new())
}
}
Ok(candidates)

View File

@ -52,13 +52,18 @@ impl MatchingWords {
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
match tree {
Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
Operation::Or(_, ops) | Operation::And(ops) => {
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
},
Operation::Query(Query { prefix, kind }) => {
let typo = if kind.is_exact() { 0 } else { kind.typo() };
out.insert((kind.word(), typo, *prefix));
},
Operation::Phrase(words) => {
for word in words {
out.insert((word, 0, false));
}
}
}
}

View File

@ -15,7 +15,8 @@ type IsPrefix = bool;
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation {
And(Vec<Operation>),
Consecutive(Vec<Operation>),
// serie of consecutive non prefix and exact words
Phrase(Vec<String>),
Or(IsOptionalWord, Vec<Operation>),
Query(Query),
}
@ -28,9 +29,8 @@ impl fmt::Debug for Operation {
writeln!(f, "{:1$}AND", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
Operation::Consecutive(children) => {
writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
Operation::Phrase(children) => {
writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
},
Operation::Or(true, children) => {
writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
@ -49,14 +49,6 @@ impl fmt::Debug for Operation {
}
impl Operation {
fn phrase(words: Vec<String>) -> Operation {
Operation::consecutive(
words.into_iter().map(|s| {
Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) })
}).collect()
)
}
fn and(mut ops: Vec<Self>) -> Self {
if ops.len() == 1 {
ops.pop().unwrap()
@ -73,11 +65,11 @@ impl Operation {
}
}
fn consecutive(mut ops: Vec<Self>) -> Self {
if ops.len() == 1 {
ops.pop().unwrap()
fn phrase(mut words: Vec<String>) -> Self {
if words.len() == 1 {
Self::Query(Query {prefix: false, kind: QueryKind::exact(words.pop().unwrap())})
} else {
Self::Consecutive(ops)
Self::Phrase(words)
}
}
@ -256,10 +248,10 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
}
}
Ok(best.map(|(_, left, right)| Operation::Consecutive(
Ok(best.map(|(_, left, right)| Operation::Phrase(
vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(left.to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact(right.to_string()) })
left.to_string(),
right.to_string()
]
)))
}
@ -494,24 +486,26 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
/// Returns the maximum number of typos that this Operation allows.
pub fn maximum_typo(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Consecutive};
use Operation::{Or, And, Query, Phrase};
match operation {
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
Query(q) => q.kind.typo() as usize,
// no typo allowed in phrases
Phrase(_) => 0,
}
}
/// Returns the maximum proximity that this Operation allows.
pub fn maximum_proximity(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Consecutive};
use Operation::{Or, And, Query, Phrase};
match operation {
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
And(ops) => {
ops.iter().map(maximum_proximity).sum::<usize>()
+ ops.len().saturating_sub(1) * 7
},
Query(_) | Consecutive(_) => 0,
Query(_) | Phrase(_) => 0,
}
}
@ -765,9 +759,9 @@ mod test {
let expected = Operation::Or(false, vec![
Operation::And(vec![
Operation::Or(false, vec![
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
Operation::Phrase(vec![
"word".to_string(),
"split".to_string(),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }),
]),
@ -789,9 +783,9 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
Operation::Phrase(vec![
"hey".to_string(),
"friends".to_string(),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
]);
@ -809,13 +803,13 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
Operation::Phrase(vec![
"hey".to_string(),
"friends".to_string(),
]),
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
Operation::Phrase(vec![
"wooop".to_string(),
"wooop".to_string(),
]),
]);
@ -870,9 +864,9 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
let expected = Operation::Phrase(vec![
"hey".to_string(),
"my".to_string(),
]);
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
@ -940,9 +934,9 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
Operation::Phrase(vec![
"hey".to_string(),
"my".to_string(),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
]);