mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Merge #227
227: Replace Consecutive by Phrase in query tree r=Kerollmops a=ManyTheFish Replace `Consecutive` by `Phrase` in the query tree in order to remove theoretical bugs, due to the `Consecutive` enum type. Co-authored-by: many <maxime@meilisearch.com> Co-authored-by: Many <legendre.maxime.isn@gmail.com>
This commit is contained in:
commit
3e6c05fe13
@ -578,7 +578,6 @@ fn linear_compute_candidates(
|
||||
fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
|
||||
let mut min_rank = u64::max_value();
|
||||
for branch in branches {
|
||||
|
||||
let branch_len = branch.len();
|
||||
let mut branch_rank = Vec::with_capacity(branch_len);
|
||||
for derivates in branch {
|
||||
@ -661,7 +660,7 @@ fn linear_compute_candidates(
|
||||
|
||||
// TODO can we keep refs of Query
|
||||
fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
|
||||
use crate::search::criteria::Operation::{And, Or, Consecutive};
|
||||
use crate::search::criteria::Operation::{And, Or, Phrase};
|
||||
|
||||
fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
|
||||
match tail.split_first() {
|
||||
@ -683,7 +682,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
|
||||
|
||||
fn recurse(op: &Operation) -> FlattenedQueryTree {
|
||||
match op {
|
||||
And(ops) | Consecutive(ops) => {
|
||||
And(ops) => {
|
||||
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
|
||||
},
|
||||
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
|
||||
@ -691,6 +690,12 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
|
||||
} else {
|
||||
ops.iter().map(recurse).flatten().collect()
|
||||
},
|
||||
Phrase(words) => {
|
||||
let queries = words.iter().map(|word| {
|
||||
vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}]
|
||||
}).collect();
|
||||
vec![queries]
|
||||
}
|
||||
Operation::Query(query) => vec![vec![vec![query.clone()]]],
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,6 @@
|
||||
use std::collections::HashMap;
|
||||
use std::borrow::Cow;
|
||||
|
||||
use anyhow::bail;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
|
||||
@ -239,7 +238,7 @@ pub fn resolve_query_tree<'t>(
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or, Query};
|
||||
use Operation::{And, Phrase, Or, Query};
|
||||
|
||||
match query_tree {
|
||||
And(ops) => {
|
||||
@ -261,26 +260,23 @@ pub fn resolve_query_tree<'t>(
|
||||
}
|
||||
Ok(candidates)
|
||||
},
|
||||
Consecutive(ops) => {
|
||||
Phrase(words) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_loop = true;
|
||||
for slice in ops.windows(2) {
|
||||
match (&slice[0], &slice[1]) {
|
||||
(Operation::Query(left), Operation::Query(right)) => {
|
||||
match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
|
||||
pair_docids if pair_docids.is_empty() => {
|
||||
return Ok(RoaringBitmap::new())
|
||||
},
|
||||
pair_docids if first_loop => {
|
||||
candidates = pair_docids;
|
||||
first_loop = false;
|
||||
},
|
||||
pair_docids => {
|
||||
candidates.intersect_with(&pair_docids);
|
||||
},
|
||||
for slice in words.windows(2) {
|
||||
let (left, right) = (&slice[0], &slice[1]);
|
||||
match ctx.word_pair_proximity_docids(left, right, 1)? {
|
||||
Some(pair_docids) => {
|
||||
if pair_docids.is_empty() {
|
||||
return Ok(RoaringBitmap::new());
|
||||
} else if first_loop {
|
||||
candidates = pair_docids;
|
||||
first_loop = false;
|
||||
} else {
|
||||
candidates &= pair_docids;
|
||||
}
|
||||
},
|
||||
_ => bail!("invalid consecutive query type"),
|
||||
None => return Ok(RoaringBitmap::new())
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
|
@ -171,12 +171,33 @@ fn resolve_candidates<'t>(
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or, Query};
|
||||
use Operation::{And, Phrase, Or};
|
||||
|
||||
let result = match query_tree {
|
||||
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
|
||||
Consecutive(ops) => if proximity == 0 {
|
||||
mdfs(ctx, ops, 0, cache, wdcache)?
|
||||
Phrase(words) => if proximity == 0 {
|
||||
let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
let mut candidates = None;
|
||||
for slice in words.windows(2) {
|
||||
let (left, right) = (&slice[0], &slice[1]);
|
||||
match ctx.word_pair_proximity_docids(left, right, 1)? {
|
||||
Some(pair_docids) => {
|
||||
match candidates.as_mut() {
|
||||
Some(candidates) => *candidates &= pair_docids,
|
||||
None => candidates = Some(pair_docids),
|
||||
}
|
||||
},
|
||||
None => {
|
||||
candidates = None;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
match (most_left, most_right, candidates) {
|
||||
(Some(l), Some(r), Some(c)) => vec![(l, r, c)],
|
||||
_otherwise => Default::default(),
|
||||
}
|
||||
} else {
|
||||
Default::default()
|
||||
},
|
||||
@ -188,7 +209,7 @@ fn resolve_candidates<'t>(
|
||||
}
|
||||
output
|
||||
},
|
||||
Query(q) => if proximity == 0 {
|
||||
Operation::Query(q) => if proximity == 0 {
|
||||
let candidates = query_docids(ctx, q, wdcache)?;
|
||||
vec![(q.clone(), q.clone(), candidates)]
|
||||
} else {
|
||||
@ -306,14 +327,9 @@ fn resolve_plane_sweep_candidates(
|
||||
) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>>
|
||||
{
|
||||
/// FIXME may be buggy with query like "new new york"
|
||||
fn plane_sweep<'a>(
|
||||
ctx: &dyn Context,
|
||||
operations: &'a [Operation],
|
||||
docid: DocumentId,
|
||||
fn plane_sweep(
|
||||
groups_positions: Vec<Vec<(Position, u8, Position)>>,
|
||||
consecutive: bool,
|
||||
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
||||
words_positions: &HashMap<String, RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
||||
{
|
||||
fn compute_groups_proximity(
|
||||
@ -362,13 +378,9 @@ fn resolve_plane_sweep_candidates(
|
||||
}
|
||||
}
|
||||
|
||||
let groups_len = operations.len();
|
||||
let mut groups_positions = Vec::with_capacity(groups_len);
|
||||
let groups_len = groups_positions.len();
|
||||
|
||||
for operation in operations {
|
||||
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
|
||||
groups_positions.push(positions.into_iter());
|
||||
}
|
||||
let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
|
||||
|
||||
// Pop top elements of each list.
|
||||
let mut current = Vec::with_capacity(groups_len);
|
||||
@ -441,15 +453,32 @@ fn resolve_plane_sweep_candidates(
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or};
|
||||
use Operation::{And, Phrase, Or};
|
||||
|
||||
if let Some(result) = rocache.get(query_tree) {
|
||||
return Ok(result.clone());
|
||||
}
|
||||
|
||||
let result = match query_tree {
|
||||
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
|
||||
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
|
||||
And(ops) => {
|
||||
let mut groups_positions = Vec::with_capacity(ops.len());
|
||||
for operation in ops {
|
||||
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
|
||||
groups_positions.push(positions);
|
||||
}
|
||||
plane_sweep(groups_positions, false)?
|
||||
},
|
||||
Phrase(words) => {
|
||||
let mut groups_positions = Vec::with_capacity(words.len());
|
||||
for word in words {
|
||||
let positions = match words_positions.get(word) {
|
||||
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
|
||||
None => vec![],
|
||||
};
|
||||
groups_positions.push(positions);
|
||||
}
|
||||
plane_sweep(groups_positions, true)?
|
||||
},
|
||||
Or(_, ops) => {
|
||||
let mut result = Vec::new();
|
||||
for op in ops {
|
||||
|
@ -1,6 +1,5 @@
|
||||
use std::{borrow::Cow, collections::HashMap, mem::take};
|
||||
|
||||
use anyhow::bail;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
@ -13,7 +12,6 @@ use super::{
|
||||
CriterionParameters,
|
||||
CriterionResult,
|
||||
query_docids,
|
||||
query_pair_proximity_docids,
|
||||
resolve_query_tree,
|
||||
};
|
||||
|
||||
@ -174,12 +172,14 @@ fn alterate_query_tree(
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<()>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or};
|
||||
use Operation::{And, Phrase, Or};
|
||||
|
||||
match operation {
|
||||
And(ops) | Consecutive(ops) | Or(_, ops) => {
|
||||
And(ops) | Or(_, ops) => {
|
||||
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
|
||||
},
|
||||
// Because Phrases don't allow typos, no alteration can be done.
|
||||
Phrase(_words) => return Ok(()),
|
||||
Operation::Query(q) => {
|
||||
if let QueryKind::Tolerant { typo, word } = &q.kind {
|
||||
// if no typo is allowed we don't call word_derivations function,
|
||||
@ -228,32 +228,29 @@ fn resolve_candidates<'t>(
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
use Operation::{And, Consecutive, Or, Query};
|
||||
use Operation::{And, Phrase, Or, Query};
|
||||
|
||||
match query_tree {
|
||||
And(ops) => {
|
||||
mdfs(ctx, ops, number_typos, cache, wdcache)
|
||||
},
|
||||
Consecutive(ops) => {
|
||||
Phrase(words) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_loop = true;
|
||||
for slice in ops.windows(2) {
|
||||
match (&slice[0], &slice[1]) {
|
||||
(Operation::Query(left), Operation::Query(right)) => {
|
||||
match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
|
||||
pair_docids if pair_docids.is_empty() => {
|
||||
return Ok(RoaringBitmap::new())
|
||||
},
|
||||
pair_docids if first_loop => {
|
||||
candidates = pair_docids;
|
||||
first_loop = false;
|
||||
},
|
||||
pair_docids => {
|
||||
candidates.intersect_with(&pair_docids);
|
||||
},
|
||||
for slice in words.windows(2) {
|
||||
let (left, right) = (&slice[0], &slice[1]);
|
||||
match ctx.word_pair_proximity_docids(left, right, 1)? {
|
||||
Some(pair_docids) => {
|
||||
if pair_docids.is_empty() {
|
||||
return Ok(RoaringBitmap::new());
|
||||
} else if first_loop {
|
||||
candidates = pair_docids;
|
||||
first_loop = false;
|
||||
} else {
|
||||
candidates &= pair_docids;
|
||||
}
|
||||
},
|
||||
_ => bail!("invalid consecutive query type"),
|
||||
None => return Ok(RoaringBitmap::new())
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
|
@ -52,13 +52,18 @@ impl MatchingWords {
|
||||
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
|
||||
fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
|
||||
match tree {
|
||||
Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
|
||||
Operation::Or(_, ops) | Operation::And(ops) => {
|
||||
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
|
||||
},
|
||||
Operation::Query(Query { prefix, kind }) => {
|
||||
let typo = if kind.is_exact() { 0 } else { kind.typo() };
|
||||
out.insert((kind.word(), typo, *prefix));
|
||||
},
|
||||
Operation::Phrase(words) => {
|
||||
for word in words {
|
||||
out.insert((word, 0, false));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -15,7 +15,8 @@ type IsPrefix = bool;
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum Operation {
|
||||
And(Vec<Operation>),
|
||||
Consecutive(Vec<Operation>),
|
||||
// serie of consecutive non prefix and exact words
|
||||
Phrase(Vec<String>),
|
||||
Or(IsOptionalWord, Vec<Operation>),
|
||||
Query(Query),
|
||||
}
|
||||
@ -28,9 +29,8 @@ impl fmt::Debug for Operation {
|
||||
writeln!(f, "{:1$}AND", "", depth * 2)?;
|
||||
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
||||
},
|
||||
Operation::Consecutive(children) => {
|
||||
writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?;
|
||||
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
||||
Operation::Phrase(children) => {
|
||||
writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
|
||||
},
|
||||
Operation::Or(true, children) => {
|
||||
writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
|
||||
@ -49,14 +49,6 @@ impl fmt::Debug for Operation {
|
||||
}
|
||||
|
||||
impl Operation {
|
||||
fn phrase(words: Vec<String>) -> Operation {
|
||||
Operation::consecutive(
|
||||
words.into_iter().map(|s| {
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) })
|
||||
}).collect()
|
||||
)
|
||||
}
|
||||
|
||||
fn and(mut ops: Vec<Self>) -> Self {
|
||||
if ops.len() == 1 {
|
||||
ops.pop().unwrap()
|
||||
@ -73,11 +65,11 @@ impl Operation {
|
||||
}
|
||||
}
|
||||
|
||||
fn consecutive(mut ops: Vec<Self>) -> Self {
|
||||
if ops.len() == 1 {
|
||||
ops.pop().unwrap()
|
||||
fn phrase(mut words: Vec<String>) -> Self {
|
||||
if words.len() == 1 {
|
||||
Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) })
|
||||
} else {
|
||||
Self::Consecutive(ops)
|
||||
Self::Phrase(words)
|
||||
}
|
||||
}
|
||||
|
||||
@ -256,10 +248,10 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, left, right)| Operation::Consecutive(
|
||||
Ok(best.map(|(_, left, right)| Operation::Phrase(
|
||||
vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(left.to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(right.to_string()) })
|
||||
left.to_string(),
|
||||
right.to_string()
|
||||
]
|
||||
)))
|
||||
}
|
||||
@ -494,24 +486,26 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
|
||||
|
||||
/// Returns the maximum number of typos that this Operation allows.
|
||||
pub fn maximum_typo(operation: &Operation) -> usize {
|
||||
use Operation::{Or, And, Query, Consecutive};
|
||||
use Operation::{Or, And, Query, Phrase};
|
||||
match operation {
|
||||
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
|
||||
And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
|
||||
And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
|
||||
Query(q) => q.kind.typo() as usize,
|
||||
// no typo allowed in phrases
|
||||
Phrase(_) => 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the maximum proximity that this Operation allows.
|
||||
pub fn maximum_proximity(operation: &Operation) -> usize {
|
||||
use Operation::{Or, And, Query, Consecutive};
|
||||
use Operation::{Or, And, Query, Phrase};
|
||||
match operation {
|
||||
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
|
||||
And(ops) => {
|
||||
ops.iter().map(maximum_proximity).sum::<usize>()
|
||||
+ ops.len().saturating_sub(1) * 7
|
||||
},
|
||||
Query(_) | Consecutive(_) => 0,
|
||||
Query(_) | Phrase(_) => 0,
|
||||
}
|
||||
}
|
||||
|
||||
@ -765,9 +759,9 @@ mod test {
|
||||
let expected = Operation::Or(false, vec![
|
||||
Operation::And(vec![
|
||||
Operation::Or(false, vec![
|
||||
Operation::Consecutive(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
|
||||
Operation::Phrase(vec![
|
||||
"word".to_string(),
|
||||
"split".to_string(),
|
||||
]),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }),
|
||||
]),
|
||||
@ -789,9 +783,9 @@ mod test {
|
||||
let tokens = result.tokens();
|
||||
|
||||
let expected = Operation::And(vec![
|
||||
Operation::Consecutive(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
|
||||
Operation::Phrase(vec![
|
||||
"hey".to_string(),
|
||||
"friends".to_string(),
|
||||
]),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
|
||||
]);
|
||||
@ -809,13 +803,13 @@ mod test {
|
||||
let tokens = result.tokens();
|
||||
|
||||
let expected = Operation::And(vec![
|
||||
Operation::Consecutive(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
|
||||
Operation::Phrase(vec![
|
||||
"hey".to_string(),
|
||||
"friends".to_string(),
|
||||
]),
|
||||
Operation::Consecutive(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
|
||||
Operation::Phrase(vec![
|
||||
"wooop".to_string(),
|
||||
"wooop".to_string(),
|
||||
]),
|
||||
]);
|
||||
|
||||
@ -870,9 +864,9 @@ mod test {
|
||||
let result = analyzer.analyze(query);
|
||||
let tokens = result.tokens();
|
||||
|
||||
let expected = Operation::Consecutive(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
|
||||
let expected = Operation::Phrase(vec![
|
||||
"hey".to_string(),
|
||||
"my".to_string(),
|
||||
]);
|
||||
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
|
||||
|
||||
@ -940,9 +934,9 @@ mod test {
|
||||
let tokens = result.tokens();
|
||||
|
||||
let expected = Operation::And(vec![
|
||||
Operation::Consecutive(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
|
||||
Operation::Phrase(vec![
|
||||
"hey".to_string(),
|
||||
"my".to_string(),
|
||||
]),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
|
||||
]);
|
||||
|
Loading…
x
Reference in New Issue
Block a user