Factorize phrase computation

This commit is contained in:
ManyTheFish 2022-08-08 10:37:31 +02:00
parent cbb3b25459
commit b389be48a0
2 changed files with 42 additions and 73 deletions

View File

@ -326,43 +326,7 @@ pub fn resolve_query_tree<'t>(
} }
Ok(candidates) Ok(candidates)
} }
Phrase(words) => { Phrase(words) => resolve_phrase(ctx, &words),
let mut candidates = RoaringBitmap::new();
let mut first_iter = true;
let winsize = words.len().min(7);
for win in words.windows(winsize) {
// Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win.iter().enumerate() {
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
Some(m) => bitmaps.push(m),
// If there are no document for this distance, there will be no
// results for the phrase query.
None => return Ok(RoaringBitmap::new()),
}
}
}
// We sort the bitmaps so that we perform the small intersections first, which is faster.
bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len()));
for bitmap in bitmaps {
if first_iter {
candidates = bitmap;
first_iter = false;
} else {
candidates &= bitmap;
}
// There will be no match, return early
if candidates.is_empty() {
break;
}
}
}
Ok(candidates)
}
Or(_, ops) => { Or(_, ops) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
for op in ops { for op in ops {
@ -378,6 +342,44 @@ pub fn resolve_query_tree<'t>(
resolve_operation(ctx, query_tree, wdcache) resolve_operation(ctx, query_tree, wdcache)
} }
/// Computes the candidate documents for `phrase`.
///
/// The phrase is scanned with a sliding window of at most 7 words. Within each window,
/// every pair of words is looked up through `ctx.word_pair_proximity_docids` using the
/// number of positions separating the two words in the window as the distance. All the
/// bitmaps obtained this way are intersected to produce the returned candidates.
///
/// An empty bitmap is returned when:
/// - `phrase` is empty,
/// - a word pair has no entry for the expected distance,
/// - the running intersection becomes empty.
///
/// Note that a single-word phrase produces no word pair, so it also yields an empty bitmap.
///
/// # Errors
///
/// Propagates any error returned by `ctx.word_pair_proximity_docids`.
pub fn resolve_phrase<'t>(ctx: &'t dyn Context, phrase: &[String]) -> Result<RoaringBitmap> {
    // `slice::windows` panics when given a window size of 0, so an empty phrase
    // must be handled before the windows are computed.
    if phrase.is_empty() {
        return Ok(RoaringBitmap::new());
    }

    let mut candidates = RoaringBitmap::new();
    let mut first_iter = true;
    let winsize = phrase.len().min(7);

    for win in phrase.windows(winsize) {
        // Get all the documents with the matching distance for each word pair.
        let mut bitmaps = Vec::with_capacity(winsize.pow(2));
        for (offset, s1) in win.iter().enumerate() {
            for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
                // `dist` is at most `winsize - 2` (5 at most), so the cast cannot truncate.
                match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
                    Some(m) => bitmaps.push(m),
                    // If there are no documents for this distance, there will be no
                    // results for the phrase query.
                    None => return Ok(RoaringBitmap::new()),
                }
            }
        }

        // We sort the bitmaps so that we perform the small intersections first, which is faster.
        bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len()));

        for bitmap in bitmaps {
            if first_iter {
                candidates = bitmap;
                first_iter = false;
            } else {
                candidates &= bitmap;
            }
            // An empty intersection can never become non-empty again: there will be
            // no match, so return right away instead of querying the remaining windows.
            if candidates.is_empty() {
                return Ok(candidates);
            }
        }
    }

    Ok(candidates)
}
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>( fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
ctx: &dyn Context, ctx: &dyn Context,
left_words: &[(T, u8)], left_words: &[(T, u8)],

View File

@ -6,8 +6,8 @@ use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{ use super::{
query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion, query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context,
CriterionParameters, CriterionResult, Criterion, CriterionParameters, CriterionResult,
}; };
use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind};
use crate::search::{build_dfa, WordDerivationsCache}; use crate::search::{build_dfa, WordDerivationsCache};
@ -192,42 +192,9 @@ fn resolve_candidates<'t>(
let most_right = words let most_right = words
.last() .last()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let mut candidates = RoaringBitmap::new();
let mut first_iter = true;
let winsize = words.len().min(7);
for win in words.windows(winsize) {
// Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win.iter().enumerate() {
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
Some(m) => bitmaps.push(m),
// If there are no document for this distance, there will be no
// results for the phrase query.
None => return Ok(Default::default()),
}
}
}
// We sort the bitmaps so that we perform the small intersections first, which is faster.
bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len()));
for bitmap in bitmaps {
if first_iter {
candidates = bitmap;
first_iter = false;
} else {
candidates &= bitmap;
}
// There will be no match, return early
if candidates.is_empty() {
break;
}
}
}
match (most_left, most_right) { match (most_left, most_right) {
(Some(l), Some(r)) => vec![(l, r, candidates)], (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, &words)?)],
_otherwise => Default::default(), _otherwise => Default::default(),
} }
} else { } else {