442: fix phrase search r=curquiza a=MarinPostma

Run the exact-match search on windows of up to seven words instead of only two. This makes false positives very unlikely, and impossible for phrase queries of fewer than seven words.


Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
bors[bot] 2022-02-07 16:18:20 +00:00 committed by GitHub
commit 5d58cb7449
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 40 additions and 14 deletions

View File

@ -448,8 +448,10 @@ impl Search {
#[derive(Debug, StructOpt)]
struct SettingsUpdate {
#[structopt(short, long)]
#[structopt(long)]
filterable_attributes: Option<Vec<String>>,
#[structopt(long)]
criteria: Option<Vec<String>>,
}
impl Performer for SettingsUpdate {
@ -468,6 +470,14 @@ impl Performer for SettingsUpdate {
}
}
if let Some(criteria) = self.criteria {
if !criteria.is_empty() {
update.set_criteria(criteria);
} else {
update.reset_criteria();
}
}
let mut bars = Vec::new();
let progesses = MultiProgress::new();
for _ in 0..4 {

View File

@ -318,21 +318,37 @@ pub fn resolve_query_tree<'t>(
}
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
if pair_docids.is_empty() {
return Ok(RoaringBitmap::new());
} else if first_loop {
candidates = pair_docids;
first_loop = false;
} else {
candidates &= pair_docids;
let mut first_iter = true;
let winsize = words.len().min(7);
for win in words.windows(winsize) {
// Get all the documents with the matching distance for each word pair.
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win.iter().enumerate() {
for (dist, s2) in win.iter().skip(offset).enumerate() {
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
Some(m) => bitmaps.push(m),
// If there are no documents for this distance, there will be no
// results for the phrase query.
None => return Ok(RoaringBitmap::new()),
}
}
None => return Ok(RoaringBitmap::new()),
}
// We sort the bitmaps so that we perform the small intersections first, which is faster.
bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len()));
for bitmap in bitmaps {
if first_iter {
candidates = bitmap;
first_iter = false;
} else {
candidates &= bitmap;
}
// There will be no match, stop intersecting early
if candidates.is_empty() {
break;
}
}
}
Ok(candidates)