From cbb3b254595ed126313b7c33899642cbedf83e10 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 21 Jul 2022 10:04:30 +0200 Subject: [PATCH 1/3] Fix(Search): Fix phrase search candidates computation This bug is an old bug but was hidden by the proximity criterion, Phrase search were always returning an empty candidates list. Before the fix, we were trying to find any words[n] near words[n] instead of finding any words[n] near words[n+1], for example: for a phrase search '"Hello world"' we were searching for "hello" near "hello" first, instead of "hello" near "world". --- milli/src/search/criteria/mod.rs | 2 +- milli/src/search/criteria/proximity.rs | 44 +++++++++++++++++++------- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 05305d724..4613acb4f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -335,7 +335,7 @@ pub fn resolve_query_tree<'t>( // Get all the documents with the matching distance for each word pairs. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset).enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate() { match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { Some(m) => bitmaps.push(m), // If there are no document for this distance, there will be no diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 2bfa61e85..30919585b 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -192,22 +192,42 @@ fn resolve_candidates<'t>( let most_right = words .last() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let mut candidates = None; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); - match ctx.word_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => match candidates.as_mut() { - Some(candidates) => *candidates &= pair_docids, - None => candidates = Some(pair_docids), - }, - None => { - candidates = None; + let mut candidates = RoaringBitmap::new(); + let mut first_iter = true; + let winsize = words.len().min(7); + + for win in words.windows(winsize) { + // Get all the documents with the matching distance for each word pairs. + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + for (offset, s1) in win.iter().enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this distance, there will be no + // results for the phrase query. + None => return Ok(Default::default()), + } + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. + bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { break; } } } - match (most_left, most_right, candidates) { - (Some(l), Some(r), Some(c)) => vec![(l, r, c)], + match (most_left, most_right) { + (Some(l), Some(r)) => vec![(l, r, candidates)], _otherwise => Default::default(), } } else { From b389be48a02c5fe2de20590466077119df82a227 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 8 Aug 2022 10:37:31 +0200 Subject: [PATCH 2/3] Factorize phrase computation --- milli/src/search/criteria/mod.rs | 76 +++++++++++++------------- milli/src/search/criteria/proximity.rs | 39 +------------ 2 files changed, 42 insertions(+), 73 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 4613acb4f..ae9e0c218 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -326,43 +326,7 @@ pub fn resolve_query_tree<'t>( } Ok(candidates) } - Phrase(words) => { - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; - let winsize = words.len().min(7); - - for win in words.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { - match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. - bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } - Ok(candidates) - } + Phrase(words) => resolve_phrase(ctx, &words), Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { @@ -378,6 +342,44 @@ pub fn resolve_query_tree<'t>( resolve_operation(ctx, query_tree, wdcache) } +pub fn resolve_phrase<'t>(ctx: &'t dyn Context, phrase: &[String]) -> Result { + let mut candidates = RoaringBitmap::new(); + let mut first_iter = true; + let winsize = phrase.len().min(7); + + for win in phrase.windows(winsize) { + // Get all the documents with the matching distance for each word pairs. + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + for (offset, s1) in win.iter().enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this distance, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. + bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { + break; + } + } + } + Ok(candidates) +} + fn all_word_pair_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 30919585b..e942a7bef 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -6,8 +6,8 @@ use log::debug; use roaring::RoaringBitmap; use super::{ - query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion, - CriterionParameters, CriterionResult, + query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, + Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; use crate::search::{build_dfa, WordDerivationsCache}; @@ -192,42 +192,9 @@ fn resolve_candidates<'t>( let most_right = words .last() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; - let winsize = words.len().min(7); - for win in words.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { - match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(Default::default()), - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. - bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } match (most_left, most_right) { - (Some(l), Some(r)) => vec![(l, r, candidates)], + (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, &words)?)], _otherwise => Default::default(), } } else { From 8c3f1a9c39620d26adaf1a7c3e25f1e1dc5dd133 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Aug 2022 15:20:43 +0200 Subject: [PATCH 3/3] Remove useless lifetime declaration --- milli/src/search/criteria/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index ae9e0c218..f48865ba5 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -293,13 +293,13 @@ impl<'t> CriteriaBuilder<'t> { } } -pub fn resolve_query_tree<'t>( - ctx: &'t dyn Context, +pub fn resolve_query_tree( + ctx: &dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, ) -> Result { - fn resolve_operation<'t>( - ctx: &'t dyn Context, + fn resolve_operation( + ctx: &dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, ) -> Result { @@ -342,7 +342,7 @@ pub fn resolve_query_tree<'t>( resolve_operation(ctx, query_tree, wdcache) } -pub fn resolve_phrase<'t>(ctx: &'t dyn Context, phrase: &[String]) -> Result { +pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = phrase.len().min(7);