664: Fix phrase search containing stop words r=ManyTheFish a=Samyak2

# Pull Request

This a WIP draft PR I wanted to create to let other potential contributors know that I'm working on this issue. I'll be completing this in a few hours from opening this.

## Related issue
Fixes #661 and towards fixing meilisearch/meilisearch#2905

## What does this PR do?
- [x] Change Phrase Operation to use a `Vec<Option<String>>` instead of `Vec<String>` where `None` corresponds to a stop word
- [x] Update all other uses of phrase operation
- [x] Update `resolve_phrase`
- [x] Update `create_primitive_query`?
- [x] Add test

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?


Co-authored-by: Samyak S Sarnayak <samyak201@gmail.com>
Co-authored-by: Samyak Sarnayak <samyak201@gmail.com>
This commit is contained in:
bors[bot] 2022-10-29 13:42:52 +00:00 committed by GitHub
commit c965200010
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 156 additions and 58 deletions

View File

@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
Phrase(words) => {
let queries = words
.iter()
.filter_map(|w| w.as_ref())
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
.collect();
vec![queries]

View File

@ -299,9 +299,11 @@ fn attribute_start_with_docids(
}
Phrase(phrase) => {
for word in phrase {
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
if let Some(word) = word {
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
}
}
pos += 1;
}
@ -323,7 +325,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap {
#[derive(Debug, Clone)]
pub enum ExactQueryPart {
Phrase(Vec<String>),
Phrase(Vec<Option<String>>),
Synonyms(Vec<String>),
}

View File

@ -418,15 +418,32 @@ pub fn resolve_query_tree(
resolve_operation(ctx, query_tree, wdcache)
}
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBitmap> {
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option<String>]) -> Result<RoaringBitmap> {
let mut candidates = RoaringBitmap::new();
let mut first_iter = true;
let winsize = phrase.len().min(3);
if phrase.is_empty() {
return Ok(candidates);
}
for win in phrase.windows(winsize) {
// Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win.iter().enumerate() {
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
for (offset, s1) in win.iter().enumerate().filter_map(|(index, word)| {
if let Some(word) = word {
Some((index, word))
} else {
None
}
}) {
for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| {
if let Some(word) = word {
Some((index, word))
} else {
None
}
}) {
if dist == 0 {
match ctx.word_pair_proximity_docids(s1, s2, 1)? {
Some(m) => bitmaps.push(m),

View File

@ -4,6 +4,7 @@ use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use slice_group_by::GroupBy;
use super::{
query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context,
@ -187,10 +188,15 @@ fn resolve_candidates<'t>(
Phrase(words) => {
if proximity == 0 {
let most_left = words
.first()
.iter()
.filter_map(|o| o.as_ref())
.next()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words
.last()
.iter()
.rev()
.filter_map(|o| o.as_ref())
.next()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
match (most_left, most_right) {
@ -473,14 +479,34 @@ fn resolve_plane_sweep_candidates(
}
Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len());
for word in words {
let positions = match words_positions.get(word) {
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
None => return Ok(vec![]),
};
groups_positions.push(positions);
// group stop_words together.
for words in words.linear_group_by_key(Option::is_none) {
// skip if it's a group of stop words.
if matches!(words.first(), None | Some(None)) {
continue;
}
// make a consecutive plane-sweep on the subgroup of words.
let mut subgroup = Vec::with_capacity(words.len());
for word in words.into_iter().map(|w| w.as_deref().unwrap()) {
match words_positions.get(word) {
Some(positions) => {
subgroup.push(positions.iter().map(|p| (p, 0, p)).collect())
}
None => return Ok(vec![]),
}
}
match subgroup.len() {
0 => {}
1 => groups_positions.push(subgroup.pop().unwrap()),
_ => groups_positions.push(plane_sweep(subgroup, true)?),
}
}
match groups_positions.len() {
0 => vec![],
1 => groups_positions.pop().unwrap(),
_ => plane_sweep(groups_positions, false)?,
}
plane_sweep(groups_positions, true)?
}
Or(_, ops) => {
let mut result = Vec::new();

View File

@ -9,6 +9,7 @@ use super::{
query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
CriterionResult,
};
use crate::search::criteria::resolve_phrase;
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache};
use crate::Result;
@ -256,27 +257,7 @@ fn resolve_candidates<'t>(
match query_tree {
And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache),
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
if pair_docids.is_empty() {
return Ok(RoaringBitmap::new());
} else if first_loop {
candidates = pair_docids;
first_loop = false;
} else {
candidates &= pair_docids;
}
}
None => return Ok(RoaringBitmap::new()),
}
}
Ok(candidates)
}
Phrase(words) => resolve_phrase(ctx, words),
Or(_, ops) => {
let mut candidates = RoaringBitmap::new();
for op in ops {

View File

@ -4,7 +4,6 @@ use std::{fmt, mem};
use charabia::classifier::ClassifiedTokenIter;
use charabia::{SeparatorKind, TokenKind};
use fst::Set;
use roaring::RoaringBitmap;
use slice_group_by::GroupBy;
@ -18,8 +17,9 @@ type IsPrefix = bool;
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation {
And(Vec<Operation>),
// serie of consecutive non prefix and exact words
Phrase(Vec<String>),
// series of consecutive non prefix and exact words
// `None` means a stop word.
Phrase(Vec<Option<String>>),
Or(IsOptionalWord, Vec<Operation>),
Query(Query),
}
@ -75,9 +75,13 @@ impl Operation {
}
}
fn phrase(mut words: Vec<String>) -> Self {
fn phrase(mut words: Vec<Option<String>>) -> Self {
if words.len() == 1 {
Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) })
if let Some(word) = words.pop().unwrap() {
Self::Query(Query { prefix: false, kind: QueryKind::exact(word) })
} else {
Self::Phrase(words)
}
} else {
Self::Phrase(words)
}
@ -264,8 +268,7 @@ impl<'a> QueryTreeBuilder<'a> {
&self,
query: ClassifiedTokenIter<A>,
) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
let stop_words = self.index.stop_words(self.rtxn)?;
let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
let primitive_query = create_primitive_query(query, self.words_limit);
if !primitive_query.is_empty() {
let qt = create_query_tree(
self,
@ -370,7 +373,10 @@ fn create_query_tree(
PrimitiveQueryPart::Word(word, prefix) => {
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
children.push(Operation::Phrase(vec![
Some(left.to_string()),
Some(right.to_string()),
]));
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words();
@ -583,7 +589,11 @@ fn create_matching_words(
PrimitiveQueryPart::Phrase(words) => {
let ids: Vec<_> =
(0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect();
let words = words
.into_iter()
.filter_map(|w| w)
.map(|w| MatchingWord::new(w, 0, false))
.collect();
matching_words.push((words, ids));
}
}
@ -685,7 +695,7 @@ pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
#[derive(Debug, Clone)]
pub enum PrimitiveQueryPart {
Phrase(Vec<String>),
Phrase(Vec<Option<String>>),
Word(String, IsPrefix),
}
@ -710,7 +720,6 @@ impl PrimitiveQueryPart {
/// the primitive query is an intermediate state to build the query tree.
fn create_primitive_query<A>(
query: ClassifiedTokenIter<A>,
stop_words: Option<Set<&[u8]>>,
words_limit: Option<usize>,
) -> PrimitiveQuery
where
@ -735,9 +744,14 @@ where
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
// 3. if the word is the last token of the query we push it as a prefix word.
if quoted {
phrase.push(token.lemma().to_string());
if let TokenKind::StopWord = token.kind {
phrase.push(None)
} else {
phrase.push(Some(token.lemma().to_string()));
}
} else if peekable.peek().is_some() {
if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
if let TokenKind::StopWord = token.kind {
} else {
primitive_query
.push(PrimitiveQueryPart::Word(token.lemma().to_string(), false));
}
@ -820,7 +834,7 @@ mod test {
words_limit: Option<usize>,
query: ClassifiedTokenIter<A>,
) -> Result<Option<(Operation, PrimitiveQuery)>> {
let primitive_query = create_primitive_query(query, None, words_limit);
let primitive_query = create_primitive_query(query, words_limit);
if !primitive_query.is_empty() {
let qt = create_query_tree(
self,
@ -1065,7 +1079,7 @@ mod test {
OR
AND
OR
PHRASE ["word", "split"]
PHRASE [Some("word"), Some("split")]
Tolerant { word: "wordsplit", max typo: 2 }
Exact { word: "fish" }
Tolerant { word: "wordsplitfish", max typo: 1 }
@ -1084,7 +1098,7 @@ mod test {
insta::assert_debug_snapshot!(query_tree, @r###"
OR
PHRASE ["quickbrown", "fox"]
PHRASE [Some("quickbrown"), Some("fox")]
PrefixTolerant { word: "quickbrownfox", max typo: 2 }
"###);
}
@ -1101,7 +1115,7 @@ mod test {
insta::assert_debug_snapshot!(query_tree, @r###"
AND
PHRASE ["hey", "friends"]
PHRASE [Some("hey"), Some("friends")]
Exact { word: "wooop" }
"###);
}
@ -1138,8 +1152,8 @@ mod test {
insta::assert_debug_snapshot!(query_tree, @r###"
AND
PHRASE ["hey", "friends"]
PHRASE ["wooop", "wooop"]
PHRASE [Some("hey"), Some("friends")]
PHRASE [Some("wooop"), Some("wooop")]
"###);
}
@ -1187,7 +1201,7 @@ mod test {
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###"
PHRASE ["hey", "my"]
PHRASE [Some("hey"), Some("my")]
"###);
}
@ -1252,7 +1266,7 @@ mod test {
insta::assert_debug_snapshot!(query_tree, @r###"
AND
PHRASE ["hey", "my"]
PHRASE [Some("hey"), Some("my")]
Exact { word: "good" }
"###);
}

View File

@ -15,6 +15,7 @@ use slice_group_by::GroupBy;
mod distinct;
mod facet_distribution;
mod filters;
mod phrase_search;
mod query_criteria;
mod sort;
mod typo_tolerance;

View File

@ -0,0 +1,56 @@
use milli::update::{IndexerConfig, Settings};
use milli::{Criterion, Index, Search, TermsMatchingStrategy};
use crate::search::Criterion::{Attribute, Exactness, Proximity};
fn set_stop_words(index: &Index, stop_words: &[&str]) {
let mut wtxn = index.write_txn().unwrap();
let config = IndexerConfig::default();
let mut builder = Settings::new(&mut wtxn, &index, &config);
let stop_words = stop_words.into_iter().map(|s| s.to_string()).collect();
builder.set_stop_words(stop_words);
builder.execute(|_| (), || false).unwrap();
wtxn.commit().unwrap();
}
fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) {
let index = super::setup_search_index_with_criteria(&criteria);
// Add stop_words
set_stop_words(&index, &["a", "an", "the", "of"]);
// Phrase search containing stop words
let txn = index.read_txn().unwrap();
let mut search = Search::new(&txn, &index);
search.query("\"the use of force\"");
search.limit(10);
search.authorize_typos(false);
search.terms_matching_strategy(TermsMatchingStrategy::All);
let result = search.execute().unwrap();
// 1 document should match
assert_eq!(result.documents_ids.len(), 1);
// test for a single stop word only, no other search terms
let mut search = Search::new(&txn, &index);
search.query("\"the\"");
search.limit(10);
search.authorize_typos(false);
search.terms_matching_strategy(TermsMatchingStrategy::All);
let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 0);
}
#[test]
fn test_phrase_search_with_stop_words_no_criteria() {
let criteria = [];
test_phrase_search_with_stop_words_given_criteria(&criteria);
}
#[test]
fn test_phrase_search_with_stop_words_all_criteria() {
let criteria = [Proximity, Attribute, Exactness];
test_phrase_search_with_stop_words_given_criteria(&criteria);
}