[WIP] Fix phrase search containing stop words

Fixes #661 and meilisearch/meilisearch#2905
This commit is contained in:
Samyak S Sarnayak 2022-10-26 19:08:06 +05:30
parent 365f44c39b
commit 62816dddde
No known key found for this signature in database
GPG Key ID: 365873F2F0C6153B
6 changed files with 43 additions and 16 deletions

View File

@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
Phrase(words) => { Phrase(words) => {
let queries = words let queries = words
.iter() .iter()
.filter_map(|w| w.as_ref())
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
.collect(); .collect();
vec![queries] vec![queries]

View File

@ -298,7 +298,7 @@ fn attribute_start_with_docids(
pos += 1; pos += 1;
} }
Phrase(phrase) => { Phrase(phrase) => {
for word in phrase { for word in phrase.iter().filter_map(|w| w.as_ref()) {
let wc = ctx.word_position_docids(word, pos)?; let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc { if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates); attribute_candidates_array.push(word_candidates);
@ -323,7 +323,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap {
/// A part of an exact query, carrying either an ordered list of phrase words
/// (`Phrase`) or a list of words (`Synonyms`).
///
/// With this change each phrase entry becomes an `Option<String>`.
/// NOTE(review): elsewhere in this commit `None` is documented as "a stop word"
/// on `Operation::Phrase`; presumably the same convention applies here — confirm.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub enum ExactQueryPart { pub enum ExactQueryPart {
// Phrase words in query order. The phrase loop visible in
// `attribute_start_with_docids` iterates with `filter_map(|w| w.as_ref())`,
// i.e. it skips `None` entries rather than looking them up.
Phrase(Vec<String>), Phrase(Vec<Option<String>>),
// Plain list of words; unchanged by this commit.
Synonyms(Vec<String>), Synonyms(Vec<String>),
} }

View File

@ -418,15 +418,21 @@ pub fn resolve_query_tree(
resolve_operation(ctx, query_tree, wdcache) resolve_operation(ctx, query_tree, wdcache)
} }
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBitmap> { pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option<String>]) -> Result<RoaringBitmap> {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let mut first_iter = true; let mut first_iter = true;
let winsize = phrase.len().min(3); let winsize = phrase.len().min(3);
for win in phrase.windows(winsize) { for win in phrase.windows(winsize) {
// Get all the documents with the matching distance for each word pairs. // Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2)); let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win.iter().enumerate() { for (offset, s1) in win.iter().filter_map(|w| w.as_ref()).enumerate() {
for (dist, s2) in win.iter().skip(offset + 1).enumerate() { for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| {
if let Some(word) = word {
Some((index, word))
} else {
None
}
}) {
if dist == 0 { if dist == 0 {
match ctx.word_pair_proximity_docids(s1, s2, 1)? { match ctx.word_pair_proximity_docids(s1, s2, 1)? {
Some(m) => bitmaps.push(m), Some(m) => bitmaps.push(m),

View File

@ -188,9 +188,13 @@ fn resolve_candidates<'t>(
if proximity == 0 { if proximity == 0 {
let most_left = words let most_left = words
.first() .first()
.map(|o| o.as_ref())
.flatten()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words let most_right = words
.last() .last()
.map(|o| o.as_ref())
.flatten()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
match (most_left, most_right) { match (most_left, most_right) {
@ -473,7 +477,7 @@ fn resolve_plane_sweep_candidates(
} }
Phrase(words) => { Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len()); let mut groups_positions = Vec::with_capacity(words.len());
for word in words { for word in words.iter().filter_map(|w| w.as_ref()) {
let positions = match words_positions.get(word) { let positions = match words_positions.get(word) {
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
None => return Ok(vec![]), None => return Ok(vec![]),

View File

@ -2,6 +2,7 @@ use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::mem::take; use std::mem::take;
use itertools::Itertools;
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -259,8 +260,7 @@ fn resolve_candidates<'t>(
Phrase(words) => { Phrase(words) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let mut first_loop = true; let mut first_loop = true;
for slice in words.windows(2) { for (left, right) in words.iter().filter_map(|w| w.as_ref()).tuple_windows() {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? { match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => { Some(pair_docids) => {
if pair_docids.is_empty() { if pair_docids.is_empty() {

View File

@ -18,8 +18,9 @@ type IsPrefix = bool;
/// Node of the query tree: a recursive enum whose `And` / `Or` variants hold
/// child `Operation`s and whose `Phrase` / `Query` variants are leaves.
///
/// Derives `Clone`, `PartialEq`, `Eq` and `Hash`.
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation { pub enum Operation {
// Child operations. NOTE(review): presumably all children must match — confirm
// against `resolve_operation`, which is not visible here.
And(Vec<Operation>), And(Vec<Operation>),
// serie of consecutive non prefix and exact words // series of consecutive non prefix and exact words
Phrase(Vec<String>), // `None` means a stop word.
// Consumers shown in this commit skip the `None` entries with
// `filter_map(|w| w.as_ref())` before querying word / word-pair docids.
Phrase(Vec<Option<String>>),
// Child operations plus an `IsOptionalWord` flag (alias defined outside this
// view — presumably `bool`; confirm).
Or(IsOptionalWord, Vec<Operation>), Or(IsOptionalWord, Vec<Operation>),
// A single query term. `Operation::phrase` builds one of these, with
// `prefix: false` and `QueryKind::exact(..)`, when given exactly one `Some` word.
Query(Query), Query(Query),
} }
@ -75,9 +76,13 @@ impl Operation {
} }
} }
fn phrase(mut words: Vec<String>) -> Self { fn phrase(mut words: Vec<Option<String>>) -> Self {
if words.len() == 1 { if words.len() == 1 {
Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) }) if let Some(word) = words.pop().unwrap() {
Self::Query(Query { prefix: false, kind: QueryKind::exact(word) })
} else {
Self::Phrase(words)
}
} else { } else {
Self::Phrase(words) Self::Phrase(words)
} }
@ -370,7 +375,10 @@ fn create_query_tree(
PrimitiveQueryPart::Word(word, prefix) => { PrimitiveQueryPart::Word(word, prefix) => {
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
if let Some((left, right)) = split_best_frequency(ctx, &word)? { if let Some((left, right)) = split_best_frequency(ctx, &word)? {
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); children.push(Operation::Phrase(vec![
Some(left.to_string()),
Some(right.to_string()),
]));
} }
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words(); let exact_words = ctx.exact_words();
@ -583,7 +591,11 @@ fn create_matching_words(
PrimitiveQueryPart::Phrase(words) => { PrimitiveQueryPart::Phrase(words) => {
let ids: Vec<_> = let ids: Vec<_> =
(0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect(); let words = words
.into_iter()
.filter_map(|w| w)
.map(|w| MatchingWord::new(w, 0, false))
.collect();
matching_words.push((words, ids)); matching_words.push((words, ids));
} }
} }
@ -685,7 +697,7 @@ pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
/// One element of a `PrimitiveQuery` (`Vec<PrimitiveQueryPart>`): either a
/// quoted phrase or a single word.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub enum PrimitiveQueryPart { pub enum PrimitiveQueryPart {
// Entries are pushed while the tokenizer is inside quotes: `None` when the
// token's lemma is contained in the configured stop words, otherwise
// `Some(lemma)`. Stop words therefore keep their slot in the phrase.
Phrase(Vec<String>), Phrase(Vec<Option<String>>),
// The word and its `IsPrefix` flag (`type IsPrefix = bool`). Per the tokenizer
// comments, the last token of the query is pushed as a prefix word and
// earlier non-stop-word tokens as non-prefix words.
Word(String, IsPrefix), Word(String, IsPrefix),
} }
@ -735,7 +747,11 @@ where
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
// 3. if the word is the last token of the query we push it as a prefix word. // 3. if the word is the last token of the query we push it as a prefix word.
if quoted { if quoted {
phrase.push(token.lemma().to_string()); if stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
phrase.push(None)
} else {
phrase.push(Some(token.lemma().to_string()));
}
} else if peekable.peek().is_some() { } else if peekable.peek().is_some() {
if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
primitive_query primitive_query