mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-27 07:00:05 +01:00
[WIP] Fix phrase search containing stop words
Fixes #661 and meilisearch/meilisearch#2905
This commit is contained in:
parent
365f44c39b
commit
62816dddde
@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
|
||||
Phrase(words) => {
|
||||
let queries = words
|
||||
.iter()
|
||||
.filter_map(|w| w.as_ref())
|
||||
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
|
||||
.collect();
|
||||
vec![queries]
|
||||
|
@ -298,7 +298,7 @@ fn attribute_start_with_docids(
|
||||
pos += 1;
|
||||
}
|
||||
Phrase(phrase) => {
|
||||
for word in phrase {
|
||||
for word in phrase.iter().filter_map(|w| w.as_ref()) {
|
||||
let wc = ctx.word_position_docids(word, pos)?;
|
||||
if let Some(word_candidates) = wc {
|
||||
attribute_candidates_array.push(word_candidates);
|
||||
@ -323,7 +323,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap {
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum ExactQueryPart {
|
||||
Phrase(Vec<String>),
|
||||
Phrase(Vec<Option<String>>),
|
||||
Synonyms(Vec<String>),
|
||||
}
|
||||
|
||||
|
@ -418,15 +418,21 @@ pub fn resolve_query_tree(
|
||||
resolve_operation(ctx, query_tree, wdcache)
|
||||
}
|
||||
|
||||
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBitmap> {
|
||||
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option<String>]) -> Result<RoaringBitmap> {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_iter = true;
|
||||
let winsize = phrase.len().min(3);
|
||||
for win in phrase.windows(winsize) {
|
||||
// Get all the documents with the matching distance for each word pair.
|
||||
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
||||
for (offset, s1) in win.iter().enumerate() {
|
||||
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
|
||||
for (offset, s1) in win.iter().filter_map(|w| w.as_ref()).enumerate() {
|
||||
for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| {
|
||||
if let Some(word) = word {
|
||||
Some((index, word))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}) {
|
||||
if dist == 0 {
|
||||
match ctx.word_pair_proximity_docids(s1, s2, 1)? {
|
||||
Some(m) => bitmaps.push(m),
|
||||
|
@ -188,9 +188,13 @@ fn resolve_candidates<'t>(
|
||||
if proximity == 0 {
|
||||
let most_left = words
|
||||
.first()
|
||||
.map(|o| o.as_ref())
|
||||
.flatten()
|
||||
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
let most_right = words
|
||||
.last()
|
||||
.map(|o| o.as_ref())
|
||||
.flatten()
|
||||
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
|
||||
match (most_left, most_right) {
|
||||
@ -473,7 +477,7 @@ fn resolve_plane_sweep_candidates(
|
||||
}
|
||||
Phrase(words) => {
|
||||
let mut groups_positions = Vec::with_capacity(words.len());
|
||||
for word in words {
|
||||
for word in words.iter().filter_map(|w| w.as_ref()) {
|
||||
let positions = match words_positions.get(word) {
|
||||
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
|
||||
None => return Ok(vec![]),
|
||||
|
@ -2,6 +2,7 @@ use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use itertools::Itertools;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
@ -259,8 +260,7 @@ fn resolve_candidates<'t>(
|
||||
Phrase(words) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let mut first_loop = true;
|
||||
for slice in words.windows(2) {
|
||||
let (left, right) = (&slice[0], &slice[1]);
|
||||
for (left, right) in words.iter().filter_map(|w| w.as_ref()).tuple_windows() {
|
||||
match ctx.word_pair_proximity_docids(left, right, 1)? {
|
||||
Some(pair_docids) => {
|
||||
if pair_docids.is_empty() {
|
||||
|
@ -18,8 +18,9 @@ type IsPrefix = bool;
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum Operation {
|
||||
And(Vec<Operation>),
|
||||
// serie of consecutive non prefix and exact words
|
||||
Phrase(Vec<String>),
|
||||
// series of consecutive non prefix and exact words
|
||||
// `None` means a stop word.
|
||||
Phrase(Vec<Option<String>>),
|
||||
Or(IsOptionalWord, Vec<Operation>),
|
||||
Query(Query),
|
||||
}
|
||||
@ -75,9 +76,13 @@ impl Operation {
|
||||
}
|
||||
}
|
||||
|
||||
fn phrase(mut words: Vec<String>) -> Self {
|
||||
fn phrase(mut words: Vec<Option<String>>) -> Self {
|
||||
if words.len() == 1 {
|
||||
Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) })
|
||||
if let Some(word) = words.pop().unwrap() {
|
||||
Self::Query(Query { prefix: false, kind: QueryKind::exact(word) })
|
||||
} else {
|
||||
Self::Phrase(words)
|
||||
}
|
||||
} else {
|
||||
Self::Phrase(words)
|
||||
}
|
||||
@ -370,7 +375,10 @@ fn create_query_tree(
|
||||
PrimitiveQueryPart::Word(word, prefix) => {
|
||||
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
|
||||
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
|
||||
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
|
||||
children.push(Operation::Phrase(vec![
|
||||
Some(left.to_string()),
|
||||
Some(right.to_string()),
|
||||
]));
|
||||
}
|
||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||
let exact_words = ctx.exact_words();
|
||||
@ -583,7 +591,11 @@ fn create_matching_words(
|
||||
PrimitiveQueryPart::Phrase(words) => {
|
||||
let ids: Vec<_> =
|
||||
(0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
|
||||
let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect();
|
||||
let words = words
|
||||
.into_iter()
|
||||
.filter_map(|w| w)
|
||||
.map(|w| MatchingWord::new(w, 0, false))
|
||||
.collect();
|
||||
matching_words.push((words, ids));
|
||||
}
|
||||
}
|
||||
@ -685,7 +697,7 @@ pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum PrimitiveQueryPart {
|
||||
Phrase(Vec<String>),
|
||||
Phrase(Vec<Option<String>>),
|
||||
Word(String, IsPrefix),
|
||||
}
|
||||
|
||||
@ -735,7 +747,11 @@ where
|
||||
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
|
||||
// 3. if the word is the last token of the query we push it as a prefix word.
|
||||
if quoted {
|
||||
phrase.push(token.lemma().to_string());
|
||||
if stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
|
||||
phrase.push(None)
|
||||
} else {
|
||||
phrase.push(Some(token.lemma().to_string()));
|
||||
}
|
||||
} else if peekable.peek().is_some() {
|
||||
if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
|
||||
primitive_query
|
||||
|
Loading…
x
Reference in New Issue
Block a user