mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 12:54:26 +01:00
Merge #3525
3525: Fix phrase search containing stop words r=ManyTheFish a=ManyTheFish # Summary A search with a phrase containing only stop words was returning an HTTP error 500, this PR filters the phrase containing only stop words dropping them before the search starts, a query with a phrase containing only stop words now behaves like a placeholder search. fixes https://github.com/meilisearch/meilisearch/issues/3521 related v1.0.2 PR on milli: https://github.com/meilisearch/milli/pull/779 Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
commit
4f1ccbc495
@ -29,7 +29,7 @@ fn bench_formatting(c: &mut criterion::Criterion) {
|
||||
(vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]),
|
||||
(vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]),
|
||||
]
|
||||
), TokenizerBuilder::default().build()),
|
||||
).unwrap(), TokenizerBuilder::default().build()),
|
||||
},
|
||||
];
|
||||
|
||||
|
@ -149,6 +149,27 @@ async fn simple_search() {
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn phrase_search_with_stop_word() {
|
||||
// related to https://github.com/meilisearch/meilisearch/issues/3521
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let (_, code) = index.update_settings(json!({"stopWords": ["the", "of"]})).await;
|
||||
meili_snap::snapshot!(code, @"202 Accepted");
|
||||
|
||||
let documents = DOCUMENTS.clone();
|
||||
index.add_documents(documents, None).await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
index
|
||||
.search(json!({"q": "how \"to\" train \"the" }), |response, code| {
|
||||
assert_eq!(code, 200, "{}", response);
|
||||
assert_eq!(response["hits"].as_array().unwrap().len(), 1);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[cfg(feature = "default")]
|
||||
#[actix_rt::test]
|
||||
async fn test_kanji_language_detection() {
|
||||
|
@ -59,6 +59,8 @@ pub enum InternalError {
|
||||
Utf8(#[from] str::Utf8Error),
|
||||
#[error("An indexation process was explicitly aborted.")]
|
||||
AbortedIndexation,
|
||||
#[error("The matching words list contains at least one invalid member.")]
|
||||
InvalidMatchingWords,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
|
@ -7,6 +7,7 @@ use std::rc::Rc;
|
||||
use charabia::Token;
|
||||
use levenshtein_automata::{Distance, DFA};
|
||||
|
||||
use crate::error::InternalError;
|
||||
use crate::search::build_dfa;
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
@ -31,12 +32,19 @@ impl fmt::Debug for MatchingWords {
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
|
||||
pub fn new(
|
||||
mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
) -> crate::Result<Self> {
|
||||
// if one of the matching_words vec doesn't contain a word.
|
||||
if matching_words.iter().any(|(mw, _)| mw.is_empty()) {
|
||||
return Err(InternalError::InvalidMatchingWords.into());
|
||||
}
|
||||
|
||||
// Sort word by len in DESC order prioritizing the longuest matches,
|
||||
// in order to highlight the longuest part of the matched word.
|
||||
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
|
||||
|
||||
Self { inner: matching_words }
|
||||
Ok(Self { inner: matching_words })
|
||||
}
|
||||
|
||||
/// Returns an iterator over terms that match or partially match the given token.
|
||||
@ -360,7 +368,7 @@ mod tests {
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
let matching_words = MatchingWords::new(matching_words).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
matching_words
|
||||
|
@ -513,7 +513,7 @@ mod tests {
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
MatchingWords::new(matching_words)
|
||||
MatchingWords::new(matching_words).unwrap()
|
||||
}
|
||||
|
||||
impl MatcherBuilder<'_, Vec<u8>> {
|
||||
@ -600,7 +600,7 @@ mod tests {
|
||||
];
|
||||
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
let matching_words = MatchingWords::new(matching_words).unwrap();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
|
||||
@ -847,7 +847,7 @@ mod tests {
|
||||
(vec![all[4].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
let matching_words = MatchingWords::new(matching_words).unwrap();
|
||||
|
||||
let mut builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
builder.highlight_prefix("_".to_string());
|
||||
|
@ -747,7 +747,7 @@ fn create_matching_words(
|
||||
let mut matching_word_cache = MatchingWordCache::default();
|
||||
let mut matching_words = Vec::new();
|
||||
ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?;
|
||||
Ok(MatchingWords::new(matching_words))
|
||||
MatchingWords::new(matching_words)
|
||||
}
|
||||
|
||||
pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
|
||||
@ -825,9 +825,13 @@ where
|
||||
quoted = !quoted;
|
||||
}
|
||||
// if there is a quote or a hard separator we close the phrase.
|
||||
if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
|
||||
{
|
||||
primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
|
||||
if quote_count > 0 || separator_kind == SeparatorKind::Hard {
|
||||
let phrase = mem::take(&mut phrase);
|
||||
|
||||
// if the phrase only contains stop words, we don't keep it in the query.
|
||||
if phrase.iter().any(|w| w.is_some()) {
|
||||
primitive_query.push(PrimitiveQueryPart::Phrase(phrase));
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
@ -835,7 +839,7 @@ where
|
||||
}
|
||||
|
||||
// If a quote is never closed, we consider all of the end of the query as a phrase.
|
||||
if !phrase.is_empty() {
|
||||
if phrase.iter().any(|w| w.is_some()) {
|
||||
primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
|
||||
}
|
||||
|
||||
|
@ -32,15 +32,6 @@ fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) {
|
||||
let result = search.execute().unwrap();
|
||||
// 1 document should match
|
||||
assert_eq!(result.documents_ids.len(), 1);
|
||||
|
||||
// test for a single stop word only, no other search terms
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("\"the\"");
|
||||
search.limit(10);
|
||||
search.authorize_typos(false);
|
||||
search.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
Loading…
Reference in New Issue
Block a user