From 6841f167b42436d070245ecfa834c689e29f08c9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 21 Feb 2023 18:02:10 +0100 Subject: [PATCH 1/4] Add test --- meilisearch/tests/search/mod.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 91ff64d37..9ed3ffcd2 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -148,6 +148,27 @@ async fn simple_search() { .await; } +#[actix_rt::test] +async fn phrase_search_with_stop_word() { + // related to https://github.com/meilisearch/meilisearch/issues/3521 + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = index.update_settings(json!({"stopWords": ["the", "of"]})).await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + index + .search(json!({"q": "how \"to\" train \"the" }), |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + }) + .await; +} + #[cfg(feature = "default")] #[actix_rt::test] async fn test_kanji_language_detection() { From 28b7d73d4a8d12a91b07b8120d1dde4e168be47f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 21 Feb 2023 18:16:22 +0100 Subject: [PATCH 2/4] Remove an unefficient part of a test on milli --- milli/tests/search/phrase_search.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs index 2e63c96c4..5b987ad30 100644 --- a/milli/tests/search/phrase_search.rs +++ b/milli/tests/search/phrase_search.rs @@ -32,15 +32,6 @@ fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) { let result = search.execute().unwrap(); // 1 document should match assert_eq!(result.documents_ids.len(), 1); - - // test for a single stop word only, no other search terms - let mut search = Search::new(&txn, &index); - search.query("\"the\""); - search.limit(10); - search.authorize_typos(false); - search.terms_matching_strategy(TermsMatchingStrategy::All); - let result = search.execute().unwrap(); - assert_eq!(result.documents_ids.len(), 0); } #[test] From 900bae3d9ddd30bdd501b5d5fabd76e125539df5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 21 Feb 2023 17:52:59 +0100 Subject: [PATCH 3/4] keep phrases that has at least one word --- milli/src/search/query_tree.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 00c85aaba..431012bdb 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -825,9 +825,13 @@ where quoted = !quoted; } // if there is a quote or a hard separator we close the phrase. - if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) - { - primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); + if quote_count > 0 || separator_kind == SeparatorKind::Hard { + let phrase = mem::take(&mut phrase); + + // if the phrase only contains stop words, we don't keep it in the query. + if phrase.iter().any(|w| w.is_some()) { + primitive_query.push(PrimitiveQueryPart::Phrase(phrase)); + } } } _ => (), @@ -835,7 +839,7 @@ where } // If a quote is never closed, we consider all of the end of the query as a phrase. - if !phrase.is_empty() { + if phrase.iter().any(|w| w.is_some()) { primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); } From 37489fd4958e09a8231bdc4842c40b23522ae0ac Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 1 Mar 2023 18:52:14 +0100 Subject: [PATCH 4/4] Return an internal error in the case of matching word is invalid --- benchmarks/benches/formatting.rs | 2 +- milli/src/error.rs | 2 ++ milli/src/search/matches/matching_words.rs | 14 +++++++++++--- milli/src/search/matches/mod.rs | 6 +++--- milli/src/search/query_tree.rs | 2 +- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs index 25d162411..2e0fa0ce7 100644 --- a/benchmarks/benches/formatting.rs +++ b/benchmarks/benches/formatting.rs @@ -29,7 +29,7 @@ fn bench_formatting(c: &mut criterion::Criterion) { (vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]), (vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]), ] - ), TokenizerBuilder::default().build()), + ).unwrap(), TokenizerBuilder::default().build()), }, ]; diff --git a/milli/src/error.rs b/milli/src/error.rs index 1685f984a..9b1ec0a87 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -59,6 +59,8 @@ pub enum InternalError { Utf8(#[from] str::Utf8Error), #[error("An indexation process was explicitly aborted.")] AbortedIndexation, + #[error("The matching words list contains at least one invalid member.")] + InvalidMatchingWords, } #[derive(Error, Debug)] diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 22ba973b5..22241c457 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -7,6 +7,7 @@ use std::rc::Rc; use charabia::Token; use levenshtein_automata::{Distance, DFA}; +use crate::error::InternalError; use crate::search::build_dfa; use crate::MAX_WORD_LENGTH; @@ -31,12 +32,19 @@ impl fmt::Debug for MatchingWords { } impl MatchingWords { - pub fn new(mut matching_words: Vec<(Vec>, Vec)>) -> Self { + pub fn new( + mut matching_words: Vec<(Vec>, Vec)>, + ) -> crate::Result { + // if one of the matching_words vec doesn't contain a word. + if matching_words.iter().any(|(mw, _)| mw.is_empty()) { + return Err(InternalError::InvalidMatchingWords.into()); + } + // Sort word by len in DESC order prioritizing the longuest matches, // in order to highlight the longuest part of the matched word. matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); - Self { inner: matching_words } + Ok(Self { inner: matching_words }) } /// Returns an iterator over terms that match or partially match the given token. @@ -360,7 +368,7 @@ mod tests { (vec![all[2].clone()], vec![2]), ]; - let matching_words = MatchingWords::new(matching_words); + let matching_words = MatchingWords::new(matching_words).unwrap(); assert_eq!( matching_words diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 6ac5123a8..c634ae297 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -513,7 +513,7 @@ mod tests { (vec![all[2].clone()], vec![2]), ]; - MatchingWords::new(matching_words) + MatchingWords::new(matching_words).unwrap() } impl MatcherBuilder<'_, Vec> { @@ -600,7 +600,7 @@ mod tests { ]; let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; - let matching_words = MatchingWords::new(matching_words); + let matching_words = MatchingWords::new(matching_words).unwrap(); let builder = MatcherBuilder::from_matching_words(matching_words); @@ -847,7 +847,7 @@ mod tests { (vec![all[4].clone()], vec![2]), ]; - let matching_words = MatchingWords::new(matching_words); + let matching_words = MatchingWords::new(matching_words).unwrap(); let mut builder = MatcherBuilder::from_matching_words(matching_words); builder.highlight_prefix("_".to_string()); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 431012bdb..541dd8f7a 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -747,7 +747,7 @@ fn create_matching_words( let mut matching_word_cache = MatchingWordCache::default(); let mut matching_words = Vec::new(); ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?; - Ok(MatchingWords::new(matching_words)) + MatchingWords::new(matching_words) } pub type PrimitiveQuery = Vec;