From 6841f167b42436d070245ecfa834c689e29f08c9 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 21 Feb 2023 18:02:10 +0100
Subject: [PATCH 1/4] Add test

---
 meilisearch/tests/search/mod.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs
index 91ff64d37..9ed3ffcd2 100644
--- a/meilisearch/tests/search/mod.rs
+++ b/meilisearch/tests/search/mod.rs
@@ -148,6 +148,27 @@ async fn simple_search() {
         .await;
 }
 
+#[actix_rt::test]
+async fn phrase_search_with_stop_word() {
+    // related to https://github.com/meilisearch/meilisearch/issues/3521
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let (_, code) = index.update_settings(json!({"stopWords": ["the", "of"]})).await;
+    meili_snap::snapshot!(code, @"202 Accepted");
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "how \"to\" train \"the" }), |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            assert_eq!(response["hits"].as_array().unwrap().len(), 1);
+        })
+        .await;
+}
+
 #[cfg(feature = "default")]
 #[actix_rt::test]
 async fn test_kanji_language_detection() {

From 28b7d73d4a8d12a91b07b8120d1dde4e168be47f Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 21 Feb 2023 18:16:22 +0100
Subject: [PATCH 2/4] Remove an unefficient part of a test on milli

---
 milli/tests/search/phrase_search.rs | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs
index 2e63c96c4..5b987ad30 100644
--- a/milli/tests/search/phrase_search.rs
+++ b/milli/tests/search/phrase_search.rs
@@ -32,15 +32,6 @@ fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) {
     let result = search.execute().unwrap();
     // 1 document should match
     assert_eq!(result.documents_ids.len(), 1);
-
-    // test for a single stop word only, no other search terms
-    let mut search = Search::new(&txn, &index);
-    search.query("\"the\"");
-    search.limit(10);
-    search.authorize_typos(false);
-    search.terms_matching_strategy(TermsMatchingStrategy::All);
-    let result = search.execute().unwrap();
-    assert_eq!(result.documents_ids.len(), 0);
 }
 
 #[test]

From 900bae3d9ddd30bdd501b5d5fabd76e125539df5 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 21 Feb 2023 17:52:59 +0100
Subject: [PATCH 3/4] keep phrases that has at least one word

---
 milli/src/search/query_tree.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 00c85aaba..431012bdb 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -825,9 +825,13 @@ where
                     quoted = !quoted;
                 }
                 // if there is a quote or a hard separator we close the phrase.
-                if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
-                {
-                    primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
+                if quote_count > 0 || separator_kind == SeparatorKind::Hard {
+                    let phrase = mem::take(&mut phrase);
+
+                    // if the phrase only contains stop words, we don't keep it in the query.
+                    if phrase.iter().any(|w| w.is_some()) {
+                        primitive_query.push(PrimitiveQueryPart::Phrase(phrase));
+                    }
                 }
             }
             _ => (),
@@ -835,7 +839,7 @@ where
     }
 
     // If a quote is never closed, we consider all of the end of the query as a phrase.
-    if !phrase.is_empty() {
+    if phrase.iter().any(|w| w.is_some()) {
         primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
     }
 

From 37489fd4958e09a8231bdc4842c40b23522ae0ac Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 1 Mar 2023 18:52:14 +0100
Subject: [PATCH 4/4] Return an internal error in the case of matching word is
 invalid

---
 benchmarks/benches/formatting.rs           |  2 +-
 milli/src/error.rs                         |  2 ++
 milli/src/search/matches/matching_words.rs | 14 +++++++++++---
 milli/src/search/matches/mod.rs            |  6 +++---
 milli/src/search/query_tree.rs             |  2 +-
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs
index 25d162411..2e0fa0ce7 100644
--- a/benchmarks/benches/formatting.rs
+++ b/benchmarks/benches/formatting.rs
@@ -29,7 +29,7 @@ fn bench_formatting(c: &mut criterion::Criterion) {
 	            (vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]),
 	            (vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]),
         	]
-            ), TokenizerBuilder::default().build()),
+            ).unwrap(), TokenizerBuilder::default().build()),
 		},
     ];
 
diff --git a/milli/src/error.rs b/milli/src/error.rs
index 1685f984a..9b1ec0a87 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -59,6 +59,8 @@ pub enum InternalError {
     Utf8(#[from] str::Utf8Error),
     #[error("An indexation process was explicitly aborted.")]
     AbortedIndexation,
+    #[error("The matching words list contains at least one invalid member.")]
+    InvalidMatchingWords,
 }
 
 #[derive(Error, Debug)]
diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs
index 22ba973b5..22241c457 100644
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@@ -7,6 +7,7 @@ use std::rc::Rc;
 use charabia::Token;
 use levenshtein_automata::{Distance, DFA};
 
+use crate::error::InternalError;
 use crate::search::build_dfa;
 use crate::MAX_WORD_LENGTH;
 
@@ -31,12 +32,19 @@ impl fmt::Debug for MatchingWords {
 }
 
 impl MatchingWords {
-    pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
+    pub fn new(
+        mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
+    ) -> crate::Result<Self> {
+        // if one of the matching_words vec doesn't contain a word.
+        if matching_words.iter().any(|(mw, _)| mw.is_empty()) {
+            return Err(InternalError::InvalidMatchingWords.into());
+        }
+
         // Sort word by len in DESC order prioritizing the longuest matches,
         // in order to highlight the longuest part of the matched word.
         matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
 
-        Self { inner: matching_words }
+        Ok(Self { inner: matching_words })
     }
 
     /// Returns an iterator over terms that match or partially match the given token.
@@ -360,7 +368,7 @@ mod tests {
             (vec![all[2].clone()], vec![2]),
         ];
 
-        let matching_words = MatchingWords::new(matching_words);
+        let matching_words = MatchingWords::new(matching_words).unwrap();
 
         assert_eq!(
             matching_words
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index 6ac5123a8..c634ae297 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -513,7 +513,7 @@ mod tests {
             (vec![all[2].clone()], vec![2]),
         ];
 
-        MatchingWords::new(matching_words)
+        MatchingWords::new(matching_words).unwrap()
     }
 
     impl MatcherBuilder<'_, Vec<u8>> {
@@ -600,7 +600,7 @@ mod tests {
         ];
         let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
 
-        let matching_words = MatchingWords::new(matching_words);
+        let matching_words = MatchingWords::new(matching_words).unwrap();
 
         let builder = MatcherBuilder::from_matching_words(matching_words);
 
@@ -847,7 +847,7 @@ mod tests {
             (vec![all[4].clone()], vec![2]),
         ];
 
-        let matching_words = MatchingWords::new(matching_words);
+        let matching_words = MatchingWords::new(matching_words).unwrap();
 
         let mut builder = MatcherBuilder::from_matching_words(matching_words);
         builder.highlight_prefix("_".to_string());
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 431012bdb..541dd8f7a 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -747,7 +747,7 @@ fn create_matching_words(
     let mut matching_word_cache = MatchingWordCache::default();
     let mut matching_words = Vec::new();
     ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?;
-    Ok(MatchingWords::new(matching_words))
+    MatchingWords::new(matching_words)
 }
 
 pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;