From 62816dddded714266c4af396ac995def5d0d0379 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Wed, 26 Oct 2022 19:08:06 +0530 Subject: [PATCH 01/17] [WIP] Fix phrase search containing stop words Fixes #661 and meilisearch/meilisearch#2905 --- milli/src/search/criteria/attribute.rs | 1 + milli/src/search/criteria/exactness.rs | 4 ++-- milli/src/search/criteria/mod.rs | 12 +++++++--- milli/src/search/criteria/proximity.rs | 6 ++++- milli/src/search/criteria/typo.rs | 4 ++-- milli/src/search/query_tree.rs | 32 +++++++++++++++++++------- 6 files changed, 43 insertions(+), 16 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 7e55a1038..679381838 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { Phrase(words) => { let queries = words .iter() + .filter_map(|w| w.as_ref()) .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) .collect(); vec![queries] diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index d5b2ff0ee..0f0c24723 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -298,7 +298,7 @@ fn attribute_start_with_docids( pos += 1; } Phrase(phrase) => { - for word in phrase { + for word in phrase.iter().filter_map(|w| w.as_ref()) { let wc = ctx.word_position_docids(word, pos)?; if let Some(word_candidates) = wc { attribute_candidates_array.push(word_candidates); @@ -323,7 +323,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap { #[derive(Debug, Clone)] pub enum ExactQueryPart { - Phrase(Vec), + Phrase(Vec>), Synonyms(Vec), } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1b46c8441..96ed0bf6c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -418,15 +418,21 @@ pub fn resolve_query_tree( resolve_operation(ctx, query_tree, wdcache) } -pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { +pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result { let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = phrase.len().min(3); for win in phrase.windows(winsize) { // Get all the documents with the matching distance for each word pairs. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + for (offset, s1) in win.iter().filter_map(|w| w.as_ref()).enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| { + if let Some(word) = word { + Some((index, word)) + } else { + None + } + }) { if dist == 0 { match ctx.word_pair_proximity_docids(s1, s2, 1)? { Some(m) => bitmaps.push(m), diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b7c10a2e0..db8592a1d 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -188,9 +188,13 @@ fn resolve_candidates<'t>( if proximity == 0 { let most_left = words .first() + .map(|o| o.as_ref()) + .flatten() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); let most_right = words .last() + .map(|o| o.as_ref()) + .flatten() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); match (most_left, most_right) { @@ -473,7 +477,7 @@ fn resolve_plane_sweep_candidates( } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); - for word in words { + for word in words.iter().filter_map(|w| w.as_ref()) { let positions = match words_positions.get(word) { Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), None => return Ok(vec![]), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 76bd04d20..758069642 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::mem::take; +use itertools::Itertools; use log::debug; use roaring::RoaringBitmap; @@ -259,8 +260,7 @@ fn resolve_candidates<'t>( Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); + for (left, right) in words.iter().filter_map(|w| w.as_ref()).tuple_windows() { match ctx.word_pair_proximity_docids(left, right, 1)? { Some(pair_docids) => { if pair_docids.is_empty() { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9b4b38f76..4da4b3317 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -18,8 +18,9 @@ type IsPrefix = bool; #[derive(Clone, PartialEq, Eq, Hash)] pub enum Operation { And(Vec), - // serie of consecutive non prefix and exact words - Phrase(Vec), + // series of consecutive non prefix and exact words + // `None` means a stop word. + Phrase(Vec>), Or(IsOptionalWord, Vec), Query(Query), } @@ -75,9 +76,13 @@ impl Operation { } } - fn phrase(mut words: Vec) -> Self { + fn phrase(mut words: Vec>) -> Self { if words.len() == 1 { - Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) }) + if let Some(word) = words.pop().unwrap() { + Self::Query(Query { prefix: false, kind: QueryKind::exact(word) }) + } else { + Self::Phrase(words) + } } else { Self::Phrase(words) } @@ -370,7 +375,10 @@ fn create_query_tree( PrimitiveQueryPart::Word(word, prefix) => { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); if let Some((left, right)) = split_best_frequency(ctx, &word)? { - children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); + children.push(Operation::Phrase(vec![ + Some(left.to_string()), + Some(right.to_string()), + ])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let exact_words = ctx.exact_words(); @@ -583,7 +591,11 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect(); + let words = words + .into_iter() + .filter_map(|w| w) + .map(|w| MatchingWord::new(w, 0, false)) + .collect(); matching_words.push((words, ids)); } } @@ -685,7 +697,7 @@ pub type PrimitiveQuery = Vec; #[derive(Debug, Clone)] pub enum PrimitiveQueryPart { - Phrase(Vec), + Phrase(Vec>), Word(String, IsPrefix), } @@ -735,7 +747,11 @@ where // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { - phrase.push(token.lemma().to_string()); + if stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { + phrase.push(None) + } else { + phrase.push(Some(token.lemma().to_string())); + } } else if peekable.peek().is_some() { if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { primitive_query From 6a10b679ca3330f2cc083b6c5b1674c654153b7e Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 19:13:40 +0530 Subject: [PATCH 02/17] Add test for phrase search with stop words Originally written by ManyTheFish here: https://gist.github.com/ManyTheFish/f840e37cb2d2e029ce05396b4d540762 Co-authored-by: ManyTheFish --- milli/tests/search/mod.rs | 1 + milli/tests/search/phrase_search.rs | 35 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 milli/tests/search/phrase_search.rs diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c8b01648c..78640cfb9 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -18,6 +18,7 @@ mod filters; mod query_criteria; mod sort; mod typo_tolerance; +mod phrase_search; pub const TEST_QUERY: &'static str = "hello world america"; diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs new file mode 100644 index 000000000..87b3fd511 --- /dev/null +++ b/milli/tests/search/phrase_search.rs @@ -0,0 +1,35 @@ +use milli::update::{IndexerConfig, Settings}; +use milli::{Index, Search, TermsMatchingStrategy}; + +fn set_stop_words(index: &Index, stop_words: &[&str]) { + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + let mut builder = Settings::new(&mut wtxn, &index, &config); + let stop_words = stop_words.into_iter().map(|s| s.to_string()).collect(); + builder.set_stop_words(stop_words); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); +} + +#[test] +fn test_phrase_search_with_stop_words() { + let criteria = []; + let index = super::setup_search_index_with_criteria(&criteria); + + // Add stop_words + set_stop_words(&index, &["a", "an", "the", "of"]); + + // Phrase search containing stop words + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("\"the use of force\""); + search.limit(10); + search.authorize_typos(false); + search.terms_matching_strategy(TermsMatchingStrategy::All); + + let result = search.execute().unwrap(); + // 1 document should match + assert_eq!(result.documents_ids.len(), 1); +} From ef13c6a5b602192ddd2424036424884a1be31a4a Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 22:39:55 +0530 Subject: [PATCH 03/17] Perform filter after enumerate to keep origin indices --- milli/src/search/criteria/mod.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 96ed0bf6c..631dd2385 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -425,7 +425,13 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result Date: Thu, 13 Oct 2022 22:40:25 +0530 Subject: [PATCH 04/17] Increment position even when it's a stop word in exactness criteria --- milli/src/search/criteria/exactness.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 0f0c24723..580031697 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -298,10 +298,12 @@ fn attribute_start_with_docids( pos += 1; } Phrase(phrase) => { - for word in phrase.iter().filter_map(|w| w.as_ref()) { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - attribute_candidates_array.push(word_candidates); + for word in phrase { + if let Some(word) = word { + let wc = ctx.word_position_docids(word, pos)?; + if let Some(word_candidates) = wc { + attribute_candidates_array.push(word_candidates); + } } pos += 1; } From 3e190503e6a3b73126476def1be8e866f9c63464 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 22:47:41 +0530 Subject: [PATCH 05/17] Search for closest non-stop words in proximity criteria --- milli/src/search/criteria/proximity.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index db8592a1d..5aa3cc8b3 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -187,14 +187,15 @@ fn resolve_candidates<'t>( Phrase(words) => { if proximity == 0 { let most_left = words - .first() - .map(|o| o.as_ref()) - .flatten() + .iter() + .filter_map(|o| o.as_ref()) + .next() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); let most_right = words - .last() - .map(|o| o.as_ref()) - .flatten() + .iter() + .rev() + .filter_map(|o| o.as_ref()) + .next() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); match (most_left, most_right) { From c8c666c6a6e93a28122b034b5f386fe42e94c0b1 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 23:17:28 +0530 Subject: [PATCH 06/17] Use resolve_phrase in exactness and typo criteria --- milli/src/search/criteria/typo.rs | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 758069642..2ae35e418 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -2,7 +2,6 @@ use std::borrow::Cow; use std::collections::HashMap; use std::mem::take; -use itertools::Itertools; use log::debug; use roaring::RoaringBitmap; @@ -10,6 +9,7 @@ use super::{ query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, CriterionResult, }; +use crate::search::criteria::resolve_phrase; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::Result; @@ -257,26 +257,7 @@ fn resolve_candidates<'t>( match query_tree { And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), - Phrase(words) => { - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for (left, right) in words.iter().filter_map(|w| w.as_ref()).tuple_windows() { - match ctx.word_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => { - if pair_docids.is_empty() { - return Ok(RoaringBitmap::new()); - } else if first_loop { - candidates = pair_docids; - first_loop = false; - } else { - candidates &= pair_docids; - } - } - None => return Ok(RoaringBitmap::new()), - } - } - Ok(candidates) - } + Phrase(words) => resolve_phrase(ctx, words), Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { From d187b32a2847f9a5887ffe7806b19def927ad7c3 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 23:30:58 +0530 Subject: [PATCH 07/17] Fix snapshots to use new phrase type --- milli/src/search/query_tree.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4da4b3317..25366461c 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1081,7 +1081,7 @@ mod test { OR AND OR - PHRASE ["word", "split"] + PHRASE [Some("word"), Some("split")] Tolerant { word: "wordsplit", max typo: 2 } Exact { word: "fish" } Tolerant { word: "wordsplitfish", max typo: 1 } @@ -1100,7 +1100,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" OR - PHRASE ["quickbrown", "fox"] + PHRASE [Some("quickbrown"), Some("fox")] PrefixTolerant { word: "quickbrownfox", max typo: 2 } "###); } @@ -1117,7 +1117,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "friends"] + PHRASE [Some("hey"), Some("friends")] Exact { word: "wooop" } "###); } @@ -1154,8 +1154,8 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "friends"] - PHRASE ["wooop", "wooop"] + PHRASE [Some("hey"), Some("friends")] + PHRASE [Some("wooop"), Some("wooop")] "###); } @@ -1203,7 +1203,7 @@ mod test { .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" - PHRASE ["hey", "my"] + PHRASE [Some("hey"), Some("my")] "###); } @@ -1268,7 +1268,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "my"] + PHRASE [Some("hey"), Some("my")] Exact { word: "good" } "###); } From bb9ce3c5c57a7507c84af9a237b3e227d8832b97 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 23:34:17 +0530 Subject: [PATCH 08/17] Run cargo fmt --- milli/src/search/criteria/mod.rs | 12 ++++++------ milli/tests/search/mod.rs | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 631dd2385..a6a0c7f92 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -426,12 +426,12 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result Date: Thu, 13 Oct 2022 23:54:49 +0530 Subject: [PATCH 09/17] Fix panic when phrase contains only one stop word and nothing else --- milli/src/search/criteria/mod.rs | 5 +++++ milli/tests/search/phrase_search.rs | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index a6a0c7f92..3657df73e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -422,6 +422,11 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result Date: Thu, 20 Oct 2022 18:35:39 +0530 Subject: [PATCH 10/17] Simplify stop word checking in create_primitive_query --- milli/src/search/query_tree.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 25366461c..5042f4762 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -4,7 +4,6 @@ use std::{fmt, mem}; use charabia::classifier::ClassifiedTokenIter; use charabia::{SeparatorKind, TokenKind}; -use fst::Set; use roaring::RoaringBitmap; use slice_group_by::GroupBy; @@ -269,8 +268,7 @@ impl<'a> QueryTreeBuilder<'a> { &self, query: ClassifiedTokenIter, ) -> Result> { - let stop_words = self.index.stop_words(self.rtxn)?; - let primitive_query = create_primitive_query(query, stop_words, self.words_limit); + let primitive_query = create_primitive_query(query, self.words_limit); if !primitive_query.is_empty() { let qt = create_query_tree( self, @@ -722,7 +720,6 @@ impl PrimitiveQueryPart { /// the primitive query is an intermediate state to build the query tree. fn create_primitive_query( query: ClassifiedTokenIter, - stop_words: Option>, words_limit: Option, ) -> PrimitiveQuery where @@ -747,13 +744,14 @@ where // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { - if stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { + if let TokenKind::StopWord = token.kind { phrase.push(None) } else { phrase.push(Some(token.lemma().to_string())); } } else if peekable.peek().is_some() { - if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { + if let TokenKind::StopWord = token.kind { + } else { primitive_query .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false)); } @@ -836,7 +834,7 @@ mod test { words_limit: Option, query: ClassifiedTokenIter, ) -> Result> { - let primitive_query = create_primitive_query(query, None, words_limit); + let primitive_query = create_primitive_query(query, words_limit); if !primitive_query.is_empty() { let qt = create_query_tree( self, From f1da623af3edbf0e5f5ffee9e00f4a7db0babbf0 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 20 Oct 2022 18:41:37 +0530 Subject: [PATCH 11/17] Add test for phrase search with stop words and all criteria at once Moved the actual test into a separate function used by both the existing test and the new test. --- milli/tests/search/phrase_search.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs index 313833543..e255927d8 100644 --- a/milli/tests/search/phrase_search.rs +++ b/milli/tests/search/phrase_search.rs @@ -1,5 +1,6 @@ +use crate::search::Criterion::{Attribute, Exactness, Proximity}; use milli::update::{IndexerConfig, Settings}; -use milli::{Index, Search, TermsMatchingStrategy}; +use milli::{Criterion, Index, Search, TermsMatchingStrategy}; fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut wtxn = index.write_txn().unwrap(); @@ -12,9 +13,7 @@ fn set_stop_words(index: &Index, stop_words: &[&str]) { wtxn.commit().unwrap(); } -#[test] -fn test_phrase_search_with_stop_words() { - let criteria = []; +fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) { let index = super::setup_search_index_with_criteria(&criteria); // Add stop_words @@ -42,3 +41,15 @@ fn test_phrase_search_with_stop_words() { let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 0); } + +#[test] +fn test_phrase_search_with_stop_words_no_criteria() { + let criteria = []; + test_phrase_search_with_stop_words_given_criteria(&criteria); +} + +#[test] +fn test_phrase_search_with_stop_words_all_criteria() { + let criteria = [Proximity, Attribute, Exactness]; + test_phrase_search_with_stop_words_given_criteria(&criteria); +} From af33d22f2582f4ef40e9280100f8b72db6514daa Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 20 Oct 2022 19:02:08 +0530 Subject: [PATCH 12/17] Consecutive is false when at least 1 stop word is surrounded by words --- milli/src/search/criteria/proximity.rs | 29 +++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 5aa3cc8b3..5f414d84c 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -478,14 +478,29 @@ fn resolve_plane_sweep_candidates( } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); - for word in words.iter().filter_map(|w| w.as_ref()) { - let positions = match words_positions.get(word) { - Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), - None => return Ok(vec![]), - }; - groups_positions.push(positions); + let mut consecutive = true; + let mut was_last_word_a_stop_word = false; + for word in words.iter() { + if let Some(word) = word { + let positions = match words_positions.get(word) { + Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), + None => return Ok(vec![]), + }; + groups_positions.push(positions); + + if was_last_word_a_stop_word { + consecutive = false; + } + was_last_word_a_stop_word = false; + } else { + if !was_last_word_a_stop_word { + consecutive = false; + } + + was_last_word_a_stop_word = true; + } } - plane_sweep(groups_positions, true)? + plane_sweep(groups_positions, consecutive)? } Or(_, ops) => { let mut result = Vec::new(); From 488d31ecdf7ce4a060e507505634e2a752b13cb0 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 20 Oct 2022 19:08:21 +0530 Subject: [PATCH 13/17] Run cargo fmt --- milli/tests/search/phrase_search.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs index e255927d8..e4f5d73f9 100644 --- a/milli/tests/search/phrase_search.rs +++ b/milli/tests/search/phrase_search.rs @@ -1,7 +1,8 @@ -use crate::search::Criterion::{Attribute, Exactness, Proximity}; use milli::update::{IndexerConfig, Settings}; use milli::{Criterion, Index, Search, TermsMatchingStrategy}; +use crate::search::Criterion::{Attribute, Exactness, Proximity}; + fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); From 752d031010fecdf48f823bee809fb2dfdae8be26 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Wed, 26 Oct 2022 23:07:20 +0530 Subject: [PATCH 14/17] Update phrase search to use new `execute` method --- milli/tests/search/phrase_search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs index e4f5d73f9..ca5eaad48 100644 --- a/milli/tests/search/phrase_search.rs +++ b/milli/tests/search/phrase_search.rs @@ -10,7 +10,7 @@ fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut builder = Settings::new(&mut wtxn, &index, &config); let stop_words = stop_words.into_iter().map(|s| s.to_string()).collect(); builder.set_stop_words(stop_words); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); } From d35afa0cf532b357a710bbe53983e3508a11be02 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Wed, 26 Oct 2022 23:10:48 +0530 Subject: [PATCH 15/17] Change consecutive phrase search grouping logic Co-authored-by: ManyTheFish --- milli/src/search/criteria/proximity.rs | 42 ++++++++++++++------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 5f414d84c..b9cf47c8e 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -4,6 +4,7 @@ use std::mem::take; use log::debug; use roaring::RoaringBitmap; +use slice_group_by::GroupBy; use super::{ query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, @@ -478,29 +479,30 @@ fn resolve_plane_sweep_candidates( } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); - let mut consecutive = true; - let mut was_last_word_a_stop_word = false; - for word in words.iter() { - if let Some(word) = word { - let positions = match words_positions.get(word) { - Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), - None => return Ok(vec![]), - }; - groups_positions.push(positions); - if was_last_word_a_stop_word { - consecutive = false; - } - was_last_word_a_stop_word = false; - } else { - if !was_last_word_a_stop_word { - consecutive = false; - } - - was_last_word_a_stop_word = true; + // group stop_words together. + for words in words.linear_group_by_key(Option::is_none) { + // skip if it's a group of stop words. + if matches!(words.first(), None | Some(None)) { + continue; } + // make a consecutive plane-sweep on the subgroup of words. + let mut subgroup = Vec::with_capacity(words.len()); + for word in words.into_iter().map(|w| w.as_deref().unwrap()) { + match words_positions.get(word) { + Some(positions) => { + subgroup.push(positions.iter().map(|p| (p, 0, p)).collect()) + } + None => return Ok(vec![]), + } + } + groups_positions.push(plane_sweep(subgroup, true)?); + } + match groups_positions.len() { + 0 => vec![], + 1 => groups_positions.pop().unwrap(), + _ => plane_sweep(groups_positions, false)?, } - plane_sweep(groups_positions, consecutive)? } Or(_, ops) => { let mut result = Vec::new(); From 03eb5d87c17c545dc28aa6d886e5e6ce0befb9b7 Mon Sep 17 00:00:00 2001 From: Samyak Sarnayak Date: Fri, 28 Oct 2022 19:32:05 +0530 Subject: [PATCH 16/17] Only call plane_sweep on subgroups when 2 or more are present --- milli/src/search/criteria/proximity.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b9cf47c8e..6b09ee2fe 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -496,7 +496,11 @@ fn resolve_plane_sweep_candidates( None => return Ok(vec![]), } } - groups_positions.push(plane_sweep(subgroup, true)?); + match subgroup.len() { + 0 => {}, + 1 => groups_positions.push(subgroup.pop().unwrap()), + _ => groups_positions.push(plane_sweep(subgroup, true)?), + } } match groups_positions.len() { 0 => vec![], From ecb88143f9e5b1be49db8a0d1d59fa15b47a0c99 Mon Sep 17 00:00:00 2001 From: Samyak Sarnayak Date: Fri, 28 Oct 2022 19:37:02 +0530 Subject: [PATCH 17/17] Run cargo fmt --- milli/src/search/criteria/proximity.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 6b09ee2fe..d51047821 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -497,7 +497,7 @@ fn resolve_plane_sweep_candidates( } } match subgroup.len() { - 0 => {}, + 0 => {} 1 => groups_positions.push(subgroup.pop().unwrap()), _ => groups_positions.push(plane_sweep(subgroup, true)?), }