diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 7e55a1038..679381838 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { Phrase(words) => { let queries = words .iter() + .filter_map(|w| w.as_ref()) .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) .collect(); vec![queries] diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index d5b2ff0ee..580031697 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -299,9 +299,11 @@ fn attribute_start_with_docids( } Phrase(phrase) => { for word in phrase { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - attribute_candidates_array.push(word_candidates); + if let Some(word) = word { + let wc = ctx.word_position_docids(word, pos)?; + if let Some(word_candidates) = wc { + attribute_candidates_array.push(word_candidates); + } } pos += 1; } @@ -323,7 +325,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap { #[derive(Debug, Clone)] pub enum ExactQueryPart { - Phrase(Vec), + Phrase(Vec>), Synonyms(Vec), } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1b46c8441..3657df73e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -418,15 +418,32 @@ pub fn resolve_query_tree( resolve_operation(ctx, query_tree, wdcache) } -pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { +pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result { let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = phrase.len().min(3); + + if phrase.is_empty() { + return Ok(candidates); + } + for win in phrase.windows(winsize) { // Get all the documents with the matching distance for each word pairs. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + for (offset, s1) in win.iter().enumerate().filter_map(|(index, word)| { + if let Some(word) = word { + Some((index, word)) + } else { + None + } + }) { + for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| { + if let Some(word) = word { + Some((index, word)) + } else { + None + } + }) { if dist == 0 { match ctx.word_pair_proximity_docids(s1, s2, 1)? { Some(m) => bitmaps.push(m), diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b7c10a2e0..d51047821 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -4,6 +4,7 @@ use std::mem::take; use log::debug; use roaring::RoaringBitmap; +use slice_group_by::GroupBy; use super::{ query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, @@ -187,10 +188,15 @@ fn resolve_candidates<'t>( Phrase(words) => { if proximity == 0 { let most_left = words - .first() + .iter() + .filter_map(|o| o.as_ref()) + .next() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); let most_right = words - .last() + .iter() + .rev() + .filter_map(|o| o.as_ref()) + .next() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); match (most_left, most_right) { @@ -473,14 +479,34 @@ fn resolve_plane_sweep_candidates( } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); - for word in words { - let positions = match words_positions.get(word) { - Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), - None => return Ok(vec![]), - }; - groups_positions.push(positions); + + // group stop_words together. + for words in words.linear_group_by_key(Option::is_none) { + // skip if it's a group of stop words. + if matches!(words.first(), None | Some(None)) { + continue; + } + // make a consecutive plane-sweep on the subgroup of words. + let mut subgroup = Vec::with_capacity(words.len()); + for word in words.into_iter().map(|w| w.as_deref().unwrap()) { + match words_positions.get(word) { + Some(positions) => { + subgroup.push(positions.iter().map(|p| (p, 0, p)).collect()) + } + None => return Ok(vec![]), + } + } + match subgroup.len() { + 0 => {} + 1 => groups_positions.push(subgroup.pop().unwrap()), + _ => groups_positions.push(plane_sweep(subgroup, true)?), + } + } + match groups_positions.len() { + 0 => vec![], + 1 => groups_positions.pop().unwrap(), + _ => plane_sweep(groups_positions, false)?, } - plane_sweep(groups_positions, true)? } Or(_, ops) => { let mut result = Vec::new(); diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 76bd04d20..2ae35e418 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -9,6 +9,7 @@ use super::{ query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, CriterionResult, }; +use crate::search::criteria::resolve_phrase; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::Result; @@ -256,27 +257,7 @@ fn resolve_candidates<'t>( match query_tree { And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), - Phrase(words) => { - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); - match ctx.word_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => { - if pair_docids.is_empty() { - return Ok(RoaringBitmap::new()); - } else if first_loop { - candidates = pair_docids; - first_loop = false; - } else { - candidates &= pair_docids; - } - } - None => return Ok(RoaringBitmap::new()), - } - } - Ok(candidates) - } + Phrase(words) => resolve_phrase(ctx, words), Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9b4b38f76..5042f4762 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -4,7 +4,6 @@ use std::{fmt, mem}; use charabia::classifier::ClassifiedTokenIter; use charabia::{SeparatorKind, TokenKind}; -use fst::Set; use roaring::RoaringBitmap; use slice_group_by::GroupBy; @@ -18,8 +17,9 @@ type IsPrefix = bool; #[derive(Clone, PartialEq, Eq, Hash)] pub enum Operation { And(Vec), - // serie of consecutive non prefix and exact words - Phrase(Vec), + // series of consecutive non prefix and exact words + // `None` means a stop word. + Phrase(Vec>), Or(IsOptionalWord, Vec), Query(Query), } @@ -75,9 +75,13 @@ impl Operation { } } - fn phrase(mut words: Vec) -> Self { + fn phrase(mut words: Vec>) -> Self { if words.len() == 1 { - Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) }) + if let Some(word) = words.pop().unwrap() { + Self::Query(Query { prefix: false, kind: QueryKind::exact(word) }) + } else { + Self::Phrase(words) + } } else { Self::Phrase(words) } @@ -264,8 +268,7 @@ impl<'a> QueryTreeBuilder<'a> { &self, query: ClassifiedTokenIter, ) -> Result> { - let stop_words = self.index.stop_words(self.rtxn)?; - let primitive_query = create_primitive_query(query, stop_words, self.words_limit); + let primitive_query = create_primitive_query(query, self.words_limit); if !primitive_query.is_empty() { let qt = create_query_tree( self, @@ -370,7 +373,10 @@ fn create_query_tree( PrimitiveQueryPart::Word(word, prefix) => { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); if let Some((left, right)) = split_best_frequency(ctx, &word)? { - children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); + children.push(Operation::Phrase(vec![ + Some(left.to_string()), + Some(right.to_string()), + ])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let exact_words = ctx.exact_words(); @@ -583,7 +589,11 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect(); + let words = words + .into_iter() + .filter_map(|w| w) + .map(|w| MatchingWord::new(w, 0, false)) + .collect(); matching_words.push((words, ids)); } } @@ -685,7 +695,7 @@ pub type PrimitiveQuery = Vec; #[derive(Debug, Clone)] pub enum PrimitiveQueryPart { - Phrase(Vec), + Phrase(Vec>), Word(String, IsPrefix), } @@ -710,7 +720,6 @@ impl PrimitiveQueryPart { /// the primitive query is an intermediate state to build the query tree. fn create_primitive_query( query: ClassifiedTokenIter, - stop_words: Option>, words_limit: Option, ) -> PrimitiveQuery where @@ -735,9 +744,14 @@ where // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { - phrase.push(token.lemma().to_string()); + if let TokenKind::StopWord = token.kind { + phrase.push(None) + } else { + phrase.push(Some(token.lemma().to_string())); + } } else if peekable.peek().is_some() { - if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { + if let TokenKind::StopWord = token.kind { + } else { primitive_query .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false)); } @@ -820,7 +834,7 @@ mod test { words_limit: Option, query: ClassifiedTokenIter, ) -> Result> { - let primitive_query = create_primitive_query(query, None, words_limit); + let primitive_query = create_primitive_query(query, words_limit); if !primitive_query.is_empty() { let qt = create_query_tree( self, @@ -1065,7 +1079,7 @@ mod test { OR AND OR - PHRASE ["word", "split"] + PHRASE [Some("word"), Some("split")] Tolerant { word: "wordsplit", max typo: 2 } Exact { word: "fish" } Tolerant { word: "wordsplitfish", max typo: 1 } @@ -1084,7 +1098,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" OR - PHRASE ["quickbrown", "fox"] + PHRASE [Some("quickbrown"), Some("fox")] PrefixTolerant { word: "quickbrownfox", max typo: 2 } "###); } @@ -1101,7 +1115,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "friends"] + PHRASE [Some("hey"), Some("friends")] Exact { word: "wooop" } "###); } @@ -1138,8 +1152,8 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "friends"] - PHRASE ["wooop", "wooop"] + PHRASE [Some("hey"), Some("friends")] + PHRASE [Some("wooop"), Some("wooop")] "###); } @@ -1187,7 +1201,7 @@ mod test { .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" - PHRASE ["hey", "my"] + PHRASE [Some("hey"), Some("my")] "###); } @@ -1252,7 +1266,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "my"] + PHRASE [Some("hey"), Some("my")] Exact { word: "good" } "###); } diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c8b01648c..8de4c80aa 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -15,6 +15,7 @@ use slice_group_by::GroupBy; mod distinct; mod facet_distribution; mod filters; +mod phrase_search; mod query_criteria; mod sort; mod typo_tolerance; diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs new file mode 100644 index 000000000..ca5eaad48 --- /dev/null +++ b/milli/tests/search/phrase_search.rs @@ -0,0 +1,56 @@ +use milli::update::{IndexerConfig, Settings}; +use milli::{Criterion, Index, Search, TermsMatchingStrategy}; + +use crate::search::Criterion::{Attribute, Exactness, Proximity}; + +fn set_stop_words(index: &Index, stop_words: &[&str]) { + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + let mut builder = Settings::new(&mut wtxn, &index, &config); + let stop_words = stop_words.into_iter().map(|s| s.to_string()).collect(); + builder.set_stop_words(stop_words); + builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); +} + +fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) { + let index = super::setup_search_index_with_criteria(&criteria); + + // Add stop_words + set_stop_words(&index, &["a", "an", "the", "of"]); + + // Phrase search containing stop words + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("\"the use of force\""); + search.limit(10); + search.authorize_typos(false); + search.terms_matching_strategy(TermsMatchingStrategy::All); + + let result = search.execute().unwrap(); + // 1 document should match + assert_eq!(result.documents_ids.len(), 1); + + // test for a single stop word only, no other search terms + let mut search = Search::new(&txn, &index); + search.query("\"the\""); + search.limit(10); + search.authorize_typos(false); + search.terms_matching_strategy(TermsMatchingStrategy::All); + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); +} + +#[test] +fn test_phrase_search_with_stop_words_no_criteria() { + let criteria = []; + test_phrase_search_with_stop_words_given_criteria(&criteria); +} + +#[test] +fn test_phrase_search_with_stop_words_all_criteria() { + let criteria = [Proximity, Attribute, Exactness]; + test_phrase_search_with_stop_words_given_criteria(&criteria); +}