diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 7e55a1038..679381838 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { Phrase(words) => { let queries = words .iter() + .filter_map(|w| w.as_ref()) .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) .collect(); vec![queries] diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index d5b2ff0ee..0f0c24723 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -298,7 +298,7 @@ fn attribute_start_with_docids( pos += 1; } Phrase(phrase) => { - for word in phrase { + for word in phrase.iter().filter_map(|w| w.as_ref()) { let wc = ctx.word_position_docids(word, pos)?; if let Some(word_candidates) = wc { attribute_candidates_array.push(word_candidates); @@ -323,7 +323,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap { #[derive(Debug, Clone)] pub enum ExactQueryPart { - Phrase(Vec<String>), + Phrase(Vec<Option<String>>), Synonyms(Vec<String>), } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1b46c8441..96ed0bf6c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -418,15 +418,21 @@ pub fn resolve_query_tree( resolve_operation(ctx, query_tree, wdcache) } -pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBitmap> { +pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option<String>]) -> Result<RoaringBitmap> { let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = phrase.len().min(3); for win in phrase.windows(winsize) { // Get all the documents with the matching distance for each word pairs. 
let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + for (offset, s1) in win.iter().filter_map(|w| w.as_ref()).enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| { + if let Some(word) = word { + Some((index, word)) + } else { + None + } + }) { if dist == 0 { match ctx.word_pair_proximity_docids(s1, s2, 1)? { Some(m) => bitmaps.push(m), diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b7c10a2e0..db8592a1d 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -188,9 +188,13 @@ fn resolve_candidates<'t>( if proximity == 0 { let most_left = words .first() + .map(|o| o.as_ref()) + .flatten() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); let most_right = words .last() + .map(|o| o.as_ref()) + .flatten() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); match (most_left, most_right) { @@ -473,7 +477,7 @@ fn resolve_plane_sweep_candidates( } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); - for word in words { + for word in words.iter().filter_map(|w| w.as_ref()) { let positions = match words_positions.get(word) { Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), None => return Ok(vec![]), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 76bd04d20..758069642 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::mem::take; +use itertools::Itertools; use log::debug; use roaring::RoaringBitmap; @@ -259,8 +260,7 @@ fn resolve_candidates<'t>( Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for slice in words.windows(2) { - let (left, right) = 
(&slice[0], &slice[1]); + for (left, right) in words.iter().filter_map(|w| w.as_ref()).tuple_windows() { match ctx.word_pair_proximity_docids(left, right, 1)? { Some(pair_docids) => { if pair_docids.is_empty() { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9b4b38f76..4da4b3317 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -18,8 +18,9 @@ type IsPrefix = bool; #[derive(Clone, PartialEq, Eq, Hash)] pub enum Operation { And(Vec<Operation>), - // serie of consecutive non prefix and exact words - Phrase(Vec<String>), + // series of consecutive non prefix and exact words + // `None` means a stop word. + Phrase(Vec<Option<String>>), Or(IsOptionalWord, Vec<Operation>), Query(Query), } @@ -75,9 +76,13 @@ impl Operation { } } - fn phrase(mut words: Vec<String>) -> Self { + fn phrase(mut words: Vec<Option<String>>) -> Self { if words.len() == 1 { - Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) }) + if let Some(word) = words.pop().unwrap() { + Self::Query(Query { prefix: false, kind: QueryKind::exact(word) }) + } else { + Self::Phrase(words) + } } else { Self::Phrase(words) } @@ -370,7 +375,10 @@ fn create_query_tree( PrimitiveQueryPart::Word(word, prefix) => { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); if let Some((left, right)) = split_best_frequency(ctx, &word)? 
{ - children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); + children.push(Operation::Phrase(vec![ + Some(left.to_string()), + Some(right.to_string()), + ])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let exact_words = ctx.exact_words(); @@ -583,7 +591,11 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect(); + let words = words + .into_iter() + .filter_map(|w| w) + .map(|w| MatchingWord::new(w, 0, false)) + .collect(); matching_words.push((words, ids)); } } @@ -685,7 +697,7 @@ pub type PrimitiveQuery = Vec<PrimitiveQueryPart>; #[derive(Debug, Clone)] pub enum PrimitiveQueryPart { - Phrase(Vec<String>), + Phrase(Vec<Option<String>>), Word(String, IsPrefix), } @@ -735,7 +747,11 @@ where // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { - phrase.push(token.lemma().to_string()); + if stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { + phrase.push(None) + } else { + phrase.push(Some(token.lemma().to_string())); + } } else if peekable.peek().is_some() { if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { primitive_query