From c2b025946aa6dd64aebc1dfe8e9f591b0bd79290 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Thu, 30 Mar 2023 10:42:16 +0200
Subject: [PATCH 1/2] `located_query_terms_from_string`: use u16 for
 positions, hard-limit the number of iterated tokens.

- Refactor the phrase logic to reduce the number of possible states
---
 milli/src/search/new/query_graph.rs |   2 +-
 milli/src/search/new/query_term.rs  | 151 ++++++++++++++++++----------
 milli/src/search/new/words.rs       |   4 +-
 3 files changed, 102 insertions(+), 55 deletions(-)

diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index 0f06b9b95..acadb508c 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -267,7 +267,7 @@ impl QueryGraph {
     /// Remove all the nodes that correspond to a word starting at the given position, and connect
     /// the predecessors of these nodes to their successors.
     /// Return `true` if any node was removed.
-    pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool {
+    pub fn remove_words_starting_at_position(&mut self, position: u16) -> bool {
         let mut nodes_to_remove_keeping_edges = vec![];
         for (node_idx, node) in self.nodes.iter() {
             let QueryNodeData::Term(LocatedQueryTerm { value: _, positions }) = &node.data else { continue };
diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index 0850b2181..049b96646 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -1,5 +1,4 @@
 use std::collections::HashSet;
-use std::mem;
 use std::ops::RangeInclusive;
 
 use charabia::normalizer::NormalizedTokenIter;
@@ -414,8 +413,7 @@ impl QueryTerm {
 #[derive(Clone)]
 pub struct LocatedQueryTerm {
     pub value: Interned<QueryTerm>,
-    // TODO: consider changing to u8, or even a u16
-    pub positions: RangeInclusive<i8>,
+    pub positions: RangeInclusive<u16>,
 }
 
 impl LocatedQueryTerm {
@@ -425,9 +423,55 @@ impl LocatedQueryTerm {
     }
 }
 
+struct PhraseBuilder {
+    words: Vec<Option<Interned<String>>>,
+    start: u16,
+    end: u16,
+}
+
+impl PhraseBuilder {
+    fn empty() -> Self {
+        Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.words.is_empty()
+    }
+
+    // precondition: token has kind Word or StopWord
+    fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) {
+        if self.is_empty() {
+            self.start = position;
+        }
+        self.end = position;
+        if let TokenKind::StopWord = token.kind {
+            self.words.push(None);
+        } else {
+            // token has kind Word
+            let word = ctx.word_interner.insert(token.lemma().to_string());
+            // TODO: in a phrase, check that every word exists
+            // otherwise return an empty term
+            self.words.push(Some(word));
+        }
+    }
+
+    fn build(self, ctx: &mut SearchContext) -> Option<LocatedQueryTerm> {
+        if self.is_empty() {
+            return None;
+        }
+        Some(LocatedQueryTerm {
+            value: ctx.term_interner.insert(QueryTerm::phrase(
+                &mut ctx.word_interner,
+                &mut ctx.phrase_interner,
+                Phrase { words: self.words },
+            )),
+            positions: self.start..=self.end,
+        })
+    }
+}
+
 /// Convert the tokenised search query into a list of located query terms.
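+/// Positions are stored as `u16`, and the token stream is hard-capped at `MAX_TOKEN_COUNT`
+/// tokens, so positions stay well below `u16::MAX`.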
 // TODO: checking if the positions are correct for phrases, separators, ngrams
-// hard-limit the number of tokens that are considered
 pub fn located_query_terms_from_string(
     ctx: &mut SearchContext,
     query: NormalizedTokenIter<&[u8]>,
@@ -437,16 +481,17 @@ pub fn located_query_terms_from_string(
 
     let mut located_terms = Vec::new();
 
-    let mut phrase = Vec::new();
-    let mut quoted = false;
+    let mut phrase: Option<PhraseBuilder> = None;
 
     let parts_limit = words_limit.unwrap_or(usize::MAX);
 
-    let mut position = -1i8;
-    let mut phrase_start = -1i8;
-    let mut phrase_end = -1i8;
+    // Start with the last position, as we will wrap around to position 0 at the beginning of the loop below.
+    let mut position = u16::MAX;
 
-    let mut peekable = query.peekable();
+    // TODO: Loic, find proper value here so we don't overflow the interner.
+    const MAX_TOKEN_COUNT: usize = 1_000;
+
+    let mut peekable = query.take(MAX_TOKEN_COUNT).peekable();
     while let Some(token) = peekable.next() {
         // early return if word limit is exceeded
         if located_terms.len() >= parts_limit {
             break;
         }
 
         match token.kind {
             TokenKind::Word | TokenKind::StopWord => {
-                position += 1;
+                // On the first iteration, this wraps from u16::MAX to 0; afterwards it increments normally.
+                position = position.wrapping_add(1);
+
                 // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
                 // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
                 // 3. if the word is the last token of the query we push it as a prefix word.
-                if quoted {
-                    phrase_end = position;
-                    if phrase.is_empty() {
-                        phrase_start = position;
-                    }
-                    if let TokenKind::StopWord = token.kind {
-                        phrase.push(None);
-                    } else {
-                        let word = ctx.word_interner.insert(token.lemma().to_string());
-                        // TODO: in a phrase, check that every word exists
-                        // otherwise return an empty term
-                        phrase.push(Some(word));
-                    }
+                if let Some(phrase) = &mut phrase {
+                    phrase.push_word(ctx, &token, position)
                 } else if peekable.peek().is_some() {
                     match token.kind {
                         TokenKind::Word => {
                             position += 0;
                         }
                     }
-                let quote_count = token.lemma().chars().filter(|&s| s == '"').count();
-                // swap quoted state if we encounter a double quote
-                if quote_count % 2 != 0 {
-                    quoted = !quoted;
-                }
-                // if there is a quote or a hard separator we close the phrase.
-                // TODO: limit phrase size?
-                if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
-                {
-                    let located_query_term = LocatedQueryTerm {
-                        value: ctx.term_interner.insert(QueryTerm::phrase(
-                            &mut ctx.word_interner,
-                            &mut ctx.phrase_interner,
-                            Phrase { words: mem::take(&mut phrase) },
-                        )),
-                        positions: phrase_start..=phrase_end,
+
+                phrase = 'phrase: {
+                    let phrase = phrase.take();
+
+                    // If we have a hard separator inside a phrase, we immediately close it and start a new one
+                    let phrase = if separator_kind == SeparatorKind::Hard {
+                        if let Some(phrase) = phrase {
+                            if let Some(located_query_term) = phrase.build(ctx) {
+                                located_terms.push(located_query_term)
+                            }
+                            Some(PhraseBuilder::empty())
+                        } else {
+                            None
+                        }
+                    } else {
+                        phrase
                     };
-                    located_terms.push(located_query_term);
-                }
+
+                    // We close and start a new phrase depending on the number of double quotes
+                    let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
+                    if quote_count == 0 {
+                        break 'phrase phrase;
+                    }
+
+                    // Consume the closing quote and the phrase
+                    if let Some(phrase) = phrase {
+                        // Per the check above, quote_count > 0
+                        quote_count -= 1;
+                        if let Some(located_query_term) = phrase.build(ctx) {
+                            located_terms.push(located_query_term)
+                        }
+                    }
+
+                    // Start a new phrase if an opening quote is left unmatched
+                    (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
+                };
             }
             _ => (),
         }
     }
 
     // If a quote is never closed, we consider all of the end of the query as a phrase.
-    if !phrase.is_empty() {
-        let located_query_term = LocatedQueryTerm {
-            value: ctx.term_interner.insert(QueryTerm::phrase(
-                &mut ctx.word_interner,
-                &mut ctx.phrase_interner,
-                Phrase { words: mem::take(&mut phrase) },
-            )),
-            positions: phrase_start..=phrase_end,
-        };
-        located_terms.push(located_query_term);
+    if let Some(phrase) = phrase.take() {
+        if let Some(located_query_term) = phrase.build(ctx) {
+            located_terms.push(located_query_term);
+        }
     }
 
     Ok(located_terms)
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
index fb2c62f11..ba8086c00 100644
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@@ -12,7 +12,7 @@ pub struct Words {
     exhausted: bool, // TODO: remove
     query_graph: Option<QueryGraph>,
     iterating: bool, // TODO: remove
-    positions_to_remove: Vec<i8>,
+    positions_to_remove: Vec<u16>,
     terms_matching_strategy: TermsMatchingStrategy,
 }
 impl Words {
@@ -52,7 +52,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
                 QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {}
             }
         }
-        let mut r: Vec<i8> = all_positions.into_iter().collect();
+        let mut r: Vec<u16> = all_positions.into_iter().collect();
         // don't remove the first term
         r.remove(0);
         r

From 9b87c36200e6db60a5519f5a72feaa2062694cc7 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Fri, 31 Mar 2023 09:19:18 +0200
Subject: [PATCH 2/2] Limit the number of derivations for a single word.

---
 milli/src/search/new/limits.rs     | 18 +++++++++
 milli/src/search/new/mod.rs        |  1 +
 milli/src/search/new/query_term.rs | 62 +++++++++++++++++++++---------
 3 files changed, 63 insertions(+), 18 deletions(-)
 create mode 100644 milli/src/search/new/limits.rs

diff --git a/milli/src/search/new/limits.rs b/milli/src/search/new/limits.rs
new file mode 100644
index 000000000..33a5a4a6c
--- /dev/null
+++ b/milli/src/search/new/limits.rs
@@ -0,0 +1,18 @@
+/// Maximum number of tokens we consider in a single search.
+// TODO: Loic, find proper value here so we don't overflow the interner.
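+// (Interner indices are u16, so the total number of interned objects must stay below u16::MAX.)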
+pub const MAX_TOKEN_COUNT: usize = 1_000;
+
+/// Maximum number of prefixes that can be derived from a single word.
+pub const MAX_PREFIX_COUNT: usize = 1_000;
+/// Maximum number of words that can be derived from a single word at an edit distance of one from that word.
+pub const MAX_ONE_TYPO_COUNT: usize = 150;
+/// Maximum number of words that can be derived from a single word at an edit distance of two from that word.
+pub const MAX_TWO_TYPOS_COUNT: usize = 50;
+
+/// Maximum number of synonym phrases that can be derived from a single word.
+pub const MAX_SYNONYM_PHRASE_COUNT: usize = 50;
+
+/// Maximum number of words across all the synonym phrases that can be derived from a single word.
+///
+/// This limit is meant to gracefully handle the case where a word would have very long phrases as synonyms.
+pub const MAX_SYNONYM_WORD_COUNT: usize = 100;
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index ef3f6c047..707ba4ea6 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -2,6 +2,7 @@ mod db_cache;
 mod distinct;
 mod graph_based_ranking_rule;
 mod interner;
+mod limits;
 mod logger;
 mod query_graph;
 mod query_term;
diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index 049b96646..2c1b52414 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -10,7 +10,7 @@ use heed::RoTxn;
 use itertools::Itertools;
 
 use super::interner::{DedupInterner, Interned};
-use super::SearchContext;
+use super::{limits, SearchContext};
 use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
 use crate::search::{build_dfa, get_first};
 use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH};
@@ -266,6 +266,9 @@ pub fn query_term_from_word(
         let mut stream = fst.search(prefix).into_stream();
 
         while let Some(derived_word) = stream.next() {
+            if prefix_of.len() >= limits::MAX_PREFIX_COUNT {
+                break;
+            }
             let derived_word = std::str::from_utf8(derived_word)?.to_owned();
             let derived_word_interned = ctx.word_interner.insert(derived_word);
             if derived_word_interned != word_interned {
@@ -277,23 +280,31 @@ pub fn query_term_from_word(
         let dfa = build_dfa(word, 1, is_prefix);
         let starts = StartsWith(Str::new(get_first(word)));
         let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
-        // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them?
 
         while let Some((derived_word, state)) = stream.next() {
+            if prefix_of.len() >= limits::MAX_PREFIX_COUNT
+                && one_typo.len() >= limits::MAX_ONE_TYPO_COUNT
+            {
+                break;
+            }
             let derived_word = std::str::from_utf8(derived_word)?;
             let d = dfa.distance(state.1);
             let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned());
             match d.to_u8() {
                 0 => {
-                    if derived_word_interned != word_interned {
+                    if derived_word_interned != word_interned
+                        && prefix_of.len() < limits::MAX_PREFIX_COUNT
+                    {
                         prefix_of.push(derived_word_interned);
                     }
                 }
                 1 => {
-                    one_typo.push(derived_word_interned);
+                    if one_typo.len() < limits::MAX_ONE_TYPO_COUNT {
+                        one_typo.push(derived_word_interned);
+                    }
                 }
-                _ => panic!(),
+                _ => unreachable!("one-typo DFA produced a distance greater than 1"),
             }
         }
     } else {
@@ -304,14 +315,21 @@ pub fn query_term_from_word(
         let automaton = Union(first, &second);
 
         let mut stream = fst.search_with_state(automaton).into_stream();
-        // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them?
 
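+        // Walk the stream until all three derivation lists are full; each list also
+        // stops growing on its own once it reaches its cap (see the checks below).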
         while let Some((derived_word, state)) = stream.next() {
+            if prefix_of.len() >= limits::MAX_PREFIX_COUNT
+                && one_typo.len() >= limits::MAX_ONE_TYPO_COUNT
+                && two_typos.len() >= limits::MAX_TWO_TYPOS_COUNT
+            {
+                break;
+            }
             let derived_word = std::str::from_utf8(derived_word)?;
             let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned());
             // in the case the typo is on the first letter, we know the number of typo
             // is two
-            if get_first(derived_word) != get_first(word) {
-                two_typos.push(derived_word_interned);
+            if get_first(derived_word) != get_first(word) {
+                if two_typos.len() < limits::MAX_TWO_TYPOS_COUNT {
+                    two_typos.push(derived_word_interned);
+                }
             } else {
                 // Else, we know that it is the second dfa that matched and compute the
                 // correct distance
                 let d = second_dfa.distance((state.1).0);
                 match d.to_u8() {
                     0 => {
-                        if derived_word_interned != word_interned {
+                        if derived_word_interned != word_interned
+                            && prefix_of.len() < limits::MAX_PREFIX_COUNT
+                        {
                             prefix_of.push(derived_word_interned);
                         }
                     }
                     1 => {
-                        one_typo.push(derived_word_interned);
+                        if one_typo.len() < limits::MAX_ONE_TYPO_COUNT {
+                            one_typo.push(derived_word_interned);
+                        }
                     }
                     2 => {
-                        two_typos.push(derived_word_interned);
+                        if two_typos.len() < limits::MAX_TWO_TYPOS_COUNT {
+                            two_typos.push(derived_word_interned);
+                        }
                     }
-                    _ => panic!(),
+                    _ => unreachable!("two-typos DFA produced a distance greater than 2"),
                 }
             }
         }
 
     let synonyms = ctx.index.synonyms(ctx.txn)?;
-
+    let mut synonym_word_count = 0;
     let synonyms = synonyms
         .get(&vec![word.to_owned()])
         .cloned()
         .unwrap_or_default()
         .into_iter()
-        .map(|words| {
+        .take(limits::MAX_SYNONYM_PHRASE_COUNT)
+        .filter_map(|words| {
+            if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
+                return None;
+            }
+            synonym_word_count += words.len();
             let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
-            ctx.phrase_interner.insert(Phrase { words })
+            Some(ctx.phrase_interner.insert(Phrase { words }))
         })
         .collect();
 
@@ -488,10 +517,7 @@ pub fn located_query_terms_from_string(
     // Start with the last position, as we will wrap around to position 0 at the beginning of the loop below.
     let mut position = u16::MAX;
 
-    // TODO: Loic, find proper value here so we don't overflow the interner.
-    const MAX_TOKEN_COUNT: usize = 1_000;
-
-    let mut peekable = query.take(MAX_TOKEN_COUNT).peekable();
+    let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
     while let Some(token) = peekable.next() {
         // early return if word limit is exceeded
         if located_terms.len() >= parts_limit {
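
Reviewer note on patch 1: the `let mut position = u16::MAX` initialisation only makes
sense together with the `wrapping_add` in the loop. A two-assert sketch of the intended
behaviour, in plain Rust, not part of the patch:

fn main() {
    // Wrapping arithmetic: starting one below zero makes the first word land on position 0.
    let mut position: u16 = u16::MAX;
    position = position.wrapping_add(1);
    assert_eq!(position, 0);
    position = position.wrapping_add(1);
    assert_eq!(position, 1);
}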
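Reviewer note on the phrase logic: the quote/hard-separator handling is easier to review
against a standalone model. The sketch below is not milli code (`Token` and `phrases` are
invented stand-ins, each `"` arrives as its own token, and a bare word outside a phrase,
which the real code turns into a word term, is simply ignored here), but it implements the
same state machine: a quote toggles phrase mode, a hard separator inside an open phrase
flushes it and immediately reopens one, and an unclosed quote turns the rest of the query
into a phrase.

enum Token<'a> {
    Word(&'a str),
    Quote,   // a `"` separator
    HardSep, // e.g. `.` or `;`
    SoftSep, // e.g. a space
}

fn phrases<'a>(tokens: impl Iterator<Item = Token<'a>>) -> Vec<Vec<&'a str>> {
    let mut out = Vec::new();
    // `None` = no phrase open; `Some(words)` = a phrase is being collected.
    let mut phrase: Option<Vec<&'a str>> = None;
    for token in tokens {
        match token {
            Token::Word(w) => {
                if let Some(p) = &mut phrase {
                    p.push(w);
                }
            }
            // A hard separator inside a phrase flushes it and immediately reopens one.
            Token::HardSep => {
                if let Some(p) = phrase.take() {
                    if !p.is_empty() {
                        out.push(p);
                    }
                    phrase = Some(Vec::new());
                }
            }
            // A quote either closes the open phrase or opens a new one.
            Token::Quote => {
                phrase = match phrase.take() {
                    Some(p) => {
                        if !p.is_empty() {
                            out.push(p);
                        }
                        None
                    }
                    None => Some(Vec::new()),
                };
            }
            Token::SoftSep => {}
        }
    }
    // An unclosed quote turns the rest of the query into a phrase.
    if let Some(p) = phrase {
        if !p.is_empty() {
            out.push(p);
        }
    }
    out
}

fn main() {
    use Token::*;
    // `"the. lion" roars` yields the phrases ["the"] and ["lion"].
    let tokens = vec![Quote, Word("the"), HardSep, Word("lion"), Quote, SoftSep, Word("roars")];
    assert_eq!(phrases(tokens.into_iter()), vec![vec!["the"], vec!["lion"]]);
}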
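Reviewer note on the synonym limits: `MAX_SYNONYM_PHRASE_COUNT` caps the number of phrases
via `take`, while `MAX_SYNONYM_WORD_COUNT` is a running word budget enforced inside
`filter_map`. A standalone sketch of the pattern, with the constants inlined and plain
`String`s instead of interned words:

const MAX_SYNONYM_PHRASE_COUNT: usize = 50;
const MAX_SYNONYM_WORD_COUNT: usize = 100;

/// Keep at most `MAX_SYNONYM_PHRASE_COUNT` phrases, and at most
/// `MAX_SYNONYM_WORD_COUNT` words summed over the phrases that are kept.
fn cap_synonyms(phrases: Vec<Vec<String>>) -> Vec<Vec<String>> {
    let mut synonym_word_count = 0;
    phrases
        .into_iter()
        .take(MAX_SYNONYM_PHRASE_COUNT)
        .filter_map(|words| {
            // A phrase that would overflow the word budget is dropped, but iteration
            // continues: a shorter phrase later in the list may still fit.
            if synonym_word_count + words.len() > MAX_SYNONYM_WORD_COUNT {
                return None;
            }
            synonym_word_count += words.len();
            Some(words)
        })
        .collect()
}

fn main() {
    let phrases = vec![
        vec!["new".to_string(); 99], // kept: budget at 99/100
        vec!["york".to_string(); 2], // dropped: would reach 101/100
        vec!["nyc".to_string()],     // kept: budget at 100/100
    ];
    assert_eq!(cap_synonyms(phrases).len(), 2);
}

This mirrors the patch's choice of `filter_map` over a short-circuiting scan: one very long
synonym phrase does not end synonym processing for the word.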