From c2b025946aa6dd64aebc1dfe8e9f591b0bd79290 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 30 Mar 2023 10:42:16 +0200 Subject: [PATCH] `located_query_terms_from_string`: use u16 for positions, hard limit number of iterated tokens. - Refactor phrase logic to reduce number of possible states --- milli/src/search/new/query_graph.rs | 2 +- milli/src/search/new/query_term.rs | 151 ++++++++++++++++++---------- milli/src/search/new/words.rs | 4 +- 3 files changed, 102 insertions(+), 55 deletions(-) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 0f06b9b95..acadb508c 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -267,7 +267,7 @@ impl QueryGraph { /// Remove all the nodes that correspond to a word starting at the given position, and connect /// the predecessors of these nodes to their successors. /// Return `true` if any node was removed. - pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool { + pub fn remove_words_starting_at_position(&mut self, position: u16) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; for (node_idx, node) in self.nodes.iter() { let QueryNodeData::Term(LocatedQueryTerm { value: _, positions }) = &node.data else { continue }; diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 0850b2181..049b96646 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -1,5 +1,4 @@ use std::collections::HashSet; -use std::mem; use std::ops::RangeInclusive; use charabia::normalizer::NormalizedTokenIter; @@ -414,8 +413,7 @@ impl QueryTerm { #[derive(Clone)] pub struct LocatedQueryTerm { pub value: Interned, - // TODO: consider changing to u8, or even a u16 - pub positions: RangeInclusive, + pub positions: RangeInclusive, } impl LocatedQueryTerm { @@ -425,9 +423,55 @@ impl LocatedQueryTerm { } } +struct PhraseBuilder { + words: Vec>>, + start: u16, + end: u16, +} + +impl PhraseBuilder { + fn empty() -> Self { + Self { words: Default::default(), start: u16::MAX, end: u16::MAX } + } + + fn is_empty(&self) -> bool { + self.words.is_empty() + } + + // precondition: token has kind Word or StopWord + fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) { + if self.is_empty() { + self.start = position; + } + self.end = position; + if let TokenKind::StopWord = token.kind { + self.words.push(None); + } else { + // token has kind Word + let word = ctx.word_interner.insert(token.lemma().to_string()); + // TODO: in a phrase, check that every word exists + // otherwise return an empty term + self.words.push(Some(word)); + } + } + + fn build(self, ctx: &mut SearchContext) -> Option { + if self.is_empty() { + return None; + } + Some(LocatedQueryTerm { + value: ctx.term_interner.insert(QueryTerm::phrase( + &mut ctx.word_interner, + &mut ctx.phrase_interner, + Phrase { words: self.words }, + )), + positions: self.start..=self.end, + }) + } +} + /// Convert the tokenised search query into a list of located query terms. // TODO: checking if the positions are correct for phrases, separators, ngrams -// hard-limit the number of tokens that are considered pub fn located_query_terms_from_string( ctx: &mut SearchContext, query: NormalizedTokenIter<&[u8]>, @@ -437,16 +481,17 @@ pub fn located_query_terms_from_string( let mut located_terms = Vec::new(); - let mut phrase = Vec::new(); - let mut quoted = false; + let mut phrase: Option = None; let parts_limit = words_limit.unwrap_or(usize::MAX); - let mut position = -1i8; - let mut phrase_start = -1i8; - let mut phrase_end = -1i8; + // start with the last position as we will wrap around to position 0 at the beginning of the loop below. + let mut position = u16::MAX; - let mut peekable = query.peekable(); + // TODO: Loic, find proper value here so we don't overflow the interner. + const MAX_TOKEN_COUNT: usize = 1_000; + + let mut peekable = query.take(MAX_TOKEN_COUNT).peekable(); while let Some(token) = peekable.next() { // early return if word limit is exceeded if located_terms.len() >= parts_limit { @@ -455,23 +500,14 @@ pub fn located_query_terms_from_string( match token.kind { TokenKind::Word | TokenKind::StopWord => { - position += 1; + // On first loop, goes from u16::MAX to 0, then normal increment. + position = position.wrapping_add(1); + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. - if quoted { - phrase_end = position; - if phrase.is_empty() { - phrase_start = position; - } - if let TokenKind::StopWord = token.kind { - phrase.push(None); - } else { - let word = ctx.word_interner.insert(token.lemma().to_string()); - // TODO: in a phrase, check that every word exists - // otherwise return an empty term - phrase.push(Some(word)); - } + if let Some(phrase) = &mut phrase { + phrase.push_word(ctx, &token, position) } else if peekable.peek().is_some() { match token.kind { TokenKind::Word => { @@ -505,41 +541,52 @@ pub fn located_query_terms_from_string( position += 0; } } - let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - // swap quoted state if we encounter a double quote - if quote_count % 2 != 0 { - quoted = !quoted; - } - // if there is a quote or a hard separator we close the phrase. - // TODO: limit phrase size? - if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) - { - let located_query_term = LocatedQueryTerm { - value: ctx.term_interner.insert(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: mem::take(&mut phrase) }, - )), - positions: phrase_start..=phrase_end, + + phrase = 'phrase: { + let phrase = phrase.take(); + + // If we have a hard separator inside a phrase, we immediately start a new phrase + let phrase = if separator_kind == SeparatorKind::Hard { + if let Some(phrase) = phrase { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + Some(PhraseBuilder::empty()) + } else { + None + } + } else { + phrase }; - located_terms.push(located_query_term); - } + + // We close and start a new phrase depending on the number of double quotes + let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + if quote_count == 0 { + break 'phrase phrase; + } + + // Consume the closing quote and the phrase + if let Some(phrase) = phrase { + // Per the check above, quote_count > 0 + quote_count -= 1; + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + } + + // Start new phrase if the token ends with an opening quote + (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) + }; } _ => (), } } // If a quote is never closed, we consider all of the end of the query as a phrase. - if !phrase.is_empty() { - let located_query_term = LocatedQueryTerm { - value: ctx.term_interner.insert(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: mem::take(&mut phrase) }, - )), - positions: phrase_start..=phrase_end, - }; - located_terms.push(located_query_term); + if let Some(phrase) = phrase.take() { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term); + } } Ok(located_terms) diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index fb2c62f11..ba8086c00 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -12,7 +12,7 @@ pub struct Words { exhausted: bool, // TODO: remove query_graph: Option, iterating: bool, // TODO: remove - positions_to_remove: Vec, + positions_to_remove: Vec, terms_matching_strategy: TermsMatchingStrategy, } impl Words { @@ -52,7 +52,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} } } - let mut r: Vec = all_positions.into_iter().collect(); + let mut r: Vec = all_positions.into_iter().collect(); // don't remove the first term r.remove(0); r