From cab2b6bcdac8291c5a0ab71ec5becec488ad98a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 6 Mar 2023 08:35:01 +0100 Subject: [PATCH] Fix: computation of initial universe, code organisation --- .../search/new/graph_based_ranking_rule.rs | 4 +- milli/src/search/new/logger/detailed.rs | 54 ++-- milli/src/search/new/logger/mod.rs | 23 +- milli/src/search/new/mod.rs | 124 +++++--- milli/src/search/new/query_graph.rs | 22 +- milli/src/search/new/query_term.rs | 275 ++++++++++-------- .../ranking_rule_graph/edge_docids_cache.rs | 3 +- .../src/search/new/ranking_rule_graph/mod.rs | 23 +- .../new/ranking_rule_graph/proximity/build.rs | 1 + milli/src/search/new/ranking_rules.rs | 75 ++--- milli/src/search/new/words.rs | 12 +- 11 files changed, 341 insertions(+), 275 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index e5a0fbad6..d1f5864aa 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -3,8 +3,8 @@ use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; use super::logger::SearchLogger; -use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache; -use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; +use super::ranking_rule_graph::EdgeDocidsCache; +use super::ranking_rule_graph::EmptyPathsCache; use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index c2415837d..4282db27f 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -5,13 +5,13 @@ use std::fs::File; use std::time::Instant; use std::{io::Write, path::PathBuf}; -use crate::new::ranking_rule_graph::typo::TypoGraph; +use crate::new::ranking_rule_graph::TypoGraph; use crate::new::{QueryNode, QueryGraph}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; +use crate::new::ranking_rule_graph::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::ranking_rule_graph::{ - proximity::ProximityGraph, RankingRuleGraph, + ProximityGraph, RankingRuleGraph, }; use super::{RankingRule, SearchLogger}; @@ -21,18 +21,18 @@ pub enum SearchEvents { ranking_rule_idx: usize, query: QueryGraph, universe: RoaringBitmap, - time: Instant, + time: Instant }, RankingRuleNextBucket { ranking_rule_idx: usize, universe: RoaringBitmap, candidates: RoaringBitmap, - time: Instant, + time: Instant }, RankingRuleEndIteration { ranking_rule_idx: usize, universe: RoaringBitmap, - time: Instant, + time: Instant }, ExtendResults { new: Vec, @@ -56,13 +56,14 @@ pub enum SearchEvents { distances: Vec>, cost: u64, }, - RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant, }, + RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant }, } pub struct DetailedSearchLogger { folder_path: PathBuf, initial_query: Option, initial_query_time: Option, + query_for_universe: Option, initial_universe: Option, ranking_rules_ids: Option>, events: Vec, @@ -73,6 +74,7 @@ impl DetailedSearchLogger { folder_path: PathBuf::new().join(folder_path), initial_query: None, initial_query_time: None, + query_for_universe: None, initial_universe: 
None, ranking_rules_ids: None, events: vec![], @@ -81,9 +83,13 @@ impl DetailedSearchLogger { } impl SearchLogger for DetailedSearchLogger { - fn initial_query(&mut self, query: &QueryGraph, time: Instant) { + fn initial_query(&mut self, query: &QueryGraph) { self.initial_query = Some(query.clone()); - self.initial_query_time = Some(time); + self.initial_query_time = Some(Instant::now()); + } + + fn query_for_universe(&mut self, query: &QueryGraph) { + self.query_for_universe = Some(query.clone()); } fn initial_universe(&mut self, universe: &RoaringBitmap) { @@ -99,13 +105,13 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, query: &QueryGraph, universe: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleStartIteration { ranking_rule_idx, query: query.clone(), universe: universe.clone(), - time, + time: Instant::now(), }) } @@ -115,13 +121,13 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, candidates: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe: universe.clone(), candidates: candidates.clone(), - time, + time: Instant::now(), }) } fn skip_bucket_ranking_rule<'transaction>( @@ -129,12 +135,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, candidates: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates: candidates.clone(), - time + time: Instant::now() }) } @@ -143,12 +149,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe: universe.clone(), - time + time: Instant::now() }) } fn add_to_results(&mut self, docids: &[u32]) { @@ -184,6 +190,20 @@ impl DetailedSearchLogger { let index_path = self.folder_path.join("index.d2"); let mut file = std::fs::File::create(index_path).unwrap(); + writeln!(&mut file, "direction: right").unwrap(); + writeln!(&mut file, "Initial Query Graph: {{").unwrap(); + let initial_query_graph = self.initial_query.as_ref().unwrap(); + Self::query_graph_d2_description(initial_query_graph, &mut file); + writeln!(&mut file, "}}").unwrap(); + + writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap(); + let query_graph_for_universe = self.query_for_universe.as_ref().unwrap(); + Self::query_graph_d2_description(query_graph_for_universe, &mut file); + writeln!(&mut file, "}}").unwrap(); + + let initial_universe = self.initial_universe.as_ref().unwrap(); + writeln!(&mut file, "Initial Universe Length {}", initial_universe.len()).unwrap(); + writeln!(&mut file, "Control Flow Between Ranking Rules: {{").unwrap(); writeln!(&mut file, "shape: sequence_diagram").unwrap(); for (idx, rr_id) in self.ranking_rules_ids.as_ref().unwrap().iter().enumerate() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 079bb892c..9a141c1c6 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -2,19 +2,17 @@ pub mod detailed; use roaring::RoaringBitmap; -use std::time::Instant; use super::{ - ranking_rule_graph::{ - empty_paths_cache::EmptyPathsCache, proximity::ProximityGraph, typo::TypoGraph, - 
RankingRuleGraph, - }, + ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}, RankingRule, RankingRuleQueryTrait, }; pub struct DefaultSearchLogger; impl SearchLogger for DefaultSearchLogger { - fn initial_query(&mut self, _query: &Q, _time: Instant) {} + fn initial_query(&mut self, _query: &Q) {} + + fn query_for_universe(&mut self, _query: &Q) {} fn initial_universe(&mut self, _universe: &RoaringBitmap) {} @@ -26,7 +24,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, Q>, _query: &Q, _universe: &RoaringBitmap, - _time: Instant, ) { } @@ -36,7 +33,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, _candidates: &RoaringBitmap, - _time: Instant, ) { } fn skip_bucket_ranking_rule<'transaction>( @@ -44,7 +40,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _candidates: &RoaringBitmap, - _time: Instant, ) { } @@ -53,7 +48,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, - _time: Instant, ) { } @@ -85,7 +79,10 @@ impl SearchLogger for DefaultSearchLogger { } pub trait SearchLogger { - fn initial_query(&mut self, query: &Q, time: Instant); + fn initial_query(&mut self, query: &Q); + + fn query_for_universe(&mut self, query: &Q); + fn initial_universe(&mut self, universe: &RoaringBitmap); fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]); @@ -96,7 +93,6 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, query: &Q, universe: &RoaringBitmap, - time: Instant, ); fn next_bucket_ranking_rule<'transaction>( &mut self, @@ -104,21 +100,18 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, candidates: &RoaringBitmap, - time: Instant, ); fn skip_bucket_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, candidates: &RoaringBitmap, - time: Instant, ); fn end_iteration_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, - time: Instant, ); fn add_to_results(&mut self, docids: &[u32]); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 94120cd8a..3e9b43f1b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -9,55 +9,113 @@ mod resolve_query_graph; mod sort; mod words; -use charabia::Tokenize; -use heed::RoTxn; +use std::collections::BTreeSet; -use query_graph::{QueryGraph, QueryNode}; pub use ranking_rules::{ - execute_search, RankingRule, RankingRuleOutput, RankingRuleOutputIter, + apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper, RankingRuleQueryTrait, }; + +use crate::{ + new::query_term::located_query_terms_from_string, Filter, Index, Result, TermsMatchingStrategy, +}; +use charabia::Tokenize; +use db_cache::DatabaseCache; +use heed::RoTxn; +use query_graph::{QueryGraph, QueryNode}; use roaring::RoaringBitmap; -use self::db_cache::DatabaseCache; -use self::query_term::{word_derivations, LocatedQueryTerm}; -use crate::{Index, Result}; +use self::{ + logger::SearchLogger, + resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}, +}; pub enum BitmapOrAllRef<'s> { Bitmap(&'s RoaringBitmap), All, } -pub fn make_query_graph<'transaction>( 
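// NOTE: the removed `make_query_graph` is replaced by two steps: `execute_search`
// builds the graph, and the function below resolves a "maximally reduced" copy of
// it to obtain the initial universe. With `TermsMatchingStrategy::Last`, every
// term position except the first is removed before resolving, so the universe is
// exactly the set of documents that can still match once the `words` rule has
// dropped every optional term (for "the quick fox", only "the" is kept). With
// `TermsMatchingStrategy::All`, nothing is removed and the universe is the set of
// documents matching the full graph.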
+#[allow(clippy::too_many_arguments)] +pub fn resolve_maximally_reduced_query_graph<'transaction>( index: &Index, - txn: &RoTxn, + txn: &'transaction heed::RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + node_docids_cache: &mut NodeDocIdsCache, + matching_strategy: TermsMatchingStrategy, + logger: &mut dyn SearchLogger, +) -> Result { + let mut graph = query_graph.clone(); + let mut positions_to_remove = match matching_strategy { + TermsMatchingStrategy::Last => { + let mut all_positions = BTreeSet::new(); + for n in query_graph.nodes.iter() { + match n { + QueryNode::Term(term) => { + all_positions.extend(term.positions.clone().into_iter()); + } + QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} + } + } + all_positions.into_iter().collect() + } + TermsMatchingStrategy::All => vec![], + }; + // don't remove the first term + positions_to_remove.remove(0); + loop { + if positions_to_remove.is_empty() { + break; + } else { + let position_to_remove = positions_to_remove.pop().unwrap(); + let _ = graph.remove_words_at_position(position_to_remove); + } + } + logger.query_for_universe(&graph); + let docids = resolve_query_graph(index, txn, db_cache, node_docids_cache, &graph, universe)?; + + Ok(docids) +} + +#[allow(clippy::too_many_arguments)] +pub fn execute_search<'transaction>( + index: &Index, + txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, query: &str, -) -> Result { + filters: Option, + from: usize, + length: usize, + logger: &mut dyn SearchLogger, +) -> Result> { assert!(!query.is_empty()); - let authorize_typos = index.authorize_typos(txn)?; - let min_len_one_typo = index.min_word_len_one_typo(txn)?; - let min_len_two_typos = index.min_word_len_two_typos(txn)?; + let query_terms = located_query_terms_from_string(index, txn, query.tokenize(), None).unwrap(); + let graph = QueryGraph::from_query(index, txn, db_cache, query_terms)?; - let exact_words = index.exact_words(txn)?; - let fst = index.words_fst(txn)?; + logger.initial_query(&graph); - // TODO: get rid of this closure - // also, ngrams can have one typo? - let query = LocatedQueryTerm::from_query(query.tokenize(), None, move |word, is_prefix| { - let typos = if !authorize_typos - || word.len() < min_len_one_typo as usize - || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) - { - 0 - } else if word.len() < min_len_two_typos as usize { - 1 - } else { - 2 - }; - word_derivations(index, txn, word, typos, is_prefix, &fst) - }) - .unwrap(); - let graph = QueryGraph::from_query(index, txn, db_cache, query)?; - Ok(graph) + let universe = if let Some(filters) = filters { + filters.evaluate(txn, index)? + } else { + index.documents_ids(txn)? 
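// NOTE: the universe is now computed in two stages: the coarse set above
// (filter results, or all documents), then, just below, its intersection with
// the documents resolved from the maximally reduced query graph. Every ranking
// rule therefore starts from a universe that the `words` rule can only shrink,
// which is the "computation of initial universe" fix from the commit title.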
+ }; + + let mut node_docids_cache = NodeDocIdsCache::default(); + + let universe = resolve_maximally_reduced_query_graph( + index, + txn, + db_cache, + &universe, + &graph, + &mut node_docids_cache, + TermsMatchingStrategy::Last, + logger, + )?; + // TODO: create ranking rules here, reuse the node docids cache for the words ranking rule + + logger.initial_universe(&universe); + + apply_ranking_rules(index, txn, db_cache, &graph, &universe, from, length, logger) } diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 8178f8ded..e86c175af 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -4,7 +4,7 @@ use heed::RoTxn; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; -use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::{Index, Result}; #[derive(Debug, Clone)] @@ -31,6 +31,7 @@ pub struct QueryGraph { } fn _assert_sizes() { + // TODO: QueryNodes are too big now, 184B is an unreasonable size let _: [u8; 184] = [0; std::mem::size_of::()]; let _: [u8; 48] = [0; std::mem::size_of::()]; } @@ -75,7 +76,7 @@ impl QueryGraph { index: &Index, txn: &RoTxn, _db_cache: &mut DatabaseCache<'transaction>, - query: Vec, + terms: Vec, ) -> Result { // TODO: maybe empty nodes should not be removed here, to compute // the score of the `words` ranking rule correctly @@ -90,8 +91,8 @@ impl QueryGraph { (vec![], vec![], vec![graph.root_node]); // TODO: split words / synonyms - for length in 1..=query.len() { - let query = &query[..length]; + for length in 1..=terms.len() { + let query = &terms[..length]; let term0 = query.last().unwrap(); @@ -104,7 +105,7 @@ impl QueryGraph { if !prev1.is_empty() { if let Some((ngram2_str, ngram2_pos)) = - LocatedQueryTerm::ngram2(&query[length - 2], &query[length - 1]) + query_term::ngram2(&query[length - 2], &query[length - 1]) { if word_set.contains(ngram2_str.as_bytes()) { let ngram2 = LocatedQueryTerm { @@ -128,11 +129,9 @@ impl QueryGraph { } } if !prev2.is_empty() { - if let Some((ngram3_str, ngram3_pos)) = LocatedQueryTerm::ngram3( - &query[length - 3], - &query[length - 2], - &query[length - 1], - ) { + if let Some((ngram3_str, ngram3_pos)) = + query_term::ngram3(&query[length - 3], &query[length - 2], &query[length - 1]) + { if word_set.contains(ngram3_str.as_bytes()) { let ngram3 = LocatedQueryTerm { value: QueryTerm::Word { @@ -143,8 +142,9 @@ impl QueryGraph { one_typo: vec![], two_typos: vec![], use_prefix_db: false, - synonyms: vec![], // TODO: ngram synonyms + synonyms: vec![], // TODO: ngram synonyms split_words: None, // TODO: maybe ngram split words? + // would be nice for typos like su nflower }, }, positions: ngram3_pos, diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 9ea72aa3a..3820b8ed0 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -178,9 +178,15 @@ fn split_best_frequency( #[derive(Debug, Clone)] pub enum QueryTerm { + // TODO: should there be SplitWord, NGram2, and NGram3 variants? + // NGram2 can have 1 typo and synonyms + // NGram3 cannot have typos but can have synonyms + // SplitWords are a phrase + // Can NGrams be prefixes? Phrase { phrase: Phrase }, Word { derivations: WordDerivations }, } + impl QueryTerm { pub fn original_single_word(&self) -> Option<&str> { match self { @@ -209,53 +215,77 @@ impl LocatedQueryTerm { QueryTerm::Word { derivations, .. 
} => derivations.is_empty(), } } - /// Create primitive query from tokenized query string, - /// the primitive query is an intermediate state to build the query tree. - pub fn from_query( - query: NormalizedTokenIter>, - words_limit: Option, - // TODO:` use index + txn + ? instead of closure - derivations: impl Fn(&str, bool) -> Result, - ) -> Result> { - let mut primitive_query = Vec::new(); - let mut phrase = Vec::new(); +} - let mut quoted = false; +pub fn located_query_terms_from_string<'transaction>( + index: &Index, + txn: &'transaction RoTxn, + query: NormalizedTokenIter>, + words_limit: Option, +) -> Result> { + let authorize_typos = index.authorize_typos(txn)?; + let min_len_one_typo = index.min_word_len_one_typo(txn)?; + let min_len_two_typos = index.min_word_len_two_typos(txn)?; - let parts_limit = words_limit.unwrap_or(usize::MAX); + let exact_words = index.exact_words(txn)?; + let fst = index.words_fst(txn)?; - let mut position = -1i8; - let mut phrase_start = -1i8; - let mut phrase_end = -1i8; + let nbr_typos = |word: &str| { + if !authorize_typos + || word.len() < min_len_one_typo as usize + || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) + { + 0 + } else if word.len() < min_len_two_typos as usize { + 1 + } else { + 2 + } + }; - let mut peekable = query.peekable(); - while let Some(token) = peekable.next() { - // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { - return Ok(primitive_query); - } + let derivations = |word: &str, is_prefix: bool| { + word_derivations(index, txn, word, nbr_typos(word), is_prefix, &fst) + }; - match token.kind { - TokenKind::Word | TokenKind::StopWord => { - position += 1; - // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, - // 3. if the word is the last token of the query we push it as a prefix word. - if quoted { - phrase_end = position; - if phrase.is_empty() { - phrase_start = position; - } - if let TokenKind::StopWord = token.kind { - phrase.push(None); - } else { - // TODO: in a phrase, check that every word exists - // otherwise return WordDerivations::Empty - phrase.push(Some(token.lemma().to_string())); - } - } else if peekable.peek().is_some() { - if let TokenKind::StopWord = token.kind { - } else { + let mut primitive_query = Vec::new(); + let mut phrase = Vec::new(); + + let mut quoted = false; + + let parts_limit = words_limit.unwrap_or(usize::MAX); + + let mut position = -1i8; + let mut phrase_start = -1i8; + let mut phrase_end = -1i8; + + let mut peekable = query.peekable(); + while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if primitive_query.len() >= parts_limit { + return Ok(primitive_query); + } + + match token.kind { + TokenKind::Word | TokenKind::StopWord => { + position += 1; + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, + // 3. if the word is the last token of the query we push it as a prefix word. 
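// Worked example for the three cases above: in the query `summer "hot day" ic`,
// `summer` is case 2 (a non-final word, pushed as a non-prefix term), `hot` and
// `day` are case 1 (buffered into the phrase until the closing quote), and `ic`
// is case 3 (the last token, pushed as a prefix so it can still match "ice").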
+ if quoted { + phrase_end = position; + if phrase.is_empty() { + phrase_start = position; + } + if let TokenKind::StopWord = token.kind { + phrase.push(None); + } else { + // TODO: in a phrase, check that every word exists + // otherwise return WordDerivations::Empty + phrase.push(Some(token.lemma().to_string())); + } + } else if peekable.peek().is_some() { + match token.kind { + TokenKind::Word => { let derivations = derivations(token.lemma(), false)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, @@ -263,100 +293,91 @@ impl LocatedQueryTerm { }; primitive_query.push(located_term); } - } else { - let derivations = derivations(token.lemma(), true)?; - let located_term = LocatedQueryTerm { - value: QueryTerm::Word { derivations }, - positions: position..=position, - }; - primitive_query.push(located_term); + TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} + } + } else { + let derivations = derivations(token.lemma(), true)?; + let located_term = LocatedQueryTerm { + value: QueryTerm::Word { derivations }, + positions: position..=position, + }; + primitive_query.push(located_term); + } + } + TokenKind::Separator(separator_kind) => { + match separator_kind { + SeparatorKind::Hard => { + position += 1; + } + SeparatorKind::Soft => { + position += 0; } } - TokenKind::Separator(separator_kind) => { - match separator_kind { - SeparatorKind::Hard => { - position += 1; - } - SeparatorKind::Soft => { - position += 0; - } - } - let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - // swap quoted state if we encounter a double quote - if quote_count % 2 != 0 { - quoted = !quoted; - } - // if there is a quote or a hard separator we close the phrase. - if !phrase.is_empty() - && (quote_count > 0 || separator_kind == SeparatorKind::Hard) - { - let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { - phrase: Phrase { words: mem::take(&mut phrase) }, - }, - positions: phrase_start..=phrase_end, - }; - primitive_query.push(located_query_term); - } + let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + // swap quoted state if we encounter a double quote + if quote_count % 2 != 0 { + quoted = !quoted; + } + // if there is a quote or a hard separator we close the phrase. + if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) + { + let located_query_term = LocatedQueryTerm { + value: QueryTerm::Phrase { + phrase: Phrase { words: mem::take(&mut phrase) }, + }, + positions: phrase_start..=phrase_end, + }; + primitive_query.push(located_query_term); } - _ => (), } + _ => (), } - - // If a quote is never closed, we consider all of the end of the query as a phrase. - if !phrase.is_empty() { - let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, - positions: phrase_start..=phrase_end, - }; - primitive_query.push(located_query_term); - } - - Ok(primitive_query) } + + // If a quote is never closed, we consider all of the end of the query as a phrase. 
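// e.g. in `red "blue sky` the quote is never closed, so `blue sky` is flushed
// here as a phrase spanning its recorded start and end positions.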
+ if !phrase.is_empty() { + let located_query_term = LocatedQueryTerm { + value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, + positions: phrase_start..=phrase_end, + }; + primitive_query.push(located_query_term); + } + + Ok(primitive_query) } -impl LocatedQueryTerm { - pub fn ngram2( - x: &LocatedQueryTerm, - y: &LocatedQueryTerm, - ) -> Option<(String, RangeInclusive)> { - if *x.positions.end() != y.positions.start() - 1 { - println!( - "x positions end: {}, y positions start: {}", - *x.positions.end(), - y.positions.start() - ); - return None; - } - match (&x.value.original_single_word(), &y.value.original_single_word()) { - (Some(w1), Some(w2)) => { - let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); - Some(term) - } - _ => None, - } +// TODO: return a word derivations instead? +pub fn ngram2(x: &LocatedQueryTerm, y: &LocatedQueryTerm) -> Option<(String, RangeInclusive)> { + if *x.positions.end() != y.positions.start() - 1 { + return None; } - pub fn ngram3( - x: &LocatedQueryTerm, - y: &LocatedQueryTerm, - z: &LocatedQueryTerm, - ) -> Option<(String, RangeInclusive)> { - if *x.positions.end() != y.positions.start() - 1 - || *y.positions.end() != z.positions.start() - 1 - { - return None; - } - match ( - &x.value.original_single_word(), - &y.value.original_single_word(), - &z.value.original_single_word(), - ) { - (Some(w1), Some(w2), Some(w3)) => { - let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); - Some(term) - } - _ => None, + match (&x.value.original_single_word(), &y.value.original_single_word()) { + (Some(w1), Some(w2)) => { + let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); + Some(term) } + _ => None, + } +} +pub fn ngram3( + x: &LocatedQueryTerm, + y: &LocatedQueryTerm, + z: &LocatedQueryTerm, +) -> Option<(String, RangeInclusive)> { + if *x.positions.end() != y.positions.start() - 1 + || *y.positions.end() != z.positions.start() - 1 + { + return None; + } + match ( + &x.value.original_single_word(), + &y.value.original_single_word(), + &z.value.original_single_word(), + ) { + (Some(w1), Some(w2), Some(w3)) => { + let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); + Some(term) + } + _ => None, } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index ef2eba895..3d48fd69c 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -41,11 +41,12 @@ impl EdgeDocidsCache { EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All), EdgeDetails::Data(details) => { if self.cache.contains_key(&edge_index) { + // TODO: should we update the bitmap in the cache if the new universe + // reduces it? 
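// NOTE: the cached bitmap was intersected with whatever universe was current
// when this edge was first computed, so a later call with a smaller universe
// can receive docids outside of it; until the TODO above is settled, callers
// should re-intersect the result with their own universe.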
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here let docids = universe & G::compute_docids(index, txn, db_cache, details)?; - let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index ac5e1f46b..e65d5f70b 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -1,19 +1,22 @@ -pub mod build; -pub mod cheapest_paths; -pub mod edge_docids_cache; -pub mod empty_paths_cache; -pub mod paths_map; -pub mod proximity; -pub mod resolve_paths; -pub mod typo; +mod build; +mod cheapest_paths; +mod edge_docids_cache; +mod empty_paths_cache; +mod paths_map; +mod proximity; +mod resolve_paths; +mod typo; + +pub use edge_docids_cache::EdgeDocidsCache; +pub use empty_paths_cache::EmptyPathsCache; +pub use proximity::ProximityGraph; +pub use typo::TypoGraph; use std::ops::ControlFlow; use heed::RoTxn; use roaring::RoaringBitmap; -use self::empty_paths_cache::EmptyPathsCache; - use super::db_cache::DatabaseCache; use super::logger::SearchLogger; use super::{QueryGraph, QueryNode}; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index fbe3c8169..06c860d7e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -105,6 +105,7 @@ pub fn visit_to_node<'transaction, 'from_data>( assert!(!updb1); let derivations1 = derivations1.all_derivations_except_prefix_db(); + // TODO: eventually, we want to get rid of the uses from `orginal` let original_word_2 = derivations2.original.clone(); let mut cost_proximity_word_pairs = BTreeMap::>>::new(); diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index bc0523b31..9fa840ad5 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -1,5 +1,3 @@ -use std::time::Instant; - use heed::RoTxn; use roaring::RoaringBitmap; @@ -8,11 +6,11 @@ use super::logger::SearchLogger; use super::QueryGraph; use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; -use crate::new::ranking_rule_graph::proximity::ProximityGraph; -use crate::new::ranking_rule_graph::typo::TypoGraph; +use crate::new::ranking_rule_graph::ProximityGraph; +use crate::new::ranking_rule_graph::TypoGraph; use crate::new::words::Words; // use crate::search::new::sort::Sort; -use crate::{Filter, Index, Result, TermsMatchingStrategy}; +use crate::{Index, Result, TermsMatchingStrategy}; pub trait RankingRuleOutputIter<'transaction, Query> { fn next_bucket(&mut self) -> Result>>; @@ -100,18 +98,18 @@ pub struct RankingRuleOutput { // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily #[allow(clippy::too_many_arguments)] -pub fn execute_search<'transaction>( +pub fn apply_ranking_rules<'transaction>( index: &Index, txn: &'transaction heed::RoTxn, // TODO: ranking rules parameter db_cache: &mut DatabaseCache<'transaction>, query_graph: &QueryGraph, - filters: Option, + universe: &RoaringBitmap, from: usize, length: usize, logger: &mut dyn SearchLogger, ) -> Result> { - logger.initial_query(query_graph, Instant::now()); + logger.initial_query(query_graph); let words = &mut Words::new(TermsMatchingStrategy::Last); // let sort = &mut 
Sort::new(index, txn, "release_date".to_owned(), true)?; let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); @@ -122,25 +120,13 @@ pub fn execute_search<'transaction>( logger.ranking_rules(&ranking_rules); - let universe = if let Some(filters) = filters { - filters.evaluate(txn, index)? - } else { - index.documents_ids(txn)? - }; - if universe.len() < from as u64 { return Ok(vec![]); } let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule( - 0, - ranking_rules[0], - query_graph, - &universe, - Instant::now(), - ); - ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?; + logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); + ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?; let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; candidates[0] = universe.clone(); @@ -154,7 +140,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates[cur_ranking_rule_index], - Instant::now(), ); candidates[cur_ranking_rule_index].clear(); ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger); @@ -183,7 +168,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates, - Instant::now(), ); } else { let all_candidates = candidates.iter().collect::>(); @@ -193,7 +177,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &skipped_candidates.into_iter().collect(), - Instant::now(), ); let candidates = candidates .iter() @@ -234,7 +217,6 @@ pub fn execute_search<'transaction>( ranking_rules[cur_ranking_rule_index], &candidates[cur_ranking_rule_index], &next_bucket.candidates, - Instant::now(), ); assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); @@ -255,7 +237,6 @@ pub fn execute_search<'transaction>( ranking_rules[cur_ranking_rule_index], &next_bucket.query, &candidates[cur_ranking_rule_index], - Instant::now(), ); ranking_rules[cur_ranking_rule_index].start_iteration( index, @@ -272,11 +253,11 @@ pub fn execute_search<'transaction>( #[cfg(test)] mod tests { - use super::execute_search; // use crate::allocator::ALLOC; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::new::db_cache::DatabaseCache; + use crate::new::execute_search; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; @@ -284,8 +265,7 @@ mod tests { use std::io::{BufRead, BufReader, Cursor, Seek}; use std::time::Instant; // use crate::new::logger::detailed::DetailedSearchLogger; - use crate::new::logger::{DefaultSearchLogger, SearchLogger}; - use crate::new::make_query_graph; + use crate::new::logger::DefaultSearchLogger; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; @@ -321,17 +301,20 @@ mod tests { ])) .unwrap(); let txn = index.read_txn().unwrap(); - let mut logger = DefaultSearchLogger; let mut db_cache = DatabaseCache::default(); - let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") - .unwrap(); - logger.initial_query(&query_graph, Instant::now()); + let results = execute_search( + &index, + &txn, + &mut db_cache, + "releases from poison by the government", + None, + 0, + 50, + &mut DefaultSearchLogger, + ) + .unwrap(); - 
let results = - execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger) - .unwrap(); println!("{results:?}") } @@ -352,21 +335,13 @@ mod tests { let mut db_cache = DatabaseCache::default(); - let query_graph = make_query_graph( - &index, - &txn, - &mut db_cache, - "which a the releases from poison by the government", - ) - .unwrap(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, &txn, &mut db_cache, - &query_graph, + "which a the releases from poison by the government", None, 0, 20, @@ -453,17 +428,13 @@ mod tests { let mut db_cache = DatabaseCache::default(); - let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") - .unwrap(); - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, &txn, &mut db_cache, - &query_graph, + "releases from poison by the government", None, 0, 20, diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index da4599ec5..10c0800ba 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -43,12 +43,9 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { _parent_candidates: &RoaringBitmap, parent_query_graph: &QueryGraph, ) -> Result<()> { - // println!("Words: start iteration"); self.exhausted = false; self.query_graph = Some(parent_query_graph.clone()); - // TODO: a phrase can contain many positions, but represents a single node. - // That's a problem. let positions_to_remove = match self.terms_matching_strategy { TermsMatchingStrategy::Last => { let mut all_positions = BTreeSet::new(); @@ -60,11 +57,13 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} } } - all_positions.into_iter().collect() + let mut r: Vec = all_positions.into_iter().collect(); + // don't remove the first term + r.remove(0); + r } TermsMatchingStrategy::All => vec![], }; - // println!("positions to remove: {positions_to_remove:?}"); self.positions_to_remove = positions_to_remove; self.iterating = true; Ok(()) @@ -78,7 +77,6 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { - // println!("Words: next bucket"); assert!(self.iterating); assert!(universe.len() > 1); @@ -122,9 +120,9 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { _db_cache: &mut DatabaseCache<'transaction>, _logger: &mut dyn SearchLogger, ) { - // println!("Words: end iteration"); self.iterating = false; self.exhausted = true; self.positions_to_remove = vec![]; + self.query_graph = None; } }
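Notes for reviewers. The sketches below are editorial illustrations of the new code paths, not part of the patch.

Caller-facing change: `make_query_graph` is gone, and `execute_search` in milli/src/search/new/mod.rs now takes the raw query string, an optional `Filter`, and the pagination window directly, building the query graph and the initial universe internally. A minimal call shape, using the crate-internal paths as they appear in the patch; the generics of the return type are stripped in the diff, so a `Vec<u32>` of document ids is assumed here:

    use crate::new::db_cache::DatabaseCache;
    use crate::new::execute_search;
    use crate::new::logger::DefaultSearchLogger;
    use crate::{Index, Result};

    // Sketch: top 20 unfiltered results for a query. `execute_search` asserts
    // the query is non-empty, so callers should guard against "" themselves.
    fn top_twenty(index: &Index, txn: &heed::RoTxn) -> Result<Vec<u32>> {
        let mut db_cache = DatabaseCache::default();
        execute_search(index, txn, &mut db_cache, "releases from poison", None, 0, 20, &mut DefaultSearchLogger)
    }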
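On the logger API: every `SearchLogger` callback loses its `time: Instant` parameter, and `DetailedSearchLogger` now stamps events itself with `Instant::now()` as it records them (it also gains the `query_for_universe` hook used by the new universe computation). The pattern, reduced to essentials:

    use std::time::Instant;

    // Sketch: the logger owns timestamping, so call sites carry no timing noise.
    trait Logger {
        fn next_bucket(&mut self, ranking_rule_idx: usize);
    }

    struct Detailed {
        events: Vec<(usize, Instant)>,
    }

    impl Logger for Detailed {
        fn next_bucket(&mut self, ranking_rule_idx: usize) {
            // The timestamp is captured at the recording site, not passed in.
            self.events.push((ranking_rule_idx, Instant::now()));
        }
    }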
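The renamed `apply_ranking_rules` (formerly `execute_search` in ranking_rules.rs) keeps the same bucket-sort control flow, which is easier to see with logging, pagination, and the skip-bucket fast path stripped out. A condensed sketch over a simplified trait; the real one threads index, txn, db_cache, and logger, and `next_bucket` returns a `RankingRuleOutput` carrying a refined query as well as candidates:

    use roaring::RoaringBitmap;

    trait Rule {
        fn start_iteration(&mut self, universe: &RoaringBitmap);
        /// The next bucket of candidates, or None when this rule is exhausted.
        fn next_bucket(&mut self, universe: &RoaringBitmap) -> Option<RoaringBitmap>;
        fn end_iteration(&mut self);
    }

    fn bucket_sort(rules: &mut [Box<dyn Rule>], universe: &RoaringBitmap, length: usize) -> Vec<u32> {
        let mut results = Vec::new();
        let mut cur = 0;
        // candidates[i] is the set that rule i is currently refining.
        let mut candidates = vec![RoaringBitmap::default(); rules.len()];
        candidates[0] = universe.clone();
        rules[0].start_iteration(universe);
        while results.len() < length {
            let Some(bucket) = rules[cur].next_bucket(&candidates[cur]) else {
                // Exhausted: close this rule and back off to its parent.
                rules[cur].end_iteration();
                if cur == 0 {
                    break;
                }
                cur -= 1;
                continue;
            };
            if cur == rules.len() - 1 {
                // The last rule's buckets go straight into the results.
                results.extend(bucket.iter());
            } else {
                // Descend: the child rule refines this bucket further.
                cur += 1;
                candidates[cur] = bucket.clone();
                rules[cur].start_iteration(&bucket);
            }
        }
        results
    }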
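Finally, query_graph.rs keeps its `_assert_sizes` guard and gains a TODO noting that `QueryNode` has grown to an unreasonable 184 bytes. The idiom pins a type's size at compile time; a sketch with a hypothetical type standing in for `QueryNode`:

    // Fails to compile if the size drifts: the annotated array type on the left
    // must match `size_of::<PinnedNode>()` used as the array length on the right.
    struct PinnedNode {
        data: [u64; 23], // 23 * 8 = 184 bytes, mirroring QueryNode's current size
    }

    fn _assert_sizes() {
        let _: [u8; 184] = [0; std::mem::size_of::<PinnedNode>()];
    }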