From 85d96d35a857a45ce70f45d5996b5b30fc3ff1c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 3 May 2023 13:39:19 +0200 Subject: [PATCH 1/6] Highlight ngram matches as well --- milli/src/search/new/mod.rs | 4 ++-- milli/src/search/new/query_graph.rs | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 7e8426bf9..cbc085b12 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -397,8 +397,8 @@ pub fn execute_search( None }; let bucket_sort_output = if let Some(query_terms) = query_terms { - let graph = QueryGraph::from_query(ctx, &query_terms)?; - located_query_terms = Some(query_terms); + let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?; + located_query_terms = Some(new_located_query_terms); let ranking_rules = get_ranking_rules_for_query_graph_search( ctx, diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 0e7d5a7f3..dc25d1bc3 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -88,12 +88,15 @@ pub struct QueryGraph { } impl QueryGraph { - /// Build the query graph from the parsed user search query. + /// Build the query graph from the parsed user search query, return an updated list of the located query terms + /// which contains ngrams. pub fn from_query( ctx: &mut SearchContext, // NOTE: the terms here must be consecutive terms: &[LocatedQueryTerm], - ) -> Result { + ) -> Result<(QueryGraph, Vec)> { + let mut new_located_query_terms = terms.to_vec(); + let nbr_typos = number_of_typos_allowed(ctx)?; let mut nodes_data: Vec = vec![QueryNodeData::Start, QueryNodeData::End]; @@ -107,10 +110,11 @@ impl QueryGraph { let original_terms_len = terms.len(); for term_idx in 0..original_terms_len { let mut new_nodes = vec![]; + let new_node_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)), + term_subset: QueryTermSubset::full(terms[term_idx].value), positions: terms[term_idx].positions.clone(), term_ids: term_idx as u8..=term_idx as u8, }), @@ -121,6 +125,7 @@ impl QueryGraph { if let Some(ngram) = query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)? { + new_located_query_terms.push(ngram.clone()); let ngram_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { @@ -136,6 +141,7 @@ impl QueryGraph { if let Some(ngram) = query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)? { + new_located_query_terms.push(ngram.clone()); let ngram_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { @@ -167,7 +173,7 @@ impl QueryGraph { let mut graph = QueryGraph { root_node, end_node, nodes }; graph.build_initial_edges(); - Ok(graph) + Ok((graph, new_located_query_terms)) } /// Remove the given nodes, connecting all their predecessors to all their successors. From a37da36766101d10688e9185e9c822a7e6b77ac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 8 May 2023 11:52:43 +0200 Subject: [PATCH 2/6] Implement `words` as a graph-based ranking rule and fix some bugs --- .../search/new/graph_based_ranking_rule.rs | 8 +- milli/src/search/new/logger/visual.rs | 24 ++-- milli/src/search/new/mod.rs | 11 +- .../new/ranking_rule_graph/cheapest_paths.rs | 105 ++++++++++++------ .../src/search/new/ranking_rule_graph/mod.rs | 3 + .../new/ranking_rule_graph/words/mod.rs | 49 ++++++++ milli/src/search/new/words.rs | 87 --------------- 7 files changed, 148 insertions(+), 139 deletions(-) create mode 100644 milli/src/search/new/ranking_rule_graph/words/mod.rs delete mode 100644 milli/src/search/new/words.rs diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index d8f6836e7..dd25ddd4a 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -46,7 +46,7 @@ use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph, - RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, + RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -54,6 +54,12 @@ use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::ranking_rule_graph::PathVisitor; use crate::{Result, TermsMatchingStrategy}; +pub type Words = GraphBasedRankingRule; +impl GraphBasedRankingRule { + pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self { + Self::new_with_id("words".to_owned(), Some(terms_matching_strategy)) + } +} pub type Proximity = GraphBasedRankingRule; impl GraphBasedRankingRule { pub fn new(terms_matching_strategy: Option) -> Self { diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 1cbe007d3..f76782e63 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -4,7 +4,6 @@ use std::io::{BufWriter, Write}; use std::path::{Path, PathBuf}; use std::time::Instant; -// use rand::random; use roaring::RoaringBitmap; use crate::search::new::interner::Interned; @@ -13,6 +12,7 @@ use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::ranking_rule_graph::{ Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, + WordsCondition, WordsGraph, }; use crate::search::new::ranking_rules::BoxRankingRule; use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; @@ -24,11 +24,12 @@ pub enum SearchEvents { RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, ExtendResults { new: Vec }, - WordsGraph { query_graph: QueryGraph }, ProximityGraph { graph: RankingRuleGraph }, ProximityPaths { paths: Vec>> }, TypoGraph { graph: RankingRuleGraph }, TypoPaths { paths: Vec>> }, + WordsGraph { graph: RankingRuleGraph }, + WordsPaths { paths: Vec>> }, FidGraph { graph: RankingRuleGraph }, FidPaths { paths: Vec>> }, PositionGraph { graph: RankingRuleGraph }, @@ -139,8 +140,11 @@ impl SearchLogger for VisualSearchLogger { let Some(location) = self.location.last() else { return }; match location { Location::Words => { - if let Some(query_graph) = state.downcast_ref::() { - self.events.push(SearchEvents::WordsGraph { query_graph: query_graph.clone() }); + if let Some(graph) = state.downcast_ref::>() { + self.events.push(SearchEvents::WordsGraph { graph: graph.clone() }); + } + if let Some(paths) = state.downcast_ref::>>>() { + self.events.push(SearchEvents::WordsPaths { paths: paths.clone() }); } } Location::Typo => { @@ -329,7 +333,6 @@ impl<'ctx> DetailedLoggerFinish<'ctx> { SearchEvents::ExtendResults { new } => { self.write_extend_results(new)?; } - SearchEvents::WordsGraph { query_graph } => self.write_words_graph(query_graph)?, SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?, SearchEvents::ProximityPaths { paths } => { self.write_rr_graph_paths::(paths)?; @@ -338,6 +341,10 @@ impl<'ctx> DetailedLoggerFinish<'ctx> { SearchEvents::TypoPaths { paths } => { self.write_rr_graph_paths::(paths)?; } + SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?, + SearchEvents::WordsPaths { paths } => { + self.write_rr_graph_paths::(paths)?; + } SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?, SearchEvents::FidPaths { paths } => { self.write_rr_graph_paths::(paths)?; @@ -482,13 +489,6 @@ fill: \"#B6E2D3\" } Ok(()) } - fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> { - self.make_new_file_for_internal_state_if_needed()?; - - self.write_query_graph(&qg)?; - - Ok(()) - } fn write_rr_graph( &mut self, graph: &RankingRuleGraph, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index cbc085b12..a28f42f35 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -15,11 +15,7 @@ mod resolve_query_graph; mod small_bitmap; mod exact_attribute; -// TODO: documentation + comments -// implementation is currently an adaptation of the previous implementation to fit with the new model mod sort; -// TODO: documentation + comments -mod words; #[cfg(test)] mod tests; @@ -43,10 +39,10 @@ use ranking_rules::{ use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; use roaring::RoaringBitmap; use sort::Sort; -use words::Words; use self::geo_sort::GeoSort; pub use self::geo_sort::Strategy as GeoSortStrategy; +use self::graph_based_ranking_rule::Words; use self::interner::Interned; use crate::search::new::distinct::apply_distinct_rule; use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; @@ -202,6 +198,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut sorted_fields = HashSet::new(); let mut geo_sorted = false; + // Don't add the `words` ranking rule if the term matching strategy is `All` + if matches!(terms_matching_strategy, TermsMatchingStrategy::All) { + words = true; + } + let mut ranking_rules: Vec> = vec![]; let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; for rr in settings_ranking_rules { diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index c065cc706..30caf0017 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -205,18 +205,12 @@ impl VisitorState { impl RankingRuleGraph { pub fn find_all_costs_to_end(&self) -> MappedInterner> { let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]); - let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); - let mut node_stack = VecDeque::new(); - - *costs_to_end.get_mut(self.query_graph.end_node) = vec![0]; - - for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() { - node_stack.push_back(prev_node); - enqueued.insert(prev_node); - } - - while let Some(cur_node) = node_stack.pop_front() { + self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| { + if cur_node == self.query_graph.end_node { + *costs_to_end.get_mut(self.query_graph.end_node) = vec![0]; + return true; + } let mut self_costs = Vec::::new(); let cur_node_edges = &self.edges_of_node.get(cur_node); @@ -232,13 +226,8 @@ impl RankingRuleGraph { self_costs.dedup(); *costs_to_end.get_mut(cur_node) = self_costs; - for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { - if !enqueued.contains(prev_node) { - node_stack.push_back(prev_node); - enqueued.insert(prev_node); - } - } - } + true + }); costs_to_end } @@ -247,17 +236,9 @@ impl RankingRuleGraph { node_with_removed_outgoing_conditions: Interned, costs: &mut MappedInterner>, ) { - let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); - let mut node_stack = VecDeque::new(); - - enqueued.insert(node_with_removed_outgoing_conditions); - node_stack.push_back(node_with_removed_outgoing_conditions); - - 'main_loop: while let Some(cur_node) = node_stack.pop_front() { + self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| { let mut costs_to_remove = FxHashSet::default(); - for c in costs.get(cur_node) { - costs_to_remove.insert(*c); - } + costs_to_remove.extend(costs.get(cur_node).iter().copied()); let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { @@ -265,23 +246,79 @@ impl RankingRuleGraph { for cost in costs.get(edge.dest_node).iter() { costs_to_remove.remove(&(*cost + edge.cost as u64)); if costs_to_remove.is_empty() { - continue 'main_loop; + return false; } } } if costs_to_remove.is_empty() { - continue 'main_loop; + return false; } let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied()); for c in costs_to_remove { new_costs.remove(&c); } *costs.get_mut(cur_node) = new_costs.into_iter().collect(); + true + }); + } - for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { - if !enqueued.contains(prev_node) { - node_stack.push_back(prev_node); - enqueued.insert(prev_node); + /// Traverse the graph backwards from the given node such that every time + /// a node is visited, we are guaranteed that all its successors either: + /// 1. have already been visited; OR + /// 2. were not reachable from the given node + pub fn traverse_breadth_first_backward( + &self, + from: Interned, + mut visit: impl FnMut(Interned) -> bool, + ) { + let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes); + { + // go backward to get the set of all reachable nodes from the given node + // the nodes that are not reachable will be set as `visited` + let mut stack = VecDeque::new(); + let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes); + enqueued.insert(from); + stack.push_back(from); + while let Some(n) = stack.pop_front() { + if reachable.contains(n) { + continue; + } + reachable.insert(n); + for prev_node in self.query_graph.nodes.get(n).predecessors.iter() { + if !enqueued.contains(prev_node) && !reachable.contains(prev_node) { + stack.push_back(prev_node); + enqueued.insert(prev_node); + } + } + } + }; + let mut unreachable_or_visited = + SmallBitmap::for_interned_values_in(&self.query_graph.nodes); + for (n, _) in self.query_graph.nodes.iter() { + if !reachable.contains(n) { + unreachable_or_visited.insert(n); + } + } + + let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes); + let mut stack = VecDeque::new(); + + enqueued.insert(from); + stack.push_back(from); + + while let Some(cur_node) = stack.pop_front() { + if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) { + stack.push_back(cur_node); + continue; + } + unreachable_or_visited.insert(cur_node); + if visit(cur_node) { + for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { + if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) + { + stack.push_back(prev_node); + enqueued.insert(prev_node); + } } } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index f60c481de..8de455822 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -20,6 +20,8 @@ mod position; mod proximity; /// Implementation of the `typo` ranking rule mod typo; +/// Implementation of the `words` ranking rule +mod words; use std::collections::BTreeSet; use std::hash::Hash; @@ -33,6 +35,7 @@ pub use position::{PositionCondition, PositionGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; +pub use words::{WordsCondition, WordsGraph}; use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; use super::query_term::LocatedQueryTermSubset; diff --git a/milli/src/search/new/ranking_rule_graph/words/mod.rs b/milli/src/search/new/ranking_rule_graph/words/mod.rs new file mode 100644 index 000000000..0a0cc112b --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/words/mod.rs @@ -0,0 +1,49 @@ +use roaring::RoaringBitmap; + +use super::{ComputedCondition, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned}; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; +use crate::search::new::SearchContext; +use crate::Result; + +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct WordsCondition { + term: LocatedQueryTermSubset, +} + +pub enum WordsGraph {} + +impl RankingRuleGraphTrait for WordsGraph { + type Condition = WordsCondition; + + fn resolve_condition( + ctx: &mut SearchContext, + condition: &Self::Condition, + universe: &RoaringBitmap, + ) -> Result { + let WordsCondition { term, .. } = condition; + // maybe compute_query_term_subset_docids should accept a universe as argument + let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; + docids &= universe; + + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: term.clone(), + }) + } + + fn build_edges( + _ctx: &mut SearchContext, + conditions_interner: &mut DedupInterner, + _from: Option<&LocatedQueryTermSubset>, + to_term: &LocatedQueryTermSubset, + ) -> Result)>> { + Ok(vec![( + to_term.term_ids.len() as u32, + conditions_interner.insert(WordsCondition { term: to_term.clone() }), + )]) + } +} diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs deleted file mode 100644 index 72b7b5916..000000000 --- a/milli/src/search/new/words.rs +++ /dev/null @@ -1,87 +0,0 @@ -use roaring::RoaringBitmap; - -use super::logger::SearchLogger; -use super::query_graph::QueryNode; -use super::resolve_query_graph::compute_query_graph_docids; -use super::small_bitmap::SmallBitmap; -use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; -use crate::{Result, TermsMatchingStrategy}; - -pub struct Words { - exhausted: bool, // TODO: remove - query_graph: Option, - nodes_to_remove: Vec>, - terms_matching_strategy: TermsMatchingStrategy, -} -impl Words { - pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self { - Self { - exhausted: true, - query_graph: None, - nodes_to_remove: vec![], - terms_matching_strategy, - } - } -} - -impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { - fn id(&self) -> String { - "words".to_owned() - } - fn start_iteration( - &mut self, - ctx: &mut SearchContext<'ctx>, - _logger: &mut dyn SearchLogger, - _universe: &RoaringBitmap, - parent_query_graph: &QueryGraph, - ) -> Result<()> { - self.exhausted = false; - self.query_graph = Some(parent_query_graph.clone()); - self.nodes_to_remove = match self.terms_matching_strategy { - TermsMatchingStrategy::Last => { - let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last(ctx); - ns.reverse(); - ns - } - TermsMatchingStrategy::All => { - vec![] - } - }; - Ok(()) - } - - fn next_bucket( - &mut self, - ctx: &mut SearchContext<'ctx>, - logger: &mut dyn SearchLogger, - universe: &RoaringBitmap, - ) -> Result>> { - if self.exhausted { - return Ok(None); - } - let Some(query_graph) = &mut self.query_graph else { panic!() }; - logger.log_internal_state(query_graph); - - let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?; - - let child_query_graph = query_graph.clone(); - - if self.nodes_to_remove.is_empty() { - self.exhausted = true; - } else { - let nodes_to_remove = self.nodes_to_remove.pop().unwrap(); - query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::>()); - } - Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) - } - - fn end_iteration( - &mut self, - _ctx: &mut SearchContext<'ctx>, - _logger: &mut dyn SearchLogger, - ) { - self.exhausted = true; - self.nodes_to_remove = vec![]; - self.query_graph = None; - } -} From 57582688660f7bb0c40865a558933fa4f9a73a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 16 May 2023 16:22:23 +0200 Subject: [PATCH 3/6] Don't compute split_words for phrases --- milli/src/search/new/logger/visual.rs | 2 +- .../new/query_term/compute_derivations.rs | 37 ++++++++++++++----- milli/src/search/new/query_term/mod.rs | 17 +++++++-- .../src/search/new/query_term/parse_query.rs | 4 +- .../search/new/ranking_rule_graph/typo/mod.rs | 2 +- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index f76782e63..8df56da89 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -462,7 +462,7 @@ fill: \"#B6E2D3\" shape: class max_nbr_typo: {}", term_subset.description(ctx), - term_subset.max_nbr_typos(ctx) + term_subset.max_typo_cost(ctx) )?; for w in term_subset.all_single_words_except_prefix_db(ctx)? { diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index c26c4bc6b..d5dfbbcd0 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo { impl Interned { pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { let s = ctx.term_interner.get_mut(self); - if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() { + if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() { assert!(s.two_typo.is_uninit()); // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words self.initialize_one_typo_subterm(ctx)?; let s = ctx.term_interner.get_mut(self); assert!(s.one_typo.is_init()); s.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { + } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() { assert!(s.two_typo.is_uninit()); self.initialize_one_and_two_typo_subterm(ctx)?; let s = ctx.term_interner.get_mut(self); @@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word( original: ctx.word_interner.insert(word.to_owned()), ngram_words: None, is_prefix: false, - max_nbr_typos: 0, + max_levenshtein_distance: 0, zero_typo: <_>::default(), one_typo: Lazy::Init(<_>::default()), two_typo: Lazy::Init(<_>::default()), @@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word( Ok(QueryTerm { original: word_interned, ngram_words: None, - max_nbr_typos: max_typo, + max_levenshtein_distance: max_typo, is_prefix, zero_typo, one_typo: Lazy::Uninit, @@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result { fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut; + + let allows_split_words = self_mut.allows_split_words(); + let QueryTerm { + original, + is_prefix, + one_typo, + max_levenshtein_distance: max_nbr_typos, + .. + } = self_mut; + let original = *original; let is_prefix = *is_prefix; // let original_str = ctx.word_interner.get(*original).to_owned(); @@ -300,13 +309,17 @@ impl Interned { })?; } - let original_str = ctx.word_interner.get(original).to_owned(); - let split_words = find_split_words(ctx, original_str.as_str())?; + let split_words = if allows_split_words { + let original_str = ctx.word_interner.get(original).to_owned(); + find_split_words(ctx, original_str.as_str())? + } else { + None + }; let self_mut = ctx.term_interner.get_mut(self); // Only add the split words to the derivations if: - // 1. the term is not an ngram; OR + // 1. the term is neither an ngram nor a phrase; OR // 2. the term is an ngram, but the split words are different from the ngram's component words let split_words = if let Some((ngram_words, split_words)) = self_mut.ngram_words.as_ref().zip(split_words.as_ref()) @@ -328,7 +341,13 @@ impl Interned { } fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut; + let QueryTerm { + original, + is_prefix, + two_typo, + max_levenshtein_distance: max_nbr_typos, + .. + } = self_mut; let original_str = ctx.word_interner.get(*original).to_owned(); if two_typo.is_init() { return Ok(()); diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index bf521d9b2..fb749a797 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -43,7 +43,7 @@ pub struct QueryTermSubset { pub struct QueryTerm { original: Interned, ngram_words: Option>>, - max_nbr_typos: u8, + max_levenshtein_distance: u8, is_prefix: bool, zero_typo: ZeroTypoTerm, // May not be computed yet @@ -342,10 +342,16 @@ impl QueryTermSubset { } None } - pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 { + pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 { let t = ctx.term_interner.get(self.original); - match t.max_nbr_typos { - 0 => 0, + match t.max_levenshtein_distance { + 0 => { + if t.allows_split_words() { + 1 + } else { + 0 + } + } 1 => { if self.one_typo_subset.is_empty() { 0 @@ -438,6 +444,9 @@ impl QueryTerm { self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() } + fn allows_split_words(&self) -> bool { + self.zero_typo.phrase.is_none() + } } impl Interned { diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index dc317a0fb..bf90748e4 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -217,7 +217,7 @@ pub fn make_ngram( original: ngram_str_interned, ngram_words: Some(words_interned), is_prefix, - max_nbr_typos, + max_levenshtein_distance: max_nbr_typos, zero_typo: term.zero_typo, one_typo: Lazy::Uninit, two_typo: Lazy::Uninit, @@ -271,7 +271,7 @@ impl PhraseBuilder { QueryTerm { original: ctx.word_interner.insert(phrase_desc), ngram_words: None, - max_nbr_typos: 0, + max_levenshtein_distance: 0, is_prefix: false, zero_typo: ZeroTypoTerm { phrase: Some(phrase), diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index da5198c23..a44be6015 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph { // 3-gram -> equivalent to 2 typos let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; - for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { + for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) { let mut term = term.clone(); match nbr_typos { 0 => { From ec8f685d8404673e4961daeec364a993c46124e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 16 May 2023 17:01:30 +0200 Subject: [PATCH 4/6] Fix bug in cheapest path algorithm --- .../new/ranking_rule_graph/cheapest_paths.rs | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 30caf0017..4a696b3dd 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -209,7 +209,7 @@ impl RankingRuleGraph { self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| { if cur_node == self.query_graph.end_node { *costs_to_end.get_mut(self.query_graph.end_node) = vec![0]; - return true; + return; } let mut self_costs = Vec::::new(); @@ -226,7 +226,6 @@ impl RankingRuleGraph { self_costs.dedup(); *costs_to_end.get_mut(cur_node) = self_costs; - true }); costs_to_end } @@ -246,19 +245,18 @@ impl RankingRuleGraph { for cost in costs.get(edge.dest_node).iter() { costs_to_remove.remove(&(*cost + edge.cost as u64)); if costs_to_remove.is_empty() { - return false; + return; } } } if costs_to_remove.is_empty() { - return false; + return; } let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied()); for c in costs_to_remove { new_costs.remove(&c); } *costs.get_mut(cur_node) = new_costs.into_iter().collect(); - true }); } @@ -269,7 +267,7 @@ impl RankingRuleGraph { pub fn traverse_breadth_first_backward( &self, from: Interned, - mut visit: impl FnMut(Interned) -> bool, + mut visit: impl FnMut(Interned), ) { let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes); { @@ -312,13 +310,11 @@ impl RankingRuleGraph { continue; } unreachable_or_visited.insert(cur_node); - if visit(cur_node) { - for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { - if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) - { - stack.push_back(prev_node); - enqueued.insert(prev_node); - } + visit(cur_node); + for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { + if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) { + stack.push_back(prev_node); + enqueued.insert(prev_node); } } } From a490a11325cebfde7ead969c95ebc95af78e1352 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 23 May 2023 15:24:24 +0200 Subject: [PATCH 5/6] Add explanatory comment on the way we're recomputing costs --- milli/src/search/new/ranking_rule_graph/cheapest_paths.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 4a696b3dd..738b53016 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -235,6 +235,9 @@ impl RankingRuleGraph { node_with_removed_outgoing_conditions: Interned, costs: &mut MappedInterner>, ) { + // Traverse the graph backward from the target node, recomputing the cost for each of its predecessors. + // We first check that no other node is contributing the same total cost to a predecessor before removing + // the cost from the predecessor. self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| { let mut costs_to_remove = FxHashSet::default(); costs_to_remove.extend(costs.get(cur_node).iter().copied()); From 51043f78f0159ab18f7d64051aa076514b5019aa Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 23 May 2023 15:27:25 +0200 Subject: [PATCH 6/6] Remove trailing whitespace --- milli/src/search/new/ranking_rule_graph/cheapest_paths.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 738b53016..8fd943e6e 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -237,7 +237,7 @@ impl RankingRuleGraph { ) { // Traverse the graph backward from the target node, recomputing the cost for each of its predecessors. // We first check that no other node is contributing the same total cost to a predecessor before removing - // the cost from the predecessor. + // the cost from the predecessor. self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| { let mut costs_to_remove = FxHashSet::default(); costs_to_remove.extend(costs.get(cur_node).iter().copied());