diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 2cedbffa5..d8f881b07 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -85,15 +85,15 @@ fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( universe: &RoaringBitmap, empty_paths_cache: &mut EmptyPathsCache, ) -> Result<()> { - for edge_index in 0..graph.all_edges.len() as u16 { - if graph.all_edges[edge_index as usize].is_none() { + for edge_index in 0..graph.edges_store.len() as u16 { + if graph.edges_store[edge_index as usize].is_none() { continue; } let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?; match docids { BitmapOrAllRef::Bitmap(docids) => { if docids.is_disjoint(universe) { - graph.remove_edge(edge_index); + graph.remove_ranking_rule_edge(edge_index); empty_paths_cache.forbid_edge(edge_index); edge_docids_cache.cache.remove(&edge_index); continue; @@ -120,7 +120,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let mut edge_docids_cache = EdgeDocidsCache::default(); - let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16); + let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16); // First simplify the graph as much as possible, by computing the docids of the edges // within the rule's universe and removing the edges that have no associated docids. @@ -242,7 +242,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> // 1. Store in the cache that this edge is empty for this universe empty_paths_cache.forbid_edge(edge_index); // 2. remove this edge from the ranking rule graph - graph.remove_edge(edge_index); + graph.remove_ranking_rule_edge(edge_index); // 3. 
Also remove the entry from the edge_docids_cache, since we don't need it anymore edge_docids_cache.cache.remove(&edge_index); return Ok(()); diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index c6570ef54..468bc0343 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::search::new::ranking_rule_graph::{ - Edge, EdgeDetails, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, + Edge, EdgeCondition, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, }; use crate::search::new::small_bitmap::SmallBitmap; @@ -534,24 +534,24 @@ shape: class" let distances = &distances[node_idx]; Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file); } - for edge in graph.all_edges.iter().flatten() { - let Edge { from_node, to_node, details, .. } = edge; + for edge in graph.edges_store.iter().flatten() { + let Edge { source_node, dest_node, condition: details, .. } = edge; match &details { - EdgeDetails::Unconditional => { + EdgeCondition::Unconditional => { writeln!( file, - "{from_node} -> {to_node} : \"always cost {cost}\"", + "{source_node} -> {dest_node} : \"always cost {cost}\"", cost = edge.cost, ) .unwrap(); } - EdgeDetails::Data(details) => { + EdgeCondition::Conditional(details) => { writeln!( file, - "{from_node} -> {to_node} : \"cost {cost} {edge_label}\"", + "{source_node} -> {dest_node} : \"cost {cost} {edge_label}\"", cost = edge.cost, - edge_label = R::graphviz_edge_details_label(details) + edge_label = R::label_for_edge_condition(details) ) .unwrap(); } @@ -589,10 +589,10 @@ shape: class" edge_idx: u16, file: &mut File, ) { - let Edge { from_node, to_node, cost, .. 
} = - graph.all_edges[edge_idx as usize].as_ref().unwrap(); - let from_node = &graph.query_graph.nodes[*from_node as usize]; - let from_node_desc = match from_node { + let Edge { source_node, dest_node, cost, .. } = + graph.edges_store[edge_idx as usize].as_ref().unwrap(); + let source_node = &graph.query_graph.nodes[*source_node as usize]; + let source_node_desc = match source_node { QueryNode::Term(term) => match &term.value { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); @@ -606,8 +606,8 @@ shape: class" QueryNode::Start => "START".to_owned(), QueryNode::End => "END".to_owned(), }; - let to_node = &graph.query_graph.nodes[*to_node as usize]; - let to_node_desc = match to_node { + let dest_node = &graph.query_graph.nodes[*dest_node as usize]; + let dest_node_desc = match dest_node { QueryNode::Term(term) => match &term.value { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); @@ -623,7 +623,7 @@ shape: class" }; writeln!( file, - "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + "{edge_idx}: \"{source_node_desc}->{dest_node_desc} [{cost}]\" {{ shape: class }}" ) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 46a62b4a9..446f34e68 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -1,6 +1,3 @@ -// TODO: put primitive query part in here - -use std::borrow::Cow; use std::mem; use std::ops::RangeInclusive; @@ -18,6 +15,8 @@ use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; use crate::{CboRoaringBitmapLenCodec, Index, Result}; +/// A phrase in the user's search query, consisting of several words +/// that must appear side-by-side in the search results. 
#[derive(Default, Clone, PartialEq, Eq, Hash)] pub struct Phrase { pub words: Vec>>, @@ -28,18 +27,38 @@ impl Phrase { } } +/// A structure storing all the different ways to match +/// a term in the user's search query. #[derive(Clone)] pub struct WordDerivations { + /// The original word pub original: Interned, - // TODO: pub prefix_of: Vec, + // TODO: original should only be used for debugging purposes? + // TODO: pub zero_typo: Option>, + // TODO: pub prefix_of: Box<[Interned]>, + /// All the synonyms of the original word pub synonyms: Box<[Interned]>, + + /// The original word split into multiple consecutive words pub split_words: Option>, + + /// The original word and words which are prefixed by it pub zero_typo: Box<[Interned]>, + + /// Words that are 1 typo away from the original word pub one_typo: Box<[Interned]>, + + /// Words that are 2 typos away from the original word pub two_typos: Box<[Interned]>, + + /// True if the prefix databases must be used to retrieve + /// the words which are prefixed by the original word. pub use_prefix_db: bool, } impl WordDerivations { + /// Return an iterator over all the single words derived from the original word. + /// + /// This excludes synonyms, split words, and words stored in the prefix databases. 
pub fn all_derivations_except_prefix_db( &'_ self, ) -> impl Iterator> + Clone + '_ { @@ -49,17 +68,20 @@ impl WordDerivations { self.zero_typo.is_empty() && self.one_typo.is_empty() && self.two_typos.is_empty() + && self.synonyms.is_empty() + && self.split_words.is_none() && !self.use_prefix_db } } +/// Compute the word derivations for the given word pub fn word_derivations( ctx: &mut SearchContext, word: &str, max_typo: u8, is_prefix: bool, - fst: &fst::Set>, ) -> Result { + let fst = ctx.index.words_fst(ctx.txn)?; let word_interned = ctx.word_interner.insert(word.to_owned()); let use_prefix_db = is_prefix @@ -171,6 +193,10 @@ pub fn word_derivations( }) } +/// Split the original word into the two words that appear the +/// most next to each other in the index. +/// +/// Return `None` if the original word cannot be split. fn split_best_frequency( index: &Index, txn: &RoTxn, @@ -199,16 +225,12 @@ fn split_best_frequency( #[derive(Clone)] pub enum QueryTerm { - // TODO: should there be SplitWord, NGram2, and NGram3 variants? - // NGram2 can have 1 typo and synonyms - // NGram3 cannot have typos but can have synonyms - // SplitWords are a phrase - // Can NGrams be prefixes? Phrase { phrase: Interned }, Word { derivations: WordDerivations }, } impl QueryTerm { + /// Return the original word from the given query term pub fn original_single_word<'interner>( &self, word_interner: &'interner Interner, @@ -226,6 +248,7 @@ impl QueryTerm { } } +/// A query term coupled with its position in the user's search query. #[derive(Clone)] pub struct LocatedQueryTerm { pub value: QueryTerm, @@ -233,14 +256,18 @@ pub struct LocatedQueryTerm { } impl LocatedQueryTerm { + /// Return `true` iff the word derivations within the query term are empty pub fn is_empty(&self) -> bool { match &self.value { + // TODO: phrases should be greedily computed, so that they can be excluded from + // the query graph right from the start? 
QueryTerm::Phrase { phrase: _ } => false, QueryTerm::Word { derivations, .. } => derivations.is_empty(), } } } +/// Convert the tokenised search query into a list of located query terms. pub fn located_query_terms_from_string<'search>( ctx: &mut SearchContext<'search>, query: NormalizedTokenIter>, @@ -250,8 +277,8 @@ pub fn located_query_terms_from_string<'search>( let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; + // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? let exact_words = ctx.index.exact_words(ctx.txn)?; - let fst = ctx.index.words_fst(ctx.txn)?; let nbr_typos = |word: &str| { if !authorize_typos @@ -266,9 +293,9 @@ pub fn located_query_terms_from_string<'search>( } }; - let mut primitive_query = Vec::new(); - let mut phrase = Vec::new(); + let mut located_terms = Vec::new(); + let mut phrase = Vec::new(); let mut quoted = false; let parts_limit = words_limit.unwrap_or(usize::MAX); @@ -280,8 +307,8 @@ pub fn located_query_terms_from_string<'search>( let mut peekable = query.peekable(); while let Some(token) = peekable.next() { // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { - return Ok(primitive_query); + if located_terms.len() >= parts_limit { + return Ok(located_terms); } match token.kind { @@ -307,24 +334,23 @@ pub fn located_query_terms_from_string<'search>( match token.kind { TokenKind::Word => { let word = token.lemma(); - let derivations = - word_derivations(ctx, word, nbr_typos(word), false, &fst)?; + let derivations = word_derivations(ctx, word, nbr_typos(word), false)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, positions: position..=position, }; - primitive_query.push(located_term); + located_terms.push(located_term); } TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} } } else { let word = token.lemma(); - let derivations = 
word_derivations(ctx, word, nbr_typos(word), true, &fst)?; + let derivations = word_derivations(ctx, word, nbr_typos(word), true)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, positions: position..=position, }; - primitive_query.push(located_term); + located_terms.push(located_term); } } TokenKind::Separator(separator_kind) => { @@ -352,7 +378,7 @@ pub fn located_query_terms_from_string<'search>( }, positions: phrase_start..=phrase_end, }; - primitive_query.push(located_query_term); + located_terms.push(located_query_term); } } _ => (), @@ -367,10 +393,10 @@ pub fn located_query_terms_from_string<'search>( }, positions: phrase_start..=phrase_end, }; - primitive_query.push(located_query_term); + located_terms.push(located_query_term); } - Ok(primitive_query) + Ok(located_terms) } // TODO: return a word derivations instead? @@ -396,6 +422,8 @@ pub fn ngram2( _ => None, } } + +// TODO: return a word derivations instead? pub fn ngram3( ctx: &mut SearchContext, x: &LocatedQueryTerm, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index d9732b010..49c78a32f 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -6,49 +6,43 @@ use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; impl RankingRuleGraph { + /// Build the ranking rule graph from the given query graph pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. 
} = &query_graph; - let mut all_edges = vec![]; - let mut node_edges = vec![]; - let mut successors = vec![]; + let mut edges_store = vec![]; + let mut edges_of_node = vec![]; for (node_idx, node) in graph_nodes.iter().enumerate() { - node_edges.push(HashSet::new()); - successors.push(HashSet::new()); - let new_edges = node_edges.last_mut().unwrap(); - let new_successors = successors.last_mut().unwrap(); + edges_of_node.push(HashSet::new()); + let new_edges = edges_of_node.last_mut().unwrap(); - let Some(from_node_data) = G::build_visit_from_node(ctx, node)? else { continue }; + let Some(source_node_data) = G::build_step_visit_source_node(ctx, node)? else { continue }; for successor_idx in graph_edges[node_idx].successors.iter() { - let to_node = &graph_nodes[successor_idx as usize]; - let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?; + let dest_node = &graph_nodes[successor_idx as usize]; + let edges = + G::build_step_visit_destination_node(ctx, dest_node, &source_node_data)?; if edges.is_empty() { continue; } - edges.sort_by_key(|e| e.0); + for (cost, details) in edges { - all_edges.push(Some(Edge { - from_node: node_idx as u16, - to_node: successor_idx, + edges_store.push(Some(Edge { + source_node: node_idx as u16, + dest_node: successor_idx, cost, - details, + condition: details, })); - new_edges.insert(all_edges.len() as u16 - 1); - new_successors.insert(successor_idx); + new_edges.insert(edges_store.len() as u16 - 1); } } } - let node_edges = node_edges + let edges_of_node = edges_of_node .into_iter() - .map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16)) - .collect(); - let successors = successors - .into_iter() - .map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16)) + .map(|edges| SmallBitmap::from_iter(edges.into_iter(), edges_store.len() as u16)) .collect(); - Ok(RankingRuleGraph { query_graph, all_edges, node_edges, successors }) + Ok(RankingRuleGraph { query_graph, edges_store, 
edges_of_node }) } } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 8627860e7..529bb32c4 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -30,7 +30,7 @@ impl RankingRuleGraph { empty_paths_cache, &mut visit, &mut vec![], - &mut SmallBitmap::new(self.all_edges.len() as u16), + &mut SmallBitmap::new(self.edges_store.len() as u16), empty_paths_cache.empty_edges.clone(), )?; Ok(()) @@ -48,12 +48,12 @@ impl RankingRuleGraph { ) -> Result { let mut any_valid = false; - let edges = self.node_edges[from].clone(); + let edges = self.edges_of_node[from].clone(); for edge_idx in edges.iter() { - let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue }; + let Some(edge) = self.edges_store[edge_idx as usize].as_ref() else { continue }; if cost < edge.cost as u16 || forbidden_edges.contains(edge_idx) - || !all_distances[edge.to_node as usize].iter().any( + || !all_distances[edge.dest_node as usize].iter().any( |(next_cost, necessary_edges)| { (*next_cost == cost - edge.cost as u16) && !forbidden_edges.intersects(necessary_edges) @@ -71,13 +71,13 @@ impl RankingRuleGraph { new_forbidden_edges.insert(x); }); - let next_any_valid = if edge.to_node == self.query_graph.end_node { + let next_any_valid = if edge.dest_node == self.query_graph.end_node { any_valid = true; visit(prev_edges, self, empty_paths_cache)?; true } else { self.visit_paths_of_cost_rec( - edge.to_node as usize, + edge.dest_node as usize, cost - edge.cost as u16, all_distances, empty_paths_cache, @@ -115,7 +115,7 @@ impl RankingRuleGraph { let mut node_stack = VecDeque::new(); distances_to_end[self.query_graph.end_node as usize] = - vec![(0, SmallBitmap::new(self.all_edges.len() as u16))]; + vec![(0, SmallBitmap::new(self.edges_store.len() as u16))]; for prev_node in self.query_graph.edges[self.query_graph.end_node as 
usize].predecessors.iter() @@ -127,15 +127,15 @@ impl RankingRuleGraph { while let Some(cur_node) = node_stack.pop_front() { let mut self_distances = BTreeMap::::new(); - let cur_node_edges = &self.node_edges[cur_node]; + let cur_node_edges = &self.edges_of_node[cur_node]; for edge_idx in cur_node_edges.iter() { - let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); - let succ_node = edge.to_node; + let edge = self.edges_store[edge_idx as usize].as_ref().unwrap(); + let succ_node = edge.dest_node; let succ_distances = &distances_to_end[succ_node as usize]; for (succ_distance, succ_necessary_edges) in succ_distances { let potential_necessary_edges = SmallBitmap::from_iter( std::iter::once(edge_idx).chain(succ_necessary_edges.iter()), - self.all_edges.len() as u16, + self.edges_store.len() as u16, ); match self_distances.entry(edge.cost as u16 + succ_distance) { Entry::Occupied(mut prev_necessary_edges) => { diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index c0c46289c..f7bf1b002 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -3,28 +3,13 @@ use std::marker::PhantomData; use fxhash::FxHashMap; use roaring::RoaringBitmap; -use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::{BitmapOrAllRef, SearchContext}; use crate::Result; -// TODO: the cache should have a G::EdgeDetails as key -// but then it means that we should have a quick way of -// computing their hash and comparing them -// which can be done... -// by using a pointer (real, Rc, bumpalo, or in a vector)??? -// -// But actually.... the edge details' docids are a subset of the universe at the -// moment they were computed. 
-// But the universes between two iterations of a ranking rule are completely different -// Thus, there is no point in doing this. -// UNLESS... -// we compute the whole docids corresponding to the edge details (potentially expensive in time and memory -// in the common case) -// -// But we could still benefit within a single iteration for requests like: -// `a a a a a a a a a` where we have many of the same edge details, repeated - +/// A cache storing the document ids associated with each ranking rule edge pub struct EdgeDocidsCache { + // TODO: should be FxHashMap, RoaringBitmap> pub cache: FxHashMap, _phantom: PhantomData, } @@ -34,19 +19,24 @@ impl Default for EdgeDocidsCache { } } impl EdgeDocidsCache { + /// Retrieve the document ids for the given edge condition. + /// + /// If the cache does not yet contain these docids, they are computed + /// and inserted in the cache. pub fn get_edge_docids<'s, 'search>( &'s mut self, ctx: &mut SearchContext<'search>, + // TODO: should be Interned edge_index: u16, graph: &RankingRuleGraph, // TODO: maybe universe doesn't belong here universe: &RoaringBitmap, ) -> Result> { - let edge = graph.all_edges[edge_index as usize].as_ref().unwrap(); + let edge = graph.edges_store[edge_index as usize].as_ref().unwrap(); - match &edge.details { - EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All), - EdgeDetails::Data(details) => { + match &edge.condition { + EdgeCondition::Unconditional => Ok(BitmapOrAllRef::All), + EdgeCondition::Conditional(details) => { if self.cache.contains_key(&edge_index) { // TODO: should we update the bitmap in the cache if the new universe // reduces it? 
@@ -56,7 +46,7 @@ impl EdgeDocidsCache { return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here - let docids = universe & G::compute_docids(ctx, details, universe)?; + let docids = universe & G::resolve_edge_condition(ctx, details, universe)?; let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index 659042a01..deac05502 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,20 +1,29 @@ -use super::paths_map::PathsMap; +use super::paths_map::PathSet; use crate::search::new::small_bitmap::SmallBitmap; +/// A cache which stores sufficient conditions for a path +/// to resolve to an empty set of candidates within the current +/// universe. #[derive(Clone)] pub struct EmptyPathsCache { + /// The set of edge indexes that resolve to no documents. pub empty_edges: SmallBitmap, - pub empty_prefixes: PathsMap<()>, + /// A set of path prefixes that resolve to no documents. + pub empty_prefixes: PathSet, + /// A set of couples of edge indexes that resolve to no documents. pub empty_couple_edges: Vec, } impl EmptyPathsCache { + /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. pub fn new(all_edges_len: u16) -> Self { Self { empty_edges: SmallBitmap::new(all_edges_len), - empty_prefixes: PathsMap::default(), + empty_prefixes: PathSet::default(), empty_couple_edges: vec![SmallBitmap::new(all_edges_len); all_edges_len as usize], } } + + /// Store in the cache that every path containing the given edge resolves to no documents. 
pub fn forbid_edge(&mut self, edge_idx: u16) { self.empty_edges.insert(edge_idx); self.empty_couple_edges[edge_idx as usize].clear(); @@ -23,12 +32,17 @@ impl EmptyPathsCache { edges2.remove(edge_idx); } } + /// Store in the cache that every path containing the given prefix resolves to no documents. pub fn forbid_prefix(&mut self, prefix: &[u16]) { - self.empty_prefixes.insert(prefix.iter().copied(), ()); + self.empty_prefixes.insert(prefix.iter().copied()); } + + /// Store in the cache that every path containing the two given edges resolves to no documents. pub fn forbid_couple_edges(&mut self, edge1: u16, edge2: u16) { self.empty_couple_edges[edge1 as usize].insert(edge2); } + + /// Returns true if the cache can determine that the given path resolves to no documents. pub fn path_is_empty(&self, path: &[u16], path_bitmap: &SmallBitmap) -> bool { if path_bitmap.intersects(&self.empty_edges) { return true; diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 635f194f5..3f74a3cf5 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -1,9 +1,19 @@ +/*! Module implementing the graph used for the graph-based ranking rules +and its related algorithms. + +A ranking rule graph is built on top of the [`QueryGraph`]: the nodes stay +the same but the edges are replaced. +*/ + mod build; mod cheapest_paths; mod edge_docids_cache; mod empty_paths_cache; mod paths_map; + +/// Implementation of the `proximity` ranking rule mod proximity; +/// Implementation of the `typo` ranking rule mod typo; pub use edge_docids_cache::EdgeDocidsCache; @@ -17,30 +27,38 @@ use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::Result; +/// The condition that is associated with an edge in the ranking rule graph. +/// +/// Some edges are unconditional, which means that traversing them does not reduce +/// the set of candidates. 
+/// +/// Most edges, however, have a condition attached to them. For example, for the +/// proximity ranking rule, the condition could be that a word is N-close to another one. +/// When the edge is traversed, some database operations are executed to retrieve the set +/// of documents that satisfy the condition, which reduces the list of candidate document ids. #[derive(Debug, Clone)] -pub enum EdgeDetails { +pub enum EdgeCondition { Unconditional, - Data(E), + Conditional(E), } +/// An edge in the ranking rule graph. +/// +/// It contains: +/// 1. The source and destination nodes +/// 2. The cost of traversing this edge +/// 3. The condition associated with it #[derive(Debug, Clone)] pub struct Edge { - pub from_node: u16, - pub to_node: u16, + pub source_node: u16, + pub dest_node: u16, pub cost: u8, - pub details: EdgeDetails, -} - -#[derive(Debug, Clone)] -pub struct EdgePointer<'graph, E> { - pub index: u16, - pub edge: &'graph Edge, + pub condition: EdgeCondition, } // pub struct SubWordDerivations { // words: FxHashSet>, -// synonyms: FxHashSet>, // NO! they're phrases, not strings -// split_words: bool, +// phrases: FxHashSet>, // use_prefix_db: bool, // } @@ -74,46 +92,55 @@ pub struct EdgePointer<'graph, E> { // } // fn word_derivations_used_by_edge( -// edge: G::EdgeDetails, +// edge: G::EdgeCondition, // ) -> SubWordDerivations { // todo!() // } +/// A trait to be implemented by a marker type to build a graph-based ranking rule. +/// +/// It mostly describes how to: +/// 1. Retrieve the set of edges (their cost and condition) between two nodes. +/// 2. Compute the document ids satisfying a condition pub trait RankingRuleGraphTrait: Sized { - /// The details of an edge connecting two query nodes. These details + /// The condition of an edge connecting two query nodes. The condition /// should be sufficient to compute the edge's cost and associated document ids - /// in [`compute_docids`](RankingRuleGraphTrait). 
- type EdgeDetails: Sized + Clone; + /// in [`resolve_edge_condition`](RankingRuleGraphTrait::resolve_edge_condition). + type EdgeCondition: Sized + Clone; + /// A structure used in the construction of the graph, created when a + /// query graph source node is visited. It is used to determine the cost + /// and condition of a ranking rule edge when the destination node is visited. type BuildVisitedFromNode; - /// Return the label of the given edge details, to be used when visualising - /// the ranking rule graph using GraphViz. - fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String; + /// Return the label of the given edge condition, to be used when visualising + /// the ranking rule graph. + fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String; - /// Compute the document ids associated with the given edge. - fn compute_docids<'search>( + /// Compute the document ids associated with the given edge condition, + /// restricted to the given universe. + fn resolve_edge_condition<'search>( ctx: &mut SearchContext<'search>, - edge_details: &Self::EdgeDetails, + edge_condition: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result; - /// Prepare to build the edges outgoing from `from_node`. + /// Prepare to build the edges outgoing from `source_node`. /// - /// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node), + /// This call is followed by zero, one or more calls to [`build_step_visit_destination_node`](RankingRuleGraphTrait::build_step_visit_destination_node), /// which builds the actual edges. - fn build_visit_from_node<'search>( + fn build_step_visit_source_node<'search>( ctx: &mut SearchContext<'search>, - from_node: &QueryNode, + source_node: &QueryNode, ) -> Result>; - /// Return the cost and details of the edges going from the previously visited node - /// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`. 
- fn build_visit_to_node<'from_data, 'search: 'from_data>( + /// Return the cost and condition of the edges going from the previously visited node + /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`. + fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( ctx: &mut SearchContext<'search>, - to_node: &QueryNode, - from_node_data: &'from_data Self::BuildVisitedFromNode, - ) -> Result)>>; + dest_node: &QueryNode, + source_node_data: &'from_data Self::BuildVisitedFromNode, + ) -> Result)>>; fn log_state( graph: &RankingRuleGraph, @@ -126,45 +153,32 @@ pub trait RankingRuleGraphTrait: Sized { ); } +/// The graph used by graph-based ranking rules. +/// +/// It is built on top of a [`QueryGraph`], keeping the same nodes +/// but replacing the edges. pub struct RankingRuleGraph { pub query_graph: QueryGraph, - // pub edges: Vec>>>, - pub all_edges: Vec>>, - - pub node_edges: Vec, - - pub successors: Vec, - // TODO: to get the edges between two nodes: - // 1. get node_outgoing_edges[from] - // 2. get node_incoming_edges[to] - // 3. 
take intersection betweem the two + pub edges_store: Vec>>, + pub edges_of_node: Vec, } impl Clone for RankingRuleGraph { fn clone(&self) -> Self { Self { query_graph: self.query_graph.clone(), - all_edges: self.all_edges.clone(), - node_edges: self.node_edges.clone(), - successors: self.successors.clone(), + edges_store: self.edges_store.clone(), + edges_of_node: self.edges_of_node.clone(), } } } impl RankingRuleGraph { - pub fn remove_edge(&mut self, edge_index: u16) { - let edge_opt = &mut self.all_edges[edge_index as usize]; + /// Remove the given edge from the ranking rule graph + pub fn remove_ranking_rule_edge(&mut self, edge_index: u16) { + let edge_opt = &mut self.edges_store[edge_index as usize]; let Some(edge) = &edge_opt else { return }; - let (from_node, _to_node) = (edge.from_node, edge.to_node); + let (source_node, _dest_node) = (edge.source_node, edge.dest_node); *edge_opt = None; - let from_node_edges = &mut self.node_edges[from_node as usize]; - from_node_edges.remove(edge_index); - - let mut new_successors_from_node = SmallBitmap::new(self.all_edges.len() as u16); - let all_edges = &self.all_edges; - for from_node_edge in from_node_edges.iter() { - let Edge { to_node, .. } = &all_edges[from_node_edge as usize].as_ref().unwrap(); - new_successors_from_node.insert(*to_node); - } - self.successors[from_node as usize] = new_successors_from_node; + self.edges_of_node[source_node as usize].remove(edge_index); } } diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 82f181b97..b601f28d9 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -1,117 +1,32 @@ -use super::cheapest_paths::Path; -use crate::search::new::small_bitmap::SmallBitmap; - -// What is PathsMap used for? +// What is PathSet used for? 
// For the empty_prefixes field in the EmptyPathsCache only :/ // but it could be used for more, like efficient computing of a set of paths -#[derive(Debug, Clone)] -pub struct PathsMap { - pub nodes: Vec<(u16, PathsMap)>, - pub value: Option, +/// A set of [`Path`] +#[derive(Default, Debug, Clone)] +pub struct PathSet { + nodes: Vec<(u16, PathSet)>, + is_end: bool, } -impl Default for PathsMap { - fn default() -> Self { - Self { nodes: vec![], value: None } - } -} - -impl PathsMap { - pub fn from_paths(paths: &[Path]) -> Self { - let mut result = Self::default(); - for p in paths { - result.add_path(p); - } - result - } - pub fn add_path(&mut self, path: &Path) { - self.insert(path.edges.iter().copied(), path.cost); - } -} -impl PathsMap { - pub fn is_empty(&self) -> bool { - self.nodes.is_empty() && self.value.is_none() - } - - pub fn insert(&mut self, mut edges: impl Iterator, value: V) { +impl PathSet { + pub fn insert(&mut self, mut edges: impl Iterator) { match edges.next() { None => { - self.value = Some(value); + self.is_end = true; } Some(first_edge) => { - // comment for (edge, next_node) in &mut self.nodes { if edge == &first_edge { - return next_node.insert(edges, value); + return next_node.insert(edges); } } - let mut rest = PathsMap::default(); - rest.insert(edges, value); + let mut rest = PathSet::default(); + rest.insert(edges); self.nodes.push((first_edge, rest)); } } } - fn remove_first_rec(&mut self, cur: &mut Vec) -> (bool, V) { - let Some((first_edge, rest)) = self.nodes.first_mut() else { - // The PathsMap has to be correct by construction here, otherwise - // the unwrap() will crash - return (true, self.value.take().unwrap()) - }; - cur.push(*first_edge); - let (rest_is_empty, value) = rest.remove_first_rec(cur); - if rest_is_empty { - self.nodes.remove(0); - (self.nodes.is_empty(), value) - } else { - (false, value) - } - } - pub fn remove_first(&mut self) -> Option<(Vec, V)> { - if self.is_empty() { - return None; - } - let mut result = 
vec![]; - let (_, value) = self.remove_first_rec(&mut result); - Some((result, value)) - } - pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { - if let Some(value) = &self.value { - visit(cur, value); - } - for (first_edge, rest) in self.nodes.iter() { - cur.push(*first_edge); - rest.iterate_rec(cur, visit); - cur.pop(); - } - } - pub fn iterate(&self, mut visit: impl FnMut(&Vec, &V)) { - self.iterate_rec(&mut vec![], &mut visit) - } - - pub fn remove_prefixes(&mut self, prefixes: &PathsMap) { - prefixes.iterate(|prefix, _v| { - self.remove_prefix(prefix); - }); - } - pub fn remove_edges(&mut self, forbidden_edges: &SmallBitmap) { - let mut i = 0; - while i < self.nodes.len() { - let should_remove = if forbidden_edges.contains(self.nodes[i].0) { - true - } else if !self.nodes[i].1.nodes.is_empty() { - self.nodes[i].1.remove_edges(forbidden_edges); - self.nodes[i].1.nodes.is_empty() - } else { - false - }; - if should_remove { - self.nodes.remove(i); - } else { - i += 1; - } - } - } pub fn remove_edge(&mut self, forbidden_edge: &u16) { let mut i = 0; while i < self.nodes.len() { @@ -130,34 +45,11 @@ impl PathsMap { } } } - pub fn remove_prefix(&mut self, forbidden_prefix: &[u16]) { - let [first_edge, remaining_prefix @ ..] = forbidden_prefix else { - self.nodes.clear(); - self.value = None; - return; - }; - - let mut i = 0; - while i < self.nodes.len() { - let edge = self.nodes[i].0; - let should_remove = if edge == *first_edge { - self.nodes[i].1.remove_prefix(remaining_prefix); - self.nodes[i].1.nodes.is_empty() - } else { - false - }; - if should_remove { - self.nodes.remove(i); - } else { - i += 1; - } - } - } pub fn final_edges_after_prefix(&self, prefix: &[u16], visit: &mut impl FnMut(u16)) { let [first_edge, remaining_prefix @ ..] 
= prefix else { for node in self.nodes.iter() { - if node.1.value.is_some() { + if node.1.is_end { visit(node.0) } } @@ -170,20 +62,8 @@ impl PathsMap { } } - pub fn edge_indices_after_prefix(&self, prefix: &[u16]) -> Vec { - let [first_edge, remaining_prefix @ ..] = prefix else { - return self.nodes.iter().map(|n| n.0).collect(); - }; - for (edge, rest) in self.nodes.iter() { - if edge == first_edge { - return rest.edge_indices_after_prefix(remaining_prefix); - } - } - vec![] - } - pub fn contains_prefix_of_path(&self, path: &[u16]) -> bool { - if self.value.is_some() { + if self.is_end { return true; } match path { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 48a6dda7e..0911f0638 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -5,7 +5,7 @@ use itertools::Itertools; use super::ProximityEdge; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::search::new::ranking_rule_graph::proximity::WordPair; -use crate::search::new::ranking_rule_graph::EdgeDetails; +use crate::search::new::ranking_rule_graph::EdgeCondition; use crate::search::new::{QueryNode, SearchContext}; use crate::Result; @@ -57,10 +57,10 @@ pub fn visit_to_node<'search, 'from_data>( ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data (WordDerivations, i8), -) -> Result)>> { +) -> Result)>> { let (derivations1, pos1) = from_node_data; let term2 = match &to_node { - QueryNode::End => return Ok(vec![(0, EdgeDetails::Unconditional)]), + QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNode::Deleted | QueryNode::Start => return Ok(vec![]), QueryNode::Term(term) => term, }; @@ -96,7 +96,7 @@ pub fn visit_to_node<'search, 'from_data>( // We want to effectively ignore this pair of terms // Unconditionally walk through the edge 
without computing the docids // But also what should the cost be? - return Ok(vec![(0, EdgeDetails::Unconditional)]); + return Ok(vec![(0, EdgeCondition::Unconditional)]); } let updb1 = derivations1.use_prefix_db; @@ -189,7 +189,7 @@ pub fn visit_to_node<'search, 'from_data>( for (proximity, word_pairs) in proximity_word_pairs { edges.push(( cost, - EdgeDetails::Data(ProximityEdge { + EdgeCondition::Conditional(ProximityEdge { pairs: word_pairs.into_boxed_slice(), proximity, }), @@ -198,6 +198,6 @@ pub fn visit_to_node<'search, 'from_data>( edges }) .collect::>(); - new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional)); + new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeCondition::Unconditional)); Ok(new_edges) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 09c9aa960..bf07bf21d 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -4,7 +4,7 @@ pub mod compute_docids; use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; -use super::{EdgeDetails, RankingRuleGraphTrait}; +use super::{EdgeCondition, RankingRuleGraphTrait}; use crate::search::new::interner::Interned; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::WordDerivations; @@ -30,34 +30,34 @@ pub struct ProximityEdge { pub enum ProximityGraph {} impl RankingRuleGraphTrait for ProximityGraph { - type EdgeDetails = ProximityEdge; + type EdgeCondition = ProximityEdge; type BuildVisitedFromNode = (WordDerivations, i8); - fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String { + fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { let ProximityEdge { pairs, proximity } = edge; format!(", prox {proximity}, {} pairs", pairs.len()) } - fn compute_docids<'search>( + fn resolve_edge_condition<'search>( ctx: &mut SearchContext<'search>, - edge: 
&Self::EdgeDetails, + edge: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result { compute_docids::compute_docids(ctx, edge, universe) } - fn build_visit_from_node<'search>( + fn build_step_visit_source_node<'search>( ctx: &mut SearchContext<'search>, from_node: &QueryNode, ) -> Result> { build::visit_from_node(ctx, from_node) } - fn build_visit_to_node<'from_data, 'search: 'from_data>( + fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, - ) -> Result)>> { + ) -> Result)>> { build::visit_to_node(ctx, to_node, from_node_data) } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index bf2c6572e..2f6e7ad80 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -2,7 +2,7 @@ use heed::BytesDecode; use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; -use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::Interned; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; @@ -20,19 +20,19 @@ pub enum TypoEdge { pub enum TypoGraph {} impl RankingRuleGraphTrait for TypoGraph { - type EdgeDetails = TypoEdge; + type EdgeCondition = TypoEdge; type BuildVisitedFromNode = (); - fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String { + fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { match edge { TypoEdge::Phrase { .. } => ", 0 typos".to_owned(), TypoEdge::Word { nbr_typos, .. 
} => format!(", {nbr_typos} typos"), } } - fn compute_docids<'db_cache, 'search>( + fn resolve_edge_condition<'db_cache, 'search>( ctx: &mut SearchContext<'search>, - edge: &Self::EdgeDetails, + edge: &Self::EdgeCondition, universe: &RoaringBitmap, ) -> Result { match edge { @@ -66,29 +66,29 @@ impl RankingRuleGraphTrait for TypoGraph { } } - fn build_visit_from_node<'search>( + fn build_step_visit_source_node<'search>( _ctx: &mut SearchContext<'search>, _from_node: &QueryNode, ) -> Result> { Ok(Some(())) } - fn build_visit_to_node<'from_data, 'search: 'from_data>( + fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( _ctx: &mut SearchContext<'search>, to_node: &QueryNode, _from_node_data: &'from_data Self::BuildVisitedFromNode, - ) -> Result)>> { + ) -> Result)>> { match to_node { QueryNode::Term(LocatedQueryTerm { value, .. }) => match value { &QueryTerm::Phrase { phrase } => { - Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase }))]) + Ok(vec![(0, EdgeCondition::Conditional(TypoEdge::Phrase { phrase }))]) } QueryTerm::Word { derivations } => { let mut edges = vec![]; if !derivations.zero_typo.is_empty() || derivations.use_prefix_db { edges.push(( 0, - EdgeDetails::Data(TypoEdge::Word { + EdgeCondition::Conditional(TypoEdge::Word { derivations: derivations.clone(), nbr_typos: 0, }), @@ -97,7 +97,7 @@ impl RankingRuleGraphTrait for TypoGraph { if !derivations.one_typo.is_empty() { edges.push(( 1, - EdgeDetails::Data(TypoEdge::Word { + EdgeCondition::Conditional(TypoEdge::Word { derivations: derivations.clone(), nbr_typos: 1, }), @@ -106,7 +106,7 @@ impl RankingRuleGraphTrait for TypoGraph { if !derivations.two_typos.is_empty() { edges.push(( 2, - EdgeDetails::Data(TypoEdge::Word { + EdgeCondition::Conditional(TypoEdge::Word { derivations: derivations.clone(), nbr_typos: 2, }), @@ -115,7 +115,7 @@ impl RankingRuleGraphTrait for TypoGraph { Ok(edges) } }, - QueryNode::End => Ok(vec![(0, EdgeDetails::Unconditional)]), + QueryNode::End => 
Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNode::Deleted | QueryNode::Start => panic!(), } }