diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index f35d024cc..9af41b322 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -40,10 +40,11 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::ranking_rule_graph::{ - EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, ProximityGraph, + EdgeCondition, EdgeConditionsCache, EmptyPathsCache, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; -use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; +use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::Result; pub type Proximity = GraphBasedRankingRule; @@ -78,7 +79,7 @@ pub struct GraphBasedRankingRuleState { /// The current graph graph: RankingRuleGraph, /// Cache to retrieve the docids associated with each edge - edge_docids_cache: EdgeDocidsCache, + edge_conditions_cache: EdgeConditionsCache, /// Cache used to optimistically discard paths that resolve to no documents. empty_paths_cache: EmptyPathsCache, /// A structure giving the list of possible costs from each node to the end node, @@ -94,25 +95,27 @@ pub struct GraphBasedRankingRuleState { fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( ctx: &mut SearchContext<'search>, graph: &mut RankingRuleGraph, - edge_docids_cache: &mut EdgeDocidsCache, + edge_docids_cache: &mut EdgeConditionsCache, universe: &RoaringBitmap, empty_paths_cache: &mut EmptyPathsCache, ) -> Result<()> { for edge_index in 0..graph.edges_store.len() as u16 { - if graph.edges_store[edge_index as usize].is_none() { + let Some(edge) = graph.edges_store[edge_index as usize].as_ref() else { continue; - } - let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?; - match docids { - BitmapOrAllRef::Bitmap(docids) => { + }; + let condition = edge.condition; + + match condition { + EdgeCondition::Unconditional => continue, + EdgeCondition::Conditional(condition) => { + let docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, universe)?; if docids.is_disjoint(universe) { graph.remove_ranking_rule_edge(edge_index); empty_paths_cache.forbid_edge(edge_index); - edge_docids_cache.cache.remove(&edge_index); + edge_docids_cache.cache.remove(&condition); continue; } } - BitmapOrAllRef::All => continue, } } Ok(()) @@ -132,7 +135,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> query_graph: &QueryGraph, ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut edge_docids_cache = EdgeDocidsCache::default(); + let mut edge_docids_cache = EdgeConditionsCache::default(); let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16); // First simplify the graph as much as possible, by computing the docids of the edges @@ -150,7 +153,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> let state = GraphBasedRankingRuleState { graph, - edge_docids_cache, + edge_conditions_cache: edge_docids_cache, empty_paths_cache, all_distances, cur_distance_idx: 0, @@ -174,11 +177,11 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> // should never happen let mut state = self.state.take().unwrap(); - // TODO: does this have a real positive performance cost? + // TODO: does this have a real positive performance impact? remove_empty_edges( ctx, &mut state.graph, - &mut state.edge_docids_cache, + &mut state.edge_conditions_cache, universe, &mut state.empty_paths_cache, )?; @@ -201,17 +204,17 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> let GraphBasedRankingRuleState { graph, - edge_docids_cache, + edge_conditions_cache: edge_docids_cache, empty_paths_cache, all_distances, cur_distance_idx: _, } = &mut state; - let original_universe = universe; + // let original_universe = universe; let mut universe = universe.clone(); // TODO: remove this unnecessary clone - let original_graph = graph.clone(); + // let original_graph = graph.clone(); // and this vector as well let mut paths = vec![]; @@ -241,12 +244,15 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> for &edge_index in path { visited_edges.push(edge_index); - let edge_docids = - edge_docids_cache.get_edge_docids(ctx, edge_index, graph, &universe)?; - let edge_docids = match edge_docids { - BitmapOrAllRef::Bitmap(b) => b, - BitmapOrAllRef::All => continue, + let edge = graph.edges_store[edge_index as usize].as_ref().unwrap(); + let condition = match edge.condition { + EdgeCondition::Unconditional => continue, + EdgeCondition::Conditional(condition) => condition, }; + + let edge_docids = + edge_docids_cache.get_edge_docids(ctx, condition, graph, &universe)?; + cached_edge_docids.push((edge_index, edge_docids.clone())); // If the edge is empty, then the path will be empty as well, we update the graph @@ -257,7 +263,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> // 2. remove this edge from the ranking rule graph graph.remove_ranking_rule_edge(edge_index); // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore - edge_docids_cache.cache.remove(&edge_index); + edge_docids_cache.cache.remove(&condition); return Ok(()); } path_docids &= edge_docids; @@ -279,6 +285,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index); } } + // We should maybe instead try to compute: + // 0th & nth & 1st & n-1th & 2nd & etc... return Ok(()); } } @@ -289,15 +297,15 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> }, )?; - G::log_state( - &original_graph, - &paths, - &state.empty_paths_cache, - original_universe, - &state.all_distances, - cost, - logger, - ); + // G::log_state( + // &original_graph, + // &paths, + // &state.empty_paths_cache, + // original_universe, + // &state.all_distances, + // cost, + // logger, + // ); // TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however, // remove nodes and/or terms within nodes that weren't present in any of the paths. diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index e68f3b949..7edef41c8 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -14,6 +14,21 @@ impl Interned { Self { idx, _phantom: PhantomData } } } + +// TODO: the stable store should be replaced by a bump allocator +// and the interned value should be a pointer wrapper +// then we can get its value with `interned.get()` instead of `interner.get(interned)` +// and as a bonus, its validity is tracked with Rust's lifetime system +// one problem is that we need two lifetimes: one for the bump allocator, one for the +// hashmap +// but that's okay, we can use: +// ``` +// struct Interner<'bump> { +// bump: &'bump Bump, +// lookup: FxHashMap +// } +// ``` + /// An [`Interner`] is used to store a unique copy of a value of type `T`. This value /// is then identified by a lightweight index of type [`Interned`], which can /// be copied, compared, and hashed efficiently. An immutable reference to the original value diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 18c51f4a4..110cfad64 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -35,11 +35,6 @@ use crate::search::new::query_term::located_query_terms_from_string; use crate::search::new::words::Words; use crate::{Filter, Index, Result, TermsMatchingStrategy}; -pub enum BitmapOrAllRef<'s> { - Bitmap(&'s RoaringBitmap), - All, -} - pub struct SearchContext<'search> { pub index: &'search Index, pub txn: &'search RoTxn<'search>, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 49c78a32f..3393f086a 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::Interner; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; @@ -10,6 +11,8 @@ impl RankingRuleGraph { pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. } = &query_graph; + let mut conditions_interner = Interner::default(); + let mut edges_store = vec![]; let mut edges_of_node = vec![]; @@ -21,18 +24,22 @@ impl RankingRuleGraph { for successor_idx in graph_edges[node_idx].successors.iter() { let dest_node = &graph_nodes[successor_idx as usize]; - let edges = - G::build_step_visit_destination_node(ctx, dest_node, &source_node_data)?; + let edges = G::build_step_visit_destination_node( + ctx, + &mut conditions_interner, + dest_node, + &source_node_data, + )?; if edges.is_empty() { continue; } - for (cost, details) in edges { + for (cost, condition) in edges { edges_store.push(Some(Edge { source_node: node_idx as u16, dest_node: successor_idx, cost, - condition: details, + condition, })); new_edges.insert(edges_store.len() as u16 - 1); } @@ -43,6 +50,6 @@ impl RankingRuleGraph { .map(|edges| SmallBitmap::from_iter(edges.into_iter(), edges_store.len() as u16)) .collect(); - Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node }) + Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner }) } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index f7bf1b002..416ededc0 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -3,22 +3,23 @@ use std::marker::PhantomData; use fxhash::FxHashMap; use roaring::RoaringBitmap; -use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::{BitmapOrAllRef, SearchContext}; +use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::Interned; +use crate::search::new::SearchContext; use crate::Result; /// A cache storing the document ids associated with each ranking rule edge -pub struct EdgeDocidsCache { +pub struct EdgeConditionsCache { // TODO: should be FxHashMap, RoaringBitmap> - pub cache: FxHashMap, + pub cache: FxHashMap, RoaringBitmap>, _phantom: PhantomData, } -impl Default for EdgeDocidsCache { +impl Default for EdgeConditionsCache { fn default() -> Self { Self { cache: Default::default(), _phantom: Default::default() } } } -impl EdgeDocidsCache { +impl EdgeConditionsCache { /// Retrieve the document ids for the given edge condition. /// /// If the cache does not yet contain these docids, they are computed @@ -27,30 +28,25 @@ impl EdgeDocidsCache { &'s mut self, ctx: &mut SearchContext<'search>, // TODO: should be Interned - edge_index: u16, + interned_edge_condition: Interned, graph: &RankingRuleGraph, // TODO: maybe universe doesn't belong here universe: &RoaringBitmap, - ) -> Result> { - let edge = graph.edges_store[edge_index as usize].as_ref().unwrap(); - - match &edge.condition { - EdgeCondition::Unconditional => Ok(BitmapOrAllRef::All), - EdgeCondition::Conditional(details) => { - if self.cache.contains_key(&edge_index) { - // TODO: should we update the bitmap in the cache if the new universe - // reduces it? - // TODO: maybe have a generation: u32 to track every time the universe was - // reduced. Then only attempt to recompute the intersection when there is a chance - // that edge_docids & universe changed - return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); - } - // TODO: maybe universe doesn't belong here - let docids = universe & G::resolve_edge_condition(ctx, details, universe)?; - let _ = self.cache.insert(edge_index, docids); - let docids = &self.cache[&edge_index]; - Ok(BitmapOrAllRef::Bitmap(docids)) - } + ) -> Result<&'s RoaringBitmap> { + if self.cache.contains_key(&interned_edge_condition) { + // TODO: should we update the bitmap in the cache if the new universe + // reduces it? + // TODO: maybe have a generation: u32 to track every time the universe was + // reduced. Then only attempt to recompute the intersection when there is a chance + // that edge_docids & universe changed + return Ok(&self.cache[&interned_edge_condition]); } + // TODO: maybe universe doesn't belong here + let edge_condition = graph.conditions_interner.get(interned_edge_condition); + // TODO: faster way to do this? + let docids = universe & G::resolve_edge_condition(ctx, edge_condition, universe)?; + let _ = self.cache.insert(interned_edge_condition, docids); + let docids = &self.cache[&interned_edge_condition]; + Ok(docids) } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 143554c72..8aa29a8b7 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -16,12 +16,15 @@ mod proximity; /// Implementation of the `typo` ranking rule mod typo; -pub use edge_docids_cache::EdgeDocidsCache; +use std::hash::Hash; + +pub use edge_docids_cache::EdgeConditionsCache; pub use empty_paths_cache::EmptyPathsCache; pub use proximity::ProximityGraph; use roaring::RoaringBitmap; pub use typo::TypoGraph; +use super::interner::{Interned, Interner}; use super::logger::SearchLogger; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; @@ -36,10 +39,20 @@ use crate::Result; /// proximity ranking rule, the condition could be that a word is N-close to another one. /// When the edge is traversed, some database operations are executed to retrieve the set /// of documents that satisfy the condition, which reduces the list of candidate document ids. -#[derive(Debug, Clone)] pub enum EdgeCondition { Unconditional, - Conditional(E), + Conditional(Interned), +} + +impl Copy for EdgeCondition {} + +impl Clone for EdgeCondition { + fn clone(&self) -> Self { + match self { + Self::Unconditional => Self::Unconditional, + Self::Conditional(arg0) => Self::Conditional(*arg0), + } + } } /// An edge in the ranking rule graph. @@ -48,7 +61,7 @@ pub enum EdgeCondition { /// 1. The source and destination nodes /// 2. The cost of traversing this edge /// 3. The condition associated with it -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct Edge { pub source_node: u16, pub dest_node: u16, @@ -106,7 +119,7 @@ pub trait RankingRuleGraphTrait: Sized { /// The condition of an edge connecting two query nodes. The condition /// should be sufficient to compute the edge's cost and associated document ids /// in [`resolve_edge_condition`](RankingRuleGraphTrait::resolve_edge_condition). - type EdgeCondition: Sized + Clone; + type EdgeCondition: Sized + Clone + PartialEq + Eq + Hash; /// A structure used in the construction of the graph, created when a /// query graph source node is visited. It is used to determine the cost @@ -138,6 +151,7 @@ pub trait RankingRuleGraphTrait: Sized { /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`. fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( ctx: &mut SearchContext<'search>, + conditions_interner: &mut Interner, dest_node: &QueryNode, source_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>>; @@ -161,16 +175,18 @@ pub struct RankingRuleGraph { pub query_graph: QueryGraph, pub edges_store: Vec>>, pub edges_of_node: Vec, + pub conditions_interner: Interner, } -impl Clone for RankingRuleGraph { - fn clone(&self) -> Self { - Self { - query_graph: self.query_graph.clone(), - edges_store: self.edges_store.clone(), - edges_of_node: self.edges_of_node.clone(), - } - } -} +// impl Clone for RankingRuleGraph { +// fn clone(&self) -> Self { +// Self { +// query_graph: self.query_graph.clone(), +// edges_store: self.edges_store.clone(), +// edges_of_node: self.edges_of_node.clone(), +// conditions_interner: self.conditions_interner.clone(), +// } +// } +// } impl RankingRuleGraph { /// Remove the given edge from the ranking rule graph pub fn remove_ranking_rule_edge(&mut self, edge_index: u16) { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 6caa4a769..192b74faf 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -3,6 +3,7 @@ use std::collections::BTreeMap; use itertools::Itertools; use super::ProximityEdge; +use crate::search::new::interner::Interner; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::search::new::ranking_rule_graph::proximity::WordPair; use crate::search::new::ranking_rule_graph::EdgeCondition; @@ -59,6 +60,7 @@ pub fn visit_from_node( pub fn visit_to_node<'search, 'from_data>( ctx: &mut SearchContext<'search>, + conditions_interner: &mut Interner, to_node: &QueryNode, from_node_data: &'from_data (WordDerivations, i8), ) -> Result)>> { @@ -224,22 +226,23 @@ pub fn visit_to_node<'search, 'from_data>( } } } - let mut new_edges = cost_proximity_word_pairs - .into_iter() - .flat_map(|(cost, proximity_word_pairs)| { - let mut edges = vec![]; - for (proximity, word_pairs) in proximity_word_pairs { - edges.push(( - cost, - EdgeCondition::Conditional(ProximityEdge { - pairs: word_pairs.into_boxed_slice(), - proximity, - }), - )) - } - edges - }) - .collect::>(); + let mut new_edges = + cost_proximity_word_pairs + .into_iter() + .flat_map(|(cost, proximity_word_pairs)| { + let mut edges = vec![]; + for (proximity, word_pairs) in proximity_word_pairs { + edges.push(( + cost, + EdgeCondition::Conditional(conditions_interner.insert(ProximityEdge { + pairs: word_pairs.into_boxed_slice(), + proximity, + })), + )) + } + edges + }) + .collect::>(); new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeCondition::Unconditional)); Ok(new_edges) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index bf07bf21d..822c9531c 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -5,23 +5,21 @@ use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; use super::{EdgeCondition, RankingRuleGraphTrait}; -use crate::search::new::interner::Interned; +use crate::search::new::interner::{Interned, Interner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::WordDerivations; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; -// TODO: intern the proximity edges as well? - -#[derive(Clone)] +#[derive(Clone, PartialEq, Eq, Hash)] pub enum WordPair { Words { left: Interned, right: Interned }, WordPrefix { left: Interned, right_prefix: Interned }, WordPrefixSwapped { left_prefix: Interned, right: Interned }, } -#[derive(Clone)] +#[derive(Clone, PartialEq, Eq, Hash)] pub struct ProximityEdge { pairs: Box<[WordPair]>, proximity: u8, @@ -55,10 +53,11 @@ impl RankingRuleGraphTrait for ProximityGraph { fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( ctx: &mut SearchContext<'search>, + conditions_interner: &mut Interner, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { - build::visit_to_node(ctx, to_node, from_node_data) + build::visit_to_node(ctx, conditions_interner, to_node, from_node_data) } fn log_state( diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 6354909f6..596fbfb64 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -2,14 +2,14 @@ use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::Interned; +use crate::search::new::interner::{Interned, Interner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; -#[derive(Clone)] +#[derive(Clone, PartialEq, Eq, Hash)] pub enum TypoEdge { Phrase { phrase: Interned }, Word { derivations: Interned, nbr_typos: u8 }, @@ -78,15 +78,19 @@ impl RankingRuleGraphTrait for TypoGraph { fn build_step_visit_destination_node<'from_data, 'search: 'from_data>( ctx: &mut SearchContext<'search>, + conditions_interner: &mut Interner, to_node: &QueryNode, _from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { let SearchContext { derivations_interner, .. } = ctx; match to_node { QueryNode::Term(LocatedQueryTerm { value, .. }) => match *value { - QueryTerm::Phrase { phrase } => { - Ok(vec![(0, EdgeCondition::Conditional(TypoEdge::Phrase { phrase }))]) - } + QueryTerm::Phrase { phrase } => Ok(vec![( + 0, + EdgeCondition::Conditional( + conditions_interner.insert(TypoEdge::Phrase { phrase }), + ), + )]), QueryTerm::Word { derivations } => { let mut edges = vec![]; @@ -136,10 +140,12 @@ impl RankingRuleGraphTrait for TypoGraph { if !new_derivations.is_empty() { edges.push(( nbr_typos, - EdgeCondition::Conditional(TypoEdge::Word { - derivations: derivations_interner.insert(new_derivations), - nbr_typos, - }), + EdgeCondition::Conditional(conditions_interner.insert( + TypoEdge::Word { + derivations: derivations_interner.insert(new_derivations), + nbr_typos, + }, + )), )) } }