diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index d51fb6920..ac56b4f20 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -21,7 +21,7 @@ pub struct GraphBasedRankingRuleState { graph: RankingRuleGraph, edge_docids_cache: EdgeDocidsCache, empty_paths_cache: EmptyPathsCache, - all_distances: Vec>, + all_distances: Vec>, cur_distance_idx: usize, } @@ -32,14 +32,14 @@ fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( universe: &RoaringBitmap, empty_paths_cache: &mut EmptyPathsCache, ) -> Result<()> { - for edge_index in 0..graph.all_edges.len() as u32 { + for edge_index in 0..graph.all_edges.len() as u16 { if graph.all_edges[edge_index as usize].is_none() { continue; } let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?; match docids { - BitmapOrAllRef::Bitmap(bitmap) => { - if bitmap.is_disjoint(universe) { + BitmapOrAllRef::Bitmap(docids) => { + if docids.is_disjoint(universe) { graph.remove_edge(edge_index); empty_paths_cache.forbid_edge(edge_index); edge_docids_cache.cache.remove(&edge_index); @@ -68,7 +68,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> // TODO: update old state instead of starting from scratch let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let mut edge_docids_cache = EdgeDocidsCache::default(); - let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len()); + let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16); remove_empty_edges( ctx, @@ -118,31 +118,82 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; state.cur_distance_idx += 1; - let paths = state.graph.paths_of_cost( - state.graph.query_graph.root_node as usize, + let mut bucket = RoaringBitmap::new(); + + let GraphBasedRankingRuleState { + graph, + edge_docids_cache, + empty_paths_cache, + all_distances, + cur_distance_idx: _, + } = &mut state; + + let mut paths = vec![]; + let original_universe = universe; + let mut universe = universe.clone(); + + graph.visit_paths_of_cost( + graph.query_graph.root_node as usize, cost, - &state.all_distances, - &state.empty_paths_cache, - ); + all_distances, + empty_paths_cache, + |path, graph, empty_paths_cache| { + let mut path_docids = universe.clone(); + let mut visited_edges = vec![]; + let mut cached_edge_docids = vec![]; + for &edge_index in path { + visited_edges.push(edge_index); + let edge_docids = + edge_docids_cache.get_edge_docids(ctx, edge_index, graph, &universe)?; + let edge_docids = match edge_docids { + BitmapOrAllRef::Bitmap(b) => b, + BitmapOrAllRef::All => continue, + }; + cached_edge_docids.push((edge_index, edge_docids.clone())); + if edge_docids.is_disjoint(&universe) { + // 1. Store in the cache that this edge is empty for this universe + empty_paths_cache.forbid_edge(edge_index); + // 2. remove this edge from the ranking rule graph + graph.remove_edge(edge_index); + edge_docids_cache.cache.remove(&edge_index); + return Ok(()); + } + path_docids &= edge_docids; + + if path_docids.is_disjoint(&universe) { + empty_paths_cache.forbid_prefix(&visited_edges); + // if the intersection between this edge and any + // previous one is disjoint with the universe, + // then we add these two edges to the empty_path_cache + for (edge_index2, edge_docids2) in + cached_edge_docids[..cached_edge_docids.len() - 1].iter() + { + let intersection = edge_docids & edge_docids2; + if intersection.is_disjoint(&universe) { + // needs_filtering_empty_couple_edges = true; + empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index); + } + } + return Ok(()); + } + } + paths.push(path.to_vec()); + bucket |= &path_docids; + universe -= path_docids; + Ok(()) + }, + )?; G::log_state( &state.graph, &paths, &state.empty_paths_cache, - universe, + original_universe, &state.all_distances, cost, logger, ); - let bucket = state.graph.resolve_paths( - ctx, - &mut state.edge_docids_cache, - &mut state.empty_paths_cache, - universe, - paths, - )?; - let next_query_graph = state.graph.query_graph.clone(); self.state = Some(state); diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 76c3f8977..10b5e7097 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -42,19 +42,19 @@ pub enum SearchEvents { }, ProximityState { graph: RankingRuleGraph, - paths: Vec>, + paths: Vec>, empty_paths_cache: EmptyPathsCache, universe: RoaringBitmap, - distances: Vec>, - cost: u64, + distances: Vec>, + cost: u16, }, TypoState { graph: RankingRuleGraph, - paths: Vec>, + paths: Vec>, empty_paths_cache: EmptyPathsCache, universe: RoaringBitmap, - distances: Vec>, - cost: u64, + distances: Vec>, + cost: u16, }, RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant }, } @@ -165,11 +165,11 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u64,) { + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } - fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u64,) { + fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } @@ -352,7 +352,7 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) { + fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u16], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. }) => { match value { @@ -420,7 +420,7 @@ shape: class").unwrap(); } } } - fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { + fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -477,7 +477,7 @@ shape: class").unwrap(); // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(ctx: &mut SearchContext,graph: &RankingRuleGraph, edge_idx: u32, file: &mut File) { + fn edge_d2_description(ctx: &mut SearchContext,graph: &RankingRuleGraph, edge_idx: u16, file: &mut File) { let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { @@ -511,7 +511,7 @@ shape: class").unwrap(); shape: class }}").unwrap(); } - fn paths_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { + fn paths_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { for (path_idx, edge_indexes) in paths.iter().enumerate() { writeln!(file, "{path_idx} {{").unwrap(); for edge_idx in edge_indexes.iter() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 9a141c1c6..bf78e4de0 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -58,22 +58,22 @@ impl SearchLogger for DefaultSearchLogger { fn log_proximity_state( &mut self, _query_graph: &RankingRuleGraph, - _paths_map: &[Vec], + _paths_map: &[Vec], _empty_paths_cache: &EmptyPathsCache, _universe: &RoaringBitmap, - _distances: Vec>, - _cost: u64, + _distances: Vec>, + _cost: u16, ) { } fn log_typo_state( &mut self, _query_graph: &RankingRuleGraph, - _paths: &[Vec], + _paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, _universe: &RoaringBitmap, - _distances: Vec>, - _cost: u64, + _distances: Vec>, + _cost: u16, ) { } } @@ -120,20 +120,20 @@ pub trait SearchLogger { fn log_proximity_state( &mut self, query_graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - _distances: Vec>, - cost: u64, + distances: Vec>, + cost: u16, ); fn log_typo_state( &mut self, query_graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - _distances: Vec>, - cost: u64, + distances: Vec>, + cost: u16, ); } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 0feef1f60..dc73fe51c 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -7,31 +7,26 @@ mod query_term; mod ranking_rule_graph; mod ranking_rules; mod resolve_query_graph; +mod small_bitmap; mod sort; mod words; -use std::collections::BTreeSet; - -pub use ranking_rules::{ - apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter, - RankingRuleOutputIterWrapper, RankingRuleQueryTrait, -}; - -use crate::{ - new::query_term::located_query_terms_from_string, Filter, Index, Result, TermsMatchingStrategy, -}; +use self::interner::Interner; +use self::logger::SearchLogger; +use self::query_term::Phrase; +use self::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}; +use crate::new::query_term::located_query_terms_from_string; +use crate::{Filter, Index, Result, TermsMatchingStrategy}; use charabia::Tokenize; use db_cache::DatabaseCache; use heed::RoTxn; use query_graph::{QueryGraph, QueryNode}; -use roaring::RoaringBitmap; - -use self::{ - interner::Interner, - logger::SearchLogger, - query_term::Phrase, - resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}, +pub use ranking_rules::{ + apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter, + RankingRuleOutputIterWrapper, RankingRuleQueryTrait, }; +use roaring::RoaringBitmap; +use std::collections::BTreeSet; pub enum BitmapOrAllRef<'s> { Bitmap(&'s RoaringBitmap), @@ -109,7 +104,7 @@ pub fn execute_search<'search>( logger: &mut dyn SearchLogger, ) -> Result> { assert!(!query.is_empty()); - let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None).unwrap(); + let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?; let graph = QueryGraph::from_query(ctx, query_terms)?; logger.initial_query(&graph); @@ -127,7 +122,7 @@ pub fn execute_search<'search>( TermsMatchingStrategy::Last, logger, )?; - // TODO: create ranking rules here, reuse the node docids cache for the words ranking rule + // TODO: create ranking rules here logger.initial_universe(&universe); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 449b6536c..b879b2c15 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,8 +1,7 @@ use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; +use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::Result; -use roaring::RoaringBitmap; -use std::fmt::Debug; #[derive(Clone)] pub enum QueryNode { @@ -12,17 +11,17 @@ pub enum QueryNode { End, } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct Edges { // TODO: use a tiny bitset instead, something like a simple Vec where most queries will see a vector of one element - pub predecessors: RoaringBitmap, - pub successors: RoaringBitmap, + pub predecessors: SmallBitmap, + pub successors: SmallBitmap, } #[derive(Clone)] pub struct QueryGraph { - pub root_node: u32, - pub end_node: u32, + pub root_node: u16, + pub end_node: u16, pub nodes: Vec, pub edges: Vec, } @@ -30,7 +29,7 @@ pub struct QueryGraph { fn _assert_sizes() { // TODO: QueryNodes are too big now, 88B is a bit too big let _: [u8; 88] = [0; std::mem::size_of::()]; - let _: [u8; 48] = [0; std::mem::size_of::()]; + let _: [u8; 32] = [0; std::mem::size_of::()]; } impl Default for QueryGraph { @@ -38,8 +37,8 @@ impl Default for QueryGraph { fn default() -> Self { let nodes = vec![QueryNode::Start, QueryNode::End]; let edges = vec![ - Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }, - Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }, + Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }, + Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }, ]; Self { root_node: 0, end_node: 1, nodes, edges } @@ -47,18 +46,18 @@ impl Default for QueryGraph { } impl QueryGraph { - fn connect_to_node(&mut self, from_nodes: &[u32], to_node: u32) { + fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) { for &from_node in from_nodes { self.edges[from_node as usize].successors.insert(to_node); self.edges[to_node as usize].predecessors.insert(from_node); } } - fn add_node(&mut self, from_nodes: &[u32], node: QueryNode) -> u32 { - let new_node_idx = self.nodes.len() as u32; + fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 { + let new_node_idx = self.nodes.len() as u16; self.nodes.push(node); self.edges.push(Edges { - predecessors: from_nodes.iter().collect(), - successors: RoaringBitmap::new(), + predecessors: SmallBitmap::from_array(from_nodes, 64), + successors: SmallBitmap::new(64), }); for from_node in from_nodes { self.edges[*from_node as usize].successors.insert(new_node_idx); @@ -79,7 +78,7 @@ impl QueryGraph { let word_set = ctx.index.words_fst(ctx.txn)?; let mut graph = QueryGraph::default(); - let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = + let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![graph.root_node]); // TODO: split words / synonyms @@ -157,40 +156,40 @@ impl QueryGraph { Ok(graph) } - pub fn remove_nodes(&mut self, nodes: &[u32]) { + pub fn remove_nodes(&mut self, nodes: &[u16]) { for &node in nodes { self.nodes[node as usize] = QueryNode::Deleted; let edges = self.edges[node as usize].clone(); for pred in edges.predecessors.iter() { self.edges[pred as usize].successors.remove(node); } - for succ in edges.successors { + for succ in edges.successors.iter() { self.edges[succ as usize].predecessors.remove(node); } self.edges[node as usize] = - Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; + Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }; } } - pub fn remove_nodes_keep_edges(&mut self, nodes: &[u32]) { + pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) { for &node in nodes { self.nodes[node as usize] = QueryNode::Deleted; let edges = self.edges[node as usize].clone(); for pred in edges.predecessors.iter() { self.edges[pred as usize].successors.remove(node); - self.edges[pred as usize].successors |= &edges.successors; + self.edges[pred as usize].successors.union(&edges.successors); } - for succ in edges.successors { + for succ in edges.successors.iter() { self.edges[succ as usize].predecessors.remove(node); - self.edges[succ as usize].predecessors |= &edges.predecessors; + self.edges[succ as usize].predecessors.union(&edges.predecessors); } self.edges[node as usize] = - Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() }; + Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) }; } } pub fn remove_words_at_position(&mut self, position: i8) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; for (node_idx, node) in self.nodes.iter().enumerate() { - let node_idx = node_idx as u32; + let node_idx = node_idx as u16; let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue }; if positions.start() == &position { nodes_to_remove_keeping_edges.push(node_idx); @@ -212,7 +211,7 @@ impl QueryGraph { || (!matches!(node, QueryNode::Start | QueryNode::Deleted) && self.edges[node_idx].predecessors.is_empty()) { - nodes_to_remove.push(node_idx as u32); + nodes_to_remove.push(node_idx as u16); } } if nodes_to_remove.is_empty() { diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 03a7f6c9d..261f2909b 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,40 +1,54 @@ +use std::collections::HashSet; + use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::new::small_bitmap::SmallBitmap; use crate::new::{QueryGraph, SearchContext}; use crate::Result; -use roaring::RoaringBitmap; impl RankingRuleGraph { pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { - let mut ranking_rule_graph = - Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] }; + let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. } = &query_graph; - for (node_idx, node) in ranking_rule_graph.query_graph.nodes.iter().enumerate() { - ranking_rule_graph.node_edges.push(RoaringBitmap::new()); - ranking_rule_graph.successors.push(RoaringBitmap::new()); - let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap(); - let new_successors = ranking_rule_graph.successors.last_mut().unwrap(); + let mut all_edges = vec![]; + let mut node_edges = vec![]; + let mut successors = vec![]; + + for (node_idx, node) in graph_nodes.iter().enumerate() { + node_edges.push(HashSet::new()); + successors.push(HashSet::new()); + let new_edges = node_edges.last_mut().unwrap(); + let new_successors = successors.last_mut().unwrap(); let Some(from_node_data) = G::build_visit_from_node(ctx, node)? else { continue }; - for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() { - let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize]; + for successor_idx in graph_edges[node_idx].successors.iter() { + let to_node = &graph_nodes[successor_idx as usize]; let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?; if edges.is_empty() { continue; } edges.sort_by_key(|e| e.0); for (cost, details) in edges { - ranking_rule_graph.all_edges.push(Some(Edge { - from_node: node_idx as u32, + all_edges.push(Some(Edge { + from_node: node_idx as u16, to_node: successor_idx, cost, details, })); - new_edges.insert(ranking_rule_graph.all_edges.len() as u32 - 1); + new_edges.insert(all_edges.len() as u16 - 1); new_successors.insert(successor_idx); } } } - Ok(ranking_rule_graph) + let node_edges = node_edges + .into_iter() + .map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16)) + .collect(); + let successors = successors + .into_iter() + .map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16)) + .collect(); + + Ok(RankingRuleGraph { query_graph, all_edges, node_edges, successors }) } } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 2377f1c84..14afd83d0 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -2,124 +2,146 @@ use super::empty_paths_cache::EmptyPathsCache; use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use crate::new::small_bitmap::SmallBitmap; +use crate::Result; use std::collections::VecDeque; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { - pub edges: Vec, + pub edges: Vec, pub cost: u64, } impl RankingRuleGraph { - pub fn paths_of_cost( - &self, + pub fn visit_paths_of_cost( + &mut self, from: usize, - cost: u64, - all_distances: &[Vec], - empty_paths_cache: &EmptyPathsCache, - ) -> Vec> { - let mut paths = vec![]; - self.paths_of_cost_rec( + cost: u16, + all_distances: &[Vec], + empty_paths_cache: &mut EmptyPathsCache, + mut visit: impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, + ) -> Result<()> { + let _ = self.visit_paths_of_cost_rec( from, - all_distances, cost, - &mut vec![], - &mut paths, - &vec![false; self.all_edges.len()], + all_distances, empty_paths_cache, - ); - paths + &mut visit, + &mut vec![], + &mut SmallBitmap::new(self.all_edges.len() as u16), + empty_paths_cache.empty_edges.clone(), + )?; + Ok(()) } - pub fn paths_of_cost_rec( - &self, + pub fn visit_paths_of_cost_rec( + &mut self, from: usize, - all_distances: &[Vec], - cost: u64, - prev_edges: &mut Vec, - paths: &mut Vec>, - forbidden_edges: &[bool], - empty_paths_cache: &EmptyPathsCache, - ) { - let distances = &all_distances[from]; - if !distances.contains(&cost) { - panic!(); - } - let tos = &self.query_graph.edges[from].successors; - let mut valid_edges = vec![]; - for to in tos { - self.visit_edges::<()>(from as u32, to, |edge_idx, edge| { - if cost >= edge.cost as u64 - && all_distances[to as usize].contains(&(cost - edge.cost as u64)) - && !forbidden_edges[edge_idx as usize] - { - valid_edges.push((edge_idx, edge.cost, to)); - } - std::ops::ControlFlow::Continue(()) - }); - } + cost: u16, + // TODO: replace all_distances with a Vec where the SmallBitmap contains true if the cost exists and false otherwise + all_distances: &[Vec], + empty_paths_cache: &mut EmptyPathsCache, + visit: &mut impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, + // replace prev edges by: + // (1) a small bitmap representing the path + // (2) a pointer within the EmptyPathsCache::forbidden_prefixes structure + prev_edges: &mut Vec, + cur_path: &mut SmallBitmap, + mut forbidden_edges: SmallBitmap, + ) -> Result { + let mut any_valid = false; - for (edge_idx, edge_cost, to) in valid_edges { - prev_edges.push(edge_idx); - if empty_paths_cache.empty_prefixes.contains_prefix_of_path(prev_edges) { + let edges = self.node_edges[from].clone(); + for edge_idx in edges.iter() { + let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue }; + if cost < edge.cost as u16 + || forbidden_edges.contains(edge_idx) + || !all_distances[edge.to_node as usize].contains(&(cost - edge.cost as u16)) + { continue; } - let mut new_forbidden_edges = forbidden_edges.to_vec(); - for edge_idx in empty_paths_cache.empty_couple_edges[edge_idx as usize].iter() { - new_forbidden_edges[*edge_idx as usize] = true; - } - for edge_idx in empty_paths_cache.empty_prefixes.final_edges_ater_prefix(prev_edges) { - new_forbidden_edges[edge_idx as usize] = true; - } + cur_path.insert(edge_idx); + prev_edges.push(edge_idx); - if to == self.query_graph.end_node { - paths.push(prev_edges.clone()); + let mut new_forbidden_edges = forbidden_edges.clone(); + new_forbidden_edges.union(&empty_paths_cache.empty_couple_edges[edge_idx as usize]); + empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| { + new_forbidden_edges.insert(x); + }); + + let next_any_valid = if edge.to_node == self.query_graph.end_node { + any_valid = true; + visit(prev_edges, self, empty_paths_cache)?; + true } else { - self.paths_of_cost_rec( - to as usize, + self.visit_paths_of_cost_rec( + edge.to_node as usize, + cost - edge.cost as u16, all_distances, - cost - edge_cost as u64, - prev_edges, - paths, - &new_forbidden_edges, empty_paths_cache, - ) - } + visit, + prev_edges, + cur_path, + new_forbidden_edges, + )? + }; + any_valid |= next_any_valid; + cur_path.remove(edge_idx); prev_edges.pop(); + if next_any_valid { + if empty_paths_cache.path_is_empty(prev_edges, cur_path) { + return Ok(any_valid); + } + forbidden_edges.union(&empty_paths_cache.empty_edges); + for edge in prev_edges.iter() { + forbidden_edges.union(&empty_paths_cache.empty_couple_edges[*edge as usize]); + } + empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| { + forbidden_edges.insert(x); + }); + } + if next_any_valid && empty_paths_cache.path_is_empty(prev_edges, cur_path) { + return Ok(any_valid); + } } + + Ok(any_valid) } - pub fn initialize_distances_cheapest(&self) -> Vec> { - let mut distances_to_end: Vec> = vec![vec![]; self.query_graph.nodes.len()]; - let mut enqueued = vec![false; self.query_graph.nodes.len()]; + pub fn initialize_distances_cheapest(&self) -> Vec> { + let mut distances_to_end: Vec> = vec![vec![]; self.query_graph.nodes.len()]; + let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len() as u16); let mut node_stack = VecDeque::new(); distances_to_end[self.query_graph.end_node as usize] = vec![0]; + for prev_node in self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() { node_stack.push_back(prev_node as usize); - enqueued[prev_node as usize] = true; + enqueued.insert(prev_node); } while let Some(cur_node) = node_stack.pop_front() { let mut self_distances = vec![]; - for succ_node in self.query_graph.edges[cur_node].successors.iter() { + + let cur_node_edges = &self.node_edges[cur_node]; + for edge_idx in cur_node_edges.iter() { + let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); + let succ_node = edge.to_node; let succ_distances = &distances_to_end[succ_node as usize]; - let _ = self.visit_edges::<()>(cur_node as u32, succ_node, |_, edge| { - for succ_distance in succ_distances { - self_distances.push(edge.cost as u64 + succ_distance); - } - std::ops::ControlFlow::Continue(()) - }); + for succ_distance in succ_distances { + self_distances.push(edge.cost as u16 + succ_distance); + } } + self_distances.sort_unstable(); self_distances.dedup(); distances_to_end[cur_node] = self_distances; for prev_node in self.query_graph.edges[cur_node].predecessors.iter() { - if !enqueued[prev_node as usize] { + if !enqueued.contains(prev_node) { node_stack.push_back(prev_node as usize); - enqueued[prev_node as usize] = true; + enqueued.insert(prev_node); } } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 21c186f3c..13ee03a22 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -11,9 +11,20 @@ use roaring::RoaringBitmap; // computing their hash and comparing them // which can be done... // by using a pointer (real, Rc, bumpalo, or in a vector)??? +// +// But actually.... the edge details' docids are a subset of the universe at the +// moment they were computed. +// But the universes between two iterations of a ranking rule are completely different +// Thus, there is no point in doing this. +// UNLESS... +// we compute the whole docids corresponding to the edge details (potentially expensive in time and memory +// in the common case) +// +// But we could still benefit within a single iteration for requests like: +// `a a a a a a a a a` where we have many of the same edge details, repeated pub struct EdgeDocidsCache { - pub cache: FxHashMap, + pub cache: FxHashMap, _phantom: PhantomData, } impl Default for EdgeDocidsCache { @@ -25,7 +36,7 @@ impl EdgeDocidsCache { pub fn get_edge_docids<'s, 'search>( &'s mut self, ctx: &mut SearchContext<'search>, - edge_index: u32, + edge_index: u16, graph: &RankingRuleGraph, // TODO: maybe universe doesn't belong here universe: &RoaringBitmap, @@ -41,7 +52,7 @@ impl EdgeDocidsCache { return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here - let docids = universe & G::compute_docids(ctx, details)?; + let docids = universe & G::compute_docids(ctx, details, universe)?; let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index bbfe2eedd..3c8fb5184 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,60 +1,48 @@ +use crate::new::small_bitmap::SmallBitmap; + use super::paths_map::PathsMap; #[derive(Clone)] pub struct EmptyPathsCache { - pub empty_edges: Vec, + pub empty_edges: SmallBitmap, pub empty_prefixes: PathsMap<()>, - pub empty_couple_edges: Vec>, + pub empty_couple_edges: Vec, } impl EmptyPathsCache { - pub fn new(all_edges_len: usize) -> Self { + pub fn new(all_edges_len: u16) -> Self { Self { - empty_edges: vec![false; all_edges_len], + empty_edges: SmallBitmap::new(all_edges_len), empty_prefixes: PathsMap::default(), - empty_couple_edges: vec![vec![]; all_edges_len], + empty_couple_edges: vec![SmallBitmap::new(all_edges_len); all_edges_len as usize], } } - pub fn forbid_edge(&mut self, edge_idx: u32) { - self.empty_edges[edge_idx as usize] = true; - self.empty_couple_edges[edge_idx as usize] = vec![]; + pub fn forbid_edge(&mut self, edge_idx: u16) { + self.empty_edges.insert(edge_idx); + self.empty_couple_edges[edge_idx as usize].clear(); self.empty_prefixes.remove_edge(&edge_idx); for edges2 in self.empty_couple_edges.iter_mut() { - if let Some(edge2_pos) = edges2.iter().position(|e| *e == edge_idx) { - edges2.swap_remove(edge2_pos); - } + edges2.remove(edge_idx); } } - pub fn forbid_prefix(&mut self, prefix: &[u32]) { + pub fn forbid_prefix(&mut self, prefix: &[u16]) { self.empty_prefixes.insert(prefix.iter().copied(), ()); } - pub fn forbid_couple_edges(&mut self, edge1: u32, edge2: u32) { - assert!(!self.empty_couple_edges[edge1 as usize].contains(&edge2)); - self.empty_couple_edges[edge1 as usize].push(edge2); + pub fn forbid_couple_edges(&mut self, edge1: u16, edge2: u16) { + self.empty_couple_edges[edge1 as usize].insert(edge2); } - pub fn path_is_empty(&self, path: &[u32]) -> bool { - for edge in path { - if self.empty_edges[*edge as usize] { + pub fn path_is_empty(&self, path: &[u16], path_bitmap: &SmallBitmap) -> bool { + if path_bitmap.intersects(&self.empty_edges) { + return true; + } + for edge in path.iter() { + let forbidden_other_edges = &self.empty_couple_edges[*edge as usize]; + if path_bitmap.intersects(forbidden_other_edges) { return true; } } if self.empty_prefixes.contains_prefix_of_path(path) { return true; } - for (edge1, edges2) in self.empty_couple_edges.iter().enumerate() { - if let Some(pos_edge1) = path.iter().position(|e| *e == edge1 as u32) { - if path[pos_edge1..].iter().any(|e| edges2.contains(e)) { - return true; - } - } - } - // for (edge1, edge2) in self.empty_couple_edges.iter() { - // if path.contains(edge1) && path.contains(edge2) { - // return true; - // } - // } - // if self.empty_prefixes.contains_prefix_of_path(path) { - // return true; - // } false } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 446c4e248..989986159 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -4,17 +4,16 @@ mod edge_docids_cache; mod empty_paths_cache; mod paths_map; mod proximity; -mod resolve_paths; mod typo; use super::logger::SearchLogger; +use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::Result; pub use edge_docids_cache::EdgeDocidsCache; pub use empty_paths_cache::EmptyPathsCache; pub use proximity::ProximityGraph; use roaring::RoaringBitmap; -use std::ops::ControlFlow; pub use typo::TypoGraph; #[derive(Debug, Clone)] @@ -25,15 +24,15 @@ pub enum EdgeDetails { #[derive(Debug, Clone)] pub struct Edge { - pub from_node: u32, - pub to_node: u32, + pub from_node: u16, + pub to_node: u16, pub cost: u8, pub details: EdgeDetails, } #[derive(Debug, Clone)] pub struct EdgePointer<'graph, E> { - pub index: u32, + pub index: u16, pub edge: &'graph Edge, } @@ -95,6 +94,7 @@ pub trait RankingRuleGraphTrait: Sized { fn compute_docids<'search>( ctx: &mut SearchContext<'search>, edge_details: &Self::EdgeDetails, + universe: &RoaringBitmap, ) -> Result; /// Prepare to build the edges outgoing from `from_node`. @@ -116,11 +116,11 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], - cost: u64, + distances: &[Vec], + cost: u16, logger: &mut dyn SearchLogger, ); } @@ -130,9 +130,9 @@ pub struct RankingRuleGraph { // pub edges: Vec>>>, pub all_edges: Vec>>, - pub node_edges: Vec, + pub node_edges: Vec, - pub successors: Vec, + pub successors: Vec, // TODO: to get the edges between two nodes: // 1. get node_outgoing_edges[from] // 2. get node_incoming_edges[to] @@ -149,29 +149,7 @@ impl Clone for RankingRuleGraph { } } impl RankingRuleGraph { - // Visit all edges between the two given nodes in order of increasing cost. - pub fn visit_edges<'graph, O>( - &'graph self, - from: u32, - to: u32, - mut visit: impl FnMut(u32, &'graph Edge) -> ControlFlow, - ) -> Option { - let from_edges = &self.node_edges[from as usize]; - for edge_idx in from_edges { - let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); - if edge.to_node == to { - let cf = visit(edge_idx, edge); - match cf { - ControlFlow::Continue(_) => continue, - ControlFlow::Break(o) => return Some(o), - } - } - } - - None - } - - pub fn remove_edge(&mut self, edge_index: u32) { + pub fn remove_edge(&mut self, edge_index: u16) { let edge_opt = &mut self.all_edges[edge_index as usize]; let Some(edge) = &edge_opt else { return }; let (from_node, _to_node) = (edge.from_node, edge.to_node); @@ -180,9 +158,10 @@ impl RankingRuleGraph { let from_node_edges = &mut self.node_edges[from_node as usize]; from_node_edges.remove(edge_index); - let mut new_successors_from_node = RoaringBitmap::new(); + let mut new_successors_from_node = SmallBitmap::new(self.all_edges.len() as u16); + let all_edges = &self.all_edges; for from_node_edge in from_node_edges.iter() { - let Edge { to_node, .. } = &self.all_edges[from_node_edge as usize].as_ref().unwrap(); + let Edge { to_node, .. } = &all_edges[from_node_edge as usize].as_ref().unwrap(); new_successors_from_node.insert(*to_node); } self.successors[from_node as usize] = new_successors_from_node; diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index b9d089efc..0cce9c93f 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -1,9 +1,4 @@ - - - - -use roaring::RoaringBitmap; - +use crate::new::small_bitmap::SmallBitmap; use super::cheapest_paths::Path; // What is PathsMap used for? @@ -13,7 +8,7 @@ use super::cheapest_paths::Path; #[derive(Debug, Clone)] pub struct PathsMap { - pub nodes: Vec<(u32, PathsMap)>, + pub nodes: Vec<(u16, PathsMap)>, pub value: Option, } impl Default for PathsMap { @@ -39,7 +34,7 @@ impl PathsMap { self.nodes.is_empty() && self.value.is_none() } - pub fn insert(&mut self, mut edges: impl Iterator, value: V) { + pub fn insert(&mut self, mut edges: impl Iterator, value: V) { match edges.next() { None => { self.value = Some(value); @@ -57,7 +52,7 @@ impl PathsMap { } } } - fn remove_first_rec(&mut self, cur: &mut Vec) -> (bool, V) { + fn remove_first_rec(&mut self, cur: &mut Vec) -> (bool, V) { let Some((first_edge, rest)) = self.nodes.first_mut() else { // The PathsMap has to be correct by construction here, otherwise // the unwrap() will crash @@ -72,7 +67,7 @@ impl PathsMap { (false, value) } } - pub fn remove_first(&mut self) -> Option<(Vec, V)> { + pub fn remove_first(&mut self) -> Option<(Vec, V)> { if self.is_empty() { return None; } @@ -81,7 +76,7 @@ impl PathsMap { let (_, value) = self.remove_first_rec(&mut result); Some((result, value)) } - pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { + pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { if let Some(value) = &self.value { visit(cur, value); } @@ -91,7 +86,7 @@ impl PathsMap { cur.pop(); } } - pub fn iterate(&self, mut visit: impl FnMut(&Vec, &V)) { + pub fn iterate(&self, mut visit: impl FnMut(&Vec, &V)) { self.iterate_rec(&mut vec![], &mut visit) } @@ -100,7 +95,7 @@ impl PathsMap { self.remove_prefix(prefix); }); } - pub fn remove_edges(&mut self, forbidden_edges: &RoaringBitmap) { + pub fn remove_edges(&mut self, forbidden_edges: &SmallBitmap) { let mut i = 0; while i < self.nodes.len() { let should_remove = if forbidden_edges.contains(self.nodes[i].0) { @@ -118,7 +113,7 @@ impl PathsMap { } } } - pub fn remove_edge(&mut self, forbidden_edge: &u32) { + pub fn remove_edge(&mut self, forbidden_edge: &u16) { let mut i = 0; while i < self.nodes.len() { let should_remove = if &self.nodes[i].0 == forbidden_edge { @@ -136,7 +131,7 @@ impl PathsMap { } } } - pub fn remove_prefix(&mut self, forbidden_prefix: &[u32]) { + pub fn remove_prefix(&mut self, forbidden_prefix: &[u16]) { let [first_edge, remaining_prefix @ ..] = forbidden_prefix else { self.nodes.clear(); self.value = None; @@ -160,25 +155,23 @@ impl PathsMap { } } - pub fn final_edges_ater_prefix(&self, prefix: &[u32]) -> Vec { + pub fn final_edges_after_prefix(&self, prefix: &[u16], visit: &mut impl FnMut(u16)) { let [first_edge, remaining_prefix @ ..] = prefix else { - return self.nodes.iter().filter_map(|n| { - if n.1.value.is_some() { - Some(n.0) - } else { - None + for node in self.nodes.iter() { + if node.1.value.is_some() { + visit(node.0) } - }).collect(); + } + return }; for (edge, rest) in self.nodes.iter() { if edge == first_edge { - return rest.final_edges_ater_prefix(remaining_prefix); + return rest.final_edges_after_prefix(remaining_prefix, visit); } } - vec![] } - pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec { + pub fn edge_indices_after_prefix(&self, prefix: &[u16]) -> Vec { let [first_edge, remaining_prefix @ ..] = prefix else { return self.nodes.iter().map(|n| n.0).collect(); }; @@ -190,7 +183,7 @@ impl PathsMap { vec![] } - pub fn contains_prefix_of_path(&self, path: &[u32]) -> bool { + pub fn contains_prefix_of_path(&self, path: &[u16]) -> bool { if self.value.is_some() { return true; } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 4603c7ea0..e0bc1f5e4 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -111,6 +111,8 @@ pub fn visit_to_node<'search, 'from_data>( for word1 in derivations1.clone() { for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; + // TODO: if we had access to the universe here, we could already check whether + // the bitmap corresponding to this word pair is disjoint with the universe or not if ctx .get_word_prefix_pair_proximity_docids( word1, @@ -183,8 +185,13 @@ pub fn visit_to_node<'search, 'from_data>( .flat_map(|(cost, proximity_word_pairs)| { let mut edges = vec![]; for (proximity, word_pairs) in proximity_word_pairs { - edges - .push((cost, EdgeDetails::Data(ProximityEdge { pairs: word_pairs, proximity }))) + edges.push(( + cost, + EdgeDetails::Data(ProximityEdge { + pairs: word_pairs.into_boxed_slice(), + proximity, + }), + )) } edges }) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index df289fb2c..94a46d670 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,15 @@ use super::{ProximityEdge, WordPair}; use crate::new::SearchContext; use crate::{CboRoaringBitmapCodec, Result}; -use roaring::{MultiOps, RoaringBitmap}; +use roaring::RoaringBitmap; pub fn compute_docids<'search>( ctx: &mut SearchContext<'search>, edge: &ProximityEdge, + universe: &RoaringBitmap, ) -> Result { let ProximityEdge { pairs, proximity } = edge; - let mut pair_docids = vec![]; + let mut pair_docids = RoaringBitmap::new(); for pair in pairs.iter() { let bytes = match pair { WordPair::Words { left, right } => { @@ -21,10 +22,11 @@ pub fn compute_docids<'search>( ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity) } }?; - let bitmap = - bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); - pair_docids.push(bitmap); + // TODO: deserialize bitmap within a universe, and (maybe) using a bump allocator? + let bitmap = universe + & bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); + pair_docids |= bitmap; } - let docids = MultiOps::union(pair_docids); - Ok(docids) + + Ok(pair_docids) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index ec1a7b5fa..6c95b0805 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -10,7 +10,7 @@ use crate::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; use roaring::RoaringBitmap; -// TODO: intern the strings, refer to them by their pointer? +// TODO: intern the proximity edges as well? #[derive(Clone)] pub enum WordPair { @@ -21,8 +21,7 @@ pub enum WordPair { #[derive(Clone)] pub struct ProximityEdge { - // TODO: use a list of pointers to the word pairs instead? - pairs: Vec, + pairs: Box<[WordPair]>, proximity: u8, } @@ -40,8 +39,9 @@ impl RankingRuleGraphTrait for ProximityGraph { fn compute_docids<'search>( ctx: &mut SearchContext<'search>, edge: &Self::EdgeDetails, + universe: &RoaringBitmap, ) -> Result { - compute_docids::compute_docids(ctx, edge) + compute_docids::compute_docids(ctx, edge, universe) } fn build_visit_from_node<'search>( @@ -61,11 +61,11 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &super::RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], - cost: u64, + distances: &[Vec], + cost: u16, logger: &mut dyn SearchLogger, ) { logger.log_proximity_state( diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs deleted file mode 100644 index b3e03d555..000000000 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ /dev/null @@ -1,97 +0,0 @@ -#![allow(clippy::too_many_arguments)] - -use super::edge_docids_cache::EdgeDocidsCache; -use super::empty_paths_cache::EmptyPathsCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::{BitmapOrAllRef, SearchContext}; -use crate::Result; -use roaring::{MultiOps, RoaringBitmap}; - -impl RankingRuleGraph { - // TODO: reduce the universe after computing each path - // TODO: deserialize roaring bitmap within a universe - pub fn resolve_paths<'search>( - &mut self, - ctx: &mut SearchContext<'search>, - edge_docids_cache: &mut EdgeDocidsCache, - empty_paths_cache: &mut EmptyPathsCache, - universe: &RoaringBitmap, - mut paths: Vec>, - ) -> Result { - paths.sort_unstable(); - // let mut needs_filtering_empty_edges = false; - // let mut needs_filtering_empty_prefix = false; - // let mut needs_filtering_empty_couple_edges = false; - let mut needs_filtering = false; - let mut path_bitmaps = vec![]; - 'path_loop: loop { - // TODO: distinguish between empty_edges, empty_prefix, and empty_couple_edges filtering - if needs_filtering { - for path in paths.iter_mut() { - if empty_paths_cache.path_is_empty(path) { - path.clear(); - } - } - needs_filtering = false; - } - let Some(edge_indexes) = paths.pop() else { - break; - }; - - if edge_indexes.is_empty() { - continue; - } - - let mut path_bitmap = universe.clone(); - let mut visited_edges = vec![]; - let mut cached_edge_docids = vec![]; - 'edge_loop: for edge_index in edge_indexes { - visited_edges.push(edge_index); - let edge_docids = - edge_docids_cache.get_edge_docids(ctx, edge_index, self, universe)?; - match edge_docids { - BitmapOrAllRef::Bitmap(edge_docids) => { - cached_edge_docids.push((edge_index, edge_docids.clone())); - let (_, edge_docids) = cached_edge_docids.last().unwrap(); - if edge_docids.is_disjoint(universe) { - // 1. Store in the cache that this edge is empty for this universe - empty_paths_cache.forbid_edge(edge_index); - // 2. remove this edge from the proximity graph - self.remove_edge(edge_index); - edge_docids_cache.cache.remove(&edge_index); - needs_filtering = true; - // needs_filtering_empty_edges = true; - // 3. continue executing this function again on the remaining paths - continue 'path_loop; - } else { - path_bitmap &= edge_docids; - if path_bitmap.is_disjoint(universe) { - // needs_filtering_empty_prefix = true; - needs_filtering = true; - empty_paths_cache.forbid_prefix(&visited_edges); - // if the intersection between this edge and any - // previous one is disjoint with the universe, - // then we add these two edges to the empty_path_cache - for (edge_index2, edge_docids2) in - cached_edge_docids[..cached_edge_docids.len() - 1].iter() - { - let intersection = edge_docids & edge_docids2; - if intersection.is_disjoint(universe) { - // needs_filtering_empty_couple_edges = true; - empty_paths_cache - .forbid_couple_edges(*edge_index2, edge_index); - } - } - continue 'path_loop; - } - } - } - BitmapOrAllRef::All => continue 'edge_loop, - } - } - path_bitmaps.push(path_bitmap); - } - - Ok(MultiOps::union(path_bitmaps)) - } -} diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index d3771221f..c510c4851 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -31,6 +31,7 @@ impl RankingRuleGraphTrait for TypoGraph { fn compute_docids<'db_cache, 'search>( ctx: &mut SearchContext<'search>, edge: &Self::EdgeDetails, + universe: &RoaringBitmap, ) -> Result { match edge { TypoEdge::Phrase { phrase } => resolve_phrase(ctx, *phrase), @@ -44,14 +45,17 @@ impl RankingRuleGraphTrait for TypoGraph { let mut docids = RoaringBitmap::new(); for word in words.iter().copied() { let Some(bytes) = ctx.get_word_docids(word)? else { continue }; - let bitmap = - RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; + // TODO: deserialize bitmap within a universe + let bitmap = universe + & RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; docids |= bitmap; } if *nbr_typos == 0 { if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? { - let bitmap = - RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; + // TODO: deserialize bitmap within a universe + let bitmap = universe + & RoaringBitmapCodec::bytes_decode(bytes) + .ok_or(heed::Error::Decoding)?; docids |= bitmap; } } @@ -116,11 +120,11 @@ impl RankingRuleGraphTrait for TypoGraph { fn log_state( graph: &RankingRuleGraph, - paths: &[Vec], + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], - cost: u64, + distances: &[Vec], + cost: u16, logger: &mut dyn SearchLogger, ) { logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost); diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index bfb9b5492..b65ff6d1a 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -262,46 +262,48 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - // loop { - let start = Instant::now(); + loop { + let start = Instant::now(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + "which a the releases from poison by the government", + None, + 0, + 20, + &mut DefaultSearchLogger, + // &mut logger, + ) + .unwrap(); - let results = execute_search( - &mut SearchContext::new(&index, &txn), - "releases from poison by the government", - None, - 0, - 20, - &mut DefaultSearchLogger, - // &mut logger, - ) - .unwrap(); + // logger.write_d2_description(&mut ctx); - // logger.write_d2_description(); + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); - let elapsed = start.elapsed(); + let _documents = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); - let documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); - - println!("{}us: {:?}", elapsed.as_micros(), results); - for (id, document) in documents { - println!("{id}:"); - println!("{document}"); + println!("{}us: {:?}", elapsed.as_micros(), results); } + // for (id, _document) in documents { + // println!("{id}:"); + // // println!("{document}"); + // } } #[test] @@ -342,9 +344,9 @@ mod tests { .collect::>(); println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); - for (id, document) in documents { + for (id, _document) in documents { println!("{id}:"); - println!("{document}"); + // println!("{document}"); } } #[test] @@ -360,7 +362,7 @@ mod tests { // loop { let start = Instant::now(); - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, @@ -368,12 +370,12 @@ mod tests { None, 0, 20, - // &mut DefaultSearchLogger, - &mut logger, + &mut DefaultSearchLogger, + // &mut logger, ) .unwrap(); - logger.write_d2_description(&mut ctx); + // logger.write_d2_description(&mut ctx); let elapsed = start.elapsed(); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index de5cf02ab..4fa0912e1 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -1,5 +1,6 @@ use super::interner::Interned; use super::query_term::{Phrase, QueryTerm, WordDerivations}; +use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; use fxhash::FxHashMap; @@ -10,13 +11,13 @@ use std::collections::VecDeque; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. #[derive(Default)] pub struct NodeDocIdsCache { - pub cache: FxHashMap, + pub cache: FxHashMap, } impl<'search> SearchContext<'search> { fn get_node_docids<'cache>( &'cache mut self, term: &QueryTerm, - node_idx: u32, + node_idx: u16, ) -> Result<&'cache RoaringBitmap> { if self.node_docids_cache.cache.contains_key(&node_idx) { return Ok(&self.node_docids_cache.cache[&node_idx]); @@ -76,7 +77,7 @@ pub fn resolve_query_graph<'search>( // TODO: there is definitely a faster way to compute this big // roaring bitmap expression - let mut nodes_resolved = RoaringBitmap::new(); + let mut nodes_resolved = SmallBitmap::new(64); let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; let mut next_nodes_to_visit = VecDeque::new(); @@ -89,8 +90,10 @@ pub fn resolve_query_graph<'search>( continue; } // Take union of all predecessors - let predecessors_iter = predecessors.iter().map(|p| &path_nodes_docids[p as usize]); - let predecessors_docids = MultiOps::union(predecessors_iter); + let mut predecessors_docids = RoaringBitmap::new(); + for p in predecessors.iter() { + predecessors_docids |= &path_nodes_docids[p as usize]; + } let n = &q.nodes[node as usize]; diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs new file mode 100644 index 000000000..f7adecee0 --- /dev/null +++ b/milli/src/search/new/small_bitmap.rs @@ -0,0 +1,271 @@ +// #[macro_export] +// macro_rules! iter_bitmap { +// ($bitmap:expr, $id:lifetime, $p:pat, $body:block) => { +// match $bitmap { +// SmallBitmap::Tiny(mut set) => { +// while set > 0 { +// let $p = set.trailing_zeros() as u16; +// $body; +// set &= set - 1; +// } +// } +// SmallBitmap::Small(sets) => { +// let mut base = 0; +// for set in sets.iter() { +// let mut set = *set; +// while set > 0 { +// let idx = set.trailing_zeros() as u16; +// let $p = idx + base; +// set &= set - 1; +// $body; +// } +// base += 64; +// } +// } +// } +// }; +// } + +#[derive(Clone)] +pub enum SmallBitmap { + Tiny(u64), + Small(Box<[u64]>), +} +impl SmallBitmap { + pub fn new(universe_length: u16) -> Self { + if universe_length <= 64 { + Self::Tiny(0) + } else { + Self::Small(vec![0; 1 + universe_length as usize / 64].into_boxed_slice()) + } + } + pub fn from_iter(xs: impl Iterator, universe_length: u16) -> Self { + let mut s = Self::new(universe_length); + for x in xs { + s.insert(x); + } + s + } + pub fn from_array(xs: &[u16], universe_length: u16) -> Self { + let mut s = Self::new(universe_length); + for x in xs { + s.insert(*x); + } + s + } + pub fn is_empty(&self) -> bool { + match self { + SmallBitmap::Tiny(set) => *set == 0, + SmallBitmap::Small(sets) => { + for set in sets.iter() { + if *set != 0 { + return false; + } + } + true + } + } + } + pub fn clear(&mut self) { + match self { + SmallBitmap::Tiny(set) => *set = 0, + SmallBitmap::Small(sets) => { + for set in sets.iter_mut() { + *set = 0; + } + } + } + } + pub fn contains(&self, mut x: u16) -> bool { + let set = match self { + SmallBitmap::Tiny(set) => *set, + SmallBitmap::Small(set) => { + let idx = x / 64; + x %= 64; + set[idx as usize] + } + }; + set & 0b1 << x != 0 + } + pub fn insert(&mut self, mut x: u16) { + let set = match self { + SmallBitmap::Tiny(set) => set, + SmallBitmap::Small(set) => { + let idx = x / 64; + x %= 64; + &mut set[idx as usize] + } + }; + *set |= 0b1 << x; + } + pub fn remove(&mut self, mut x: u16) { + let set = match self { + SmallBitmap::Tiny(set) => set, + SmallBitmap::Small(set) => { + let idx = x / 64; + x %= 64; + &mut set[idx as usize] + } + }; + *set &= !(0b1 << x); + } + // fn iter_single(mut set: u64, mut visit: impl FnMut(u16) -> Result<()>) -> Result<()> { + // while set > 0 { + // let idx = set.trailing_zeros() as u16; + // visit(idx)?; + // set &= set - 1; + // } + // Ok(()) + // } + // pub fn iter(&self, mut visit: impl FnMut(u16) -> Result<()>) -> Result<()> { + // match self { + // SmallBitmap::Tiny(set) => Self::iter_single(*set, &mut visit), + // SmallBitmap::Small(sets) => { + // let mut base = 0; + // for set in sets.iter() { + // Self::iter_single(*set, |x| visit(base + x))?; + // base += 64; + // } + // Ok(()) + // } + // } + // } + + pub fn intersection(&mut self, other: &SmallBitmap) { + self.apply_op(other, |a, b| *a &= b); + } + pub fn union(&mut self, other: &SmallBitmap) { + self.apply_op(other, |a, b| *a |= b); + } + pub fn subtract(&mut self, other: &SmallBitmap) { + self.apply_op(other, |a, b| *a &= !b); + } + + pub fn apply_op(&mut self, other: &SmallBitmap, op: impl Fn(&mut u64, u64)) { + match (self, other) { + (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(a, *b), + (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + assert!(a.len() == b.len(),); + for (a, b) in a.iter_mut().zip(b.iter()) { + op(a, *b); + } + } + _ => { + panic!(); + } + } + } + pub fn all_satisfy_op(&self, other: &SmallBitmap, op: impl Fn(u64, u64) -> bool) -> bool { + match (self, other) { + (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(*a, *b), + (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + assert!(a.len() == b.len()); + for (a, b) in a.iter().zip(b.iter()) { + if !op(*a, *b) { + return false; + } + } + true + } + _ => { + panic!(); + } + } + } + pub fn any_satisfy_op(&self, other: &SmallBitmap, op: impl Fn(u64, u64) -> bool) -> bool { + match (self, other) { + (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(*a, *b), + (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + assert!(a.len() == b.len()); + for (a, b) in a.iter().zip(b.iter()) { + if op(*a, *b) { + return true; + } + } + false + } + _ => { + panic!(); + } + } + } + pub fn is_subset(&self, other: &SmallBitmap) -> bool { + self.all_satisfy_op(other, |a, b| a & !b == 0) + } + pub fn intersects(&self, other: &SmallBitmap) -> bool { + self.any_satisfy_op(other, |a, b| a & b != 0) + } + pub fn iter(&self) -> SmallBitmapIter<'_> { + match self { + SmallBitmap::Tiny(x) => SmallBitmapIter::Tiny(*x), + SmallBitmap::Small(xs) => { + SmallBitmapIter::Small { cur: xs[0], next: &xs[1..], base: 0 } + } + } + } +} + +pub enum SmallBitmapIter<'b> { + Tiny(u64), + Small { cur: u64, next: &'b [u64], base: u16 }, +} +impl<'b> Iterator for SmallBitmapIter<'b> { + type Item = u16; + + fn next(&mut self) -> Option { + match self { + SmallBitmapIter::Tiny(set) => { + if *set > 0 { + let idx = set.trailing_zeros() as u16; + *set &= *set - 1; + Some(idx) + } else { + None + } + } + SmallBitmapIter::Small { cur, next, base } => { + if *cur > 0 { + let idx = cur.trailing_zeros() as u16; + *cur &= *cur - 1; + Some(idx + *base) + } else if next.is_empty() { + return None; + } else { + *base += 64; + *cur = next[0]; + *next = &next[1..]; + self.next() + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::SmallBitmap; + + #[test] + fn test_small_bitmap() { + let mut bitmap1 = SmallBitmap::new(32); + for x in 0..16 { + bitmap1.insert(x * 2); + } + let mut bitmap2 = SmallBitmap::new(32); + for x in 0..=10 { + bitmap2.insert(x * 3); + } + bitmap1.intersection(&bitmap2); + // println!("{}", bitmap.contains(12)); + // bitmap1 + // .iter(|x| { + // println!("{x}"); + // Ok(()) + // }) + // .unwrap(); + + // iter_bitmap!(bitmap1, 'loop1, x, { + // println!("{x}"); + // }) + } +}