diff --git a/milli/examples/index.rs b/milli/examples/index.rs index 18f82797b..781440b56 100644 --- a/milli/examples/index.rs +++ b/milli/examples/index.rs @@ -6,7 +6,7 @@ use std::path::Path; use heed::EnvOpenOptions; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{Criterion, Index, Object}; +use milli::{Index, Object}; fn usage(error: &str, program_name: &str) -> String { format!( @@ -52,18 +52,10 @@ fn main() -> Result<(), Box> { let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); builder.set_filterable_fields(filterable_fields); - builder.set_criteria(vec![ - Criterion::Words, - Criterion::Typo, - Criterion::Proximity, - Criterion::Attribute, - ]); builder.execute(|_| (), || false).unwrap(); let config = IndexerConfig::default(); - let mut indexing_config = IndexDocumentsConfig::default(); - - indexing_config.autogenerate_docids = true; + let indexing_config = IndexDocumentsConfig::default(); let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); diff --git a/milli/examples/settings.rs b/milli/examples/settings.rs index bb24969cc..c7f4780cb 100644 --- a/milli/examples/settings.rs +++ b/milli/examples/settings.rs @@ -24,8 +24,8 @@ fn main() { Criterion::Typo, Criterion::Proximity, Criterion::Attribute, + Criterion::Sort, Criterion::Exactness, - // Criterion::Asc("release_date".to_owned()), ]); builder.execute(|_| (), || false).unwrap(); diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index ec0116fae..5144a0a28 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -72,7 +72,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( /// Update the universes accordingly and inform the logger. macro_rules! back { () => { - assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); + assert!( + ranking_rule_universes[cur_ranking_rule_index].is_empty(), + "The ranking rule {} did not sort its bucket exhaustively", + ranking_rules[cur_ranking_rule_index].id() + ); logger.end_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index].as_ref(), diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 0d22b5b1e..f5918517b 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -36,6 +36,7 @@ That is we find the documents where either: - OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by` */ +use std::collections::BTreeSet; use std::ops::ControlFlow; use roaring::RoaringBitmap; @@ -99,6 +100,8 @@ impl GraphBasedRankingRule { } } +static mut COUNT_PATHS: usize = 0; + /// The internal state of a graph-based ranking rule during iteration pub struct GraphBasedRankingRuleState { /// The current graph @@ -110,7 +113,7 @@ pub struct GraphBasedRankingRuleState { /// A structure giving the list of possible costs from each node to the end node all_costs: MappedInterner>, /// An index in the first element of `all_distances`, giving the cost of the next bucket - cur_distance_idx: usize, + cur_cost: u64, } impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule { @@ -160,7 +163,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase conditions_cache: condition_docids_cache, dead_ends_cache, all_costs, - cur_distance_idx: 0, + cur_cost: 0, }; self.state = Some(state); @@ -181,16 +184,16 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // should never happen let mut state = self.state.take().unwrap(); - // If the cur_distance_idx does not point to a valid cost in the `all_distances` - // structure, then we have computed all the buckets and can return. - if state.cur_distance_idx >= state.all_costs.get(state.graph.query_graph.root_node).len() { - self.state = None; - return Ok(None); - } - // Retrieve the cost of the paths to compute - let cost = state.all_costs.get(state.graph.query_graph.root_node)[state.cur_distance_idx]; - state.cur_distance_idx += 1; + let Some(&cost) = state + .all_costs + .get(state.graph.query_graph.root_node) + .iter() + .find(|c| **c >= state.cur_cost) else { + self.state = None; + return Ok(None); + }; + state.cur_cost = cost + 1; let mut bucket = RoaringBitmap::new(); @@ -199,7 +202,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase conditions_cache: condition_docids_cache, dead_ends_cache, all_costs, - cur_distance_idx: _, + cur_cost: _, } = &mut state; let mut universe = universe.clone(); @@ -216,9 +219,34 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // the number of future candidate paths given by that same function. let mut subpaths_docids: Vec<(Interned, RoaringBitmap)> = vec![]; + let mut at_least_one = false; + // unsafe { + // if COUNT_PATHS >= 1489 && COUNT_PATHS < 1491 { + // println!("COUNT_PATHS {COUNT_PATHS} COST {cost}, NODES {COUNT_VISITED_NODES}, UNIVERSE {}", universe.len()); + // // let all_costs = all_costs.get(graph.query_graph.root_node); + // // println!("{all_costs:?}"); + // dead_ends_cache.debug_print(0); + // println!("{universe:?}"); + + // println!("=================="); + // } + // } + let mut nodes_with_removed_outgoing_conditions = BTreeSet::new(); let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache); + visitor.visit_paths(&mut |path, graph, dead_ends_cache| { + unsafe { + COUNT_PATHS += 1; + } + // if self.id == "position" { + // at_least_one = true; + // print!("."); + // } + // if self.id == "fid" { + at_least_one = true; + // print!("!"); + // } considered_paths.push(path.to_vec()); // If the universe is empty, stop exploring the graph, since no docids will ever be found anymore. if universe.is_empty() { @@ -243,7 +271,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase }; // Then for the remaining of the path, we continue computing docids. for latest_condition in path[idx_of_first_different_condition..].iter().copied() { - // The visit_path_condition will stop let success = visit_path_condition( ctx, graph, @@ -251,6 +278,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase dead_ends_cache, condition_docids_cache, &mut subpaths_docids, + &mut nodes_with_removed_outgoing_conditions, latest_condition, )?; if !success { @@ -281,7 +309,11 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase Ok(ControlFlow::Continue(())) } })?; - + // if at_least_one { + // unsafe { + // println!("\n===== {id} COST: {cost} ==== PATHS: {COUNT_PATHS} ==== NODES: {COUNT_VISITED_NODES} ===== UNIVERSE: {universe}", id=self.id, universe=universe.len()); + // } + // } logger.log_internal_state(graph); logger.log_internal_state(&good_paths); @@ -305,6 +337,10 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase let next_query_graph = QueryGraph::build_from_paths(paths); + if !nodes_with_removed_outgoing_conditions.is_empty() { + graph.update_all_costs_before_nodes(&nodes_with_removed_outgoing_conditions, all_costs); + } + self.state = Some(state); Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket })) @@ -321,6 +357,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase /// Returns false if the intersection between the condition /// docids and the previous path docids is empty. +#[allow(clippy::too_many_arguments)] fn visit_path_condition( ctx: &mut SearchContext, graph: &mut RankingRuleGraph, @@ -328,6 +365,7 @@ fn visit_path_condition( dead_ends_cache: &mut DeadEndsCache, condition_docids_cache: &mut ConditionDocIdsCache, subpath: &mut Vec<(Interned, RoaringBitmap)>, + nodes_with_removed_outgoing_conditions: &mut BTreeSet>, latest_condition: Interned, ) -> Result { let condition_docids = &condition_docids_cache @@ -337,7 +375,8 @@ fn visit_path_condition( // 1. Store in the cache that this edge is empty for this universe dead_ends_cache.forbid_condition(latest_condition); // 2. remove all the edges with this condition from the ranking rule graph - graph.remove_edges_with_condition(latest_condition); + let source_nodes = graph.remove_edges_with_condition(latest_condition); + nodes_with_removed_outgoing_conditions.extend(source_nodes); return Ok(false); } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 443ab0ec4..4a104df69 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -100,16 +100,21 @@ impl VisitorState { let ControlFlow::Continue(next_any_valid) = cf else { return Ok(ControlFlow::Break(())); }; + any_valid |= next_any_valid; if next_any_valid { + // backtrack as much as possible if a valid path was found and the dead_ends_cache + // was updated such that the current prefix is now invalid self.forbidden_conditions = ctx .dead_ends_cache .forbidden_conditions_for_all_prefixes_up_to(self.path.iter().copied()); if self.visited_conditions.intersects(&self.forbidden_conditions) { - break; + return Ok(ControlFlow::Continue(true)); } } - any_valid |= next_any_valid; } + // if there wasn't any valid path from this node to the end node, then + // this node is a dead end **for this specific cost**. + // we could encode this in the dead-ends cache Ok(ControlFlow::Continue(any_valid)) } @@ -117,7 +122,7 @@ impl VisitorState { fn visit_no_condition( &mut self, dest_node: Interned, - edge_forbidden_nodes: &SmallBitmap, + edge_new_nodes_to_skip: &SmallBitmap, visit: VisitFn, ctx: &mut VisitorContext, ) -> Result> { @@ -137,7 +142,7 @@ impl VisitorState { } } else { let old_fbct = self.forbidden_conditions_to_nodes.clone(); - self.forbidden_conditions_to_nodes.union(edge_forbidden_nodes); + self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip); let cf = self.visit_node(dest_node, visit, ctx)?; self.forbidden_conditions_to_nodes = old_fbct; Ok(cf) @@ -147,14 +152,14 @@ impl VisitorState { &mut self, condition: Interned, dest_node: Interned, - edge_forbidden_nodes: &SmallBitmap, + edge_new_nodes_to_skip: &SmallBitmap, visit: VisitFn, ctx: &mut VisitorContext, ) -> Result> { assert!(dest_node != ctx.graph.query_graph.end_node); if self.forbidden_conditions_to_nodes.contains(dest_node) - || edge_forbidden_nodes.intersects(&self.visited_nodes) + || edge_new_nodes_to_skip.intersects(&self.visited_nodes) { return Ok(ControlFlow::Continue(false)); } @@ -162,11 +167,13 @@ impl VisitorState { return Ok(ControlFlow::Continue(false)); } - if ctx + // Checking that from the destination node, there is at least + // one cost that we can visit that corresponds to our remaining budget. + if !ctx .all_costs_from_node .get(dest_node) .iter() - .all(|next_cost| *next_cost != self.remaining_cost) + .any(|next_cost| *next_cost == self.remaining_cost) { return Ok(ControlFlow::Continue(false)); } @@ -182,7 +189,7 @@ impl VisitorState { self.forbidden_conditions.union(&next_forbidden); } let old_fctn = self.forbidden_conditions_to_nodes.clone(); - self.forbidden_conditions_to_nodes.union(edge_forbidden_nodes); + self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip); let cf = self.visit_node(dest_node, visit, ctx)?; @@ -212,22 +219,21 @@ impl RankingRuleGraph { } while let Some(cur_node) = node_stack.pop_front() { - let mut self_costs = BTreeSet::::new(); + let mut self_costs = Vec::::new(); let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); let succ_node = edge.dest_node; let succ_costs = costs_to_end.get(succ_node); - for succ_distance in succ_costs { - self_costs.insert(edge.cost as u64 + succ_distance); + for succ_cost in succ_costs { + self_costs.push(edge.cost as u64 + succ_cost); } } - let costs_to_end_cur_node = costs_to_end.get_mut(cur_node); - for cost in self_costs.iter() { - costs_to_end_cur_node.push(*cost); - } - *costs_to_end.get_mut(cur_node) = self_costs.into_iter().collect(); + self_costs.sort_unstable(); + self_costs.dedup(); + + *costs_to_end.get_mut(cur_node) = self_costs; for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { if !enqueued.contains(prev_node) { node_stack.push_back(prev_node); @@ -237,4 +243,56 @@ impl RankingRuleGraph { } costs_to_end } + + pub fn update_all_costs_before_nodes( + &self, + removed_nodes: &BTreeSet>, + costs: &mut MappedInterner>, + ) { + // unsafe { + // FIND_ALL_COSTS_INC_COUNT += 1; + // println!( + // "update_all_costs_after_removing_edge incrementally count: {}", + // FIND_ALL_COSTS_INC_COUNT + // ); + // } + + let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); + let mut node_stack = VecDeque::new(); + + for node in removed_nodes.iter() { + enqueued.insert(*node); + node_stack.push_back(*node); + } + + while let Some(cur_node) = node_stack.pop_front() { + let mut self_costs = BTreeSet::::new(); + + let cur_node_edges = &self.edges_of_node.get(cur_node); + for edge_idx in cur_node_edges.iter() { + let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); + let succ_node = edge.dest_node; + let succ_costs = costs.get(succ_node); + for succ_distance in succ_costs { + self_costs.insert(edge.cost as u64 + succ_distance); + } + } + let costs_to_end_cur_node = costs.get_mut(cur_node); + for cost in self_costs.iter() { + costs_to_end_cur_node.push(*cost); + } + let self_costs = self_costs.into_iter().collect::>(); + if &self_costs == costs.get(cur_node) { + continue; + } + *costs.get_mut(cur_node) = self_costs; + + for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { + if !enqueued.contains(prev_node) { + node_stack.push_back(prev_node); + enqueued.insert(prev_node); + } + } + } + } } diff --git a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs index f3bb25d56..4bbf91fcd 100644 --- a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs @@ -88,4 +88,12 @@ impl DeadEndsCache { } } } + + // pub fn debug_print(&self, indent: usize) { + // println!("{} {:?}", " ".repeat(indent), self.forbidden.iter().collect::>()); + // for (condition, next) in self.conditions.iter().zip(self.next.iter()) { + // println!("{} {condition}:", " ".repeat(indent)); + // next.debug_print(indent + 2); + // } + // } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index db65afdd7..f60c481de 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -10,10 +10,10 @@ mod cheapest_paths; mod condition_docids_cache; mod dead_ends_cache; -/// Implementation of the `attribute` ranking rule -mod fid; /// Implementation of the `exactness` ranking rule mod exactness; +/// Implementation of the `attribute` ranking rule +mod fid; /// Implementation of the `position` ranking rule mod position; /// Implementation of the `proximity` ranking rule @@ -21,13 +21,14 @@ mod proximity; /// Implementation of the `typo` ranking rule mod typo; +use std::collections::BTreeSet; use std::hash::Hash; -pub use fid::{FidCondition, FidGraph}; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; pub use exactness::{ExactnessCondition, ExactnessGraph}; +pub use fid::{FidCondition, FidGraph}; pub use position::{PositionCondition, PositionGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; @@ -130,7 +131,12 @@ impl Clone for RankingRuleGraph { } impl RankingRuleGraph { /// Remove all edges with the given condition - pub fn remove_edges_with_condition(&mut self, condition_to_remove: Interned) { + /// Return a set of all the source nodes of the removed edges + pub fn remove_edges_with_condition( + &mut self, + condition_to_remove: Interned, + ) -> BTreeSet> { + let mut source_nodes = BTreeSet::new(); for (edge_id, edge_opt) in self.edges_store.iter_mut() { let Some(edge) = edge_opt.as_mut() else { continue }; let Some(condition) = edge.condition else { continue }; @@ -139,7 +145,9 @@ impl RankingRuleGraph { let (source_node, _dest_node) = (edge.source_node, edge.dest_node); *edge_opt = None; self.edges_of_node.get_mut(source_node).remove(edge_id); + source_nodes.insert(source_node); } } + source_nodes } } diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index a8e3f3916..d3b9ac1d1 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -74,7 +74,7 @@ impl RankingRuleGraphTrait for PositionGraph { let mut edges = vec![]; for position in all_positions { - let cost = { + let sum_positions = { let mut cost = 0; for i in 0..term.term_ids.len() { // This is actually not fully correct and slightly penalises ngrams unfairly. @@ -89,7 +89,7 @@ impl RankingRuleGraphTrait for PositionGraph { // TODO: We can improve performances and relevancy by storing // the term subsets associated to each position fetched. edges.push(( - cost, + cost_from_sum_positions(sum_positions), conditions_interner.insert(PositionCondition { term: term.clone(), // TODO remove this ugly clone position, @@ -100,3 +100,26 @@ impl RankingRuleGraphTrait for PositionGraph { Ok(edges) } } + +fn cost_from_sum_positions(sum_positions: u32) -> u32 { + match sum_positions { + 0 | 1 | 2 | 3 => sum_positions, + 4 | 5 => 4, + 6 | 7 => 5, + 8 | 9 => 6, + 10 | 11 => 7, + 12 | 13 => 8, + 14 | 15 => 9, + 16 | 17..=24 => 10, + 25..=32 => 11, + 33..=64 => 12, + 65..=128 => 13, + 129..=256 => 14, + 257..=512 => 15, + 513..=1024 => 16, + 1025..=2048 => 17, + 2049..=4096 => 18, + 4097..=8192 => 19, + _ => 20, + } +}