diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index a9bb31682..a466714e3 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -71,30 +71,29 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); - let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else { + let Some(mut cheapest_paths_state) = state.cheapest_paths_state.take() else { return Ok(None); }; let mut paths = PathsMap::default(); - if let Some(next_cheapest_paths_state) = cheapest_paths_state - .compute_paths_of_next_lowest_cost( - &mut state.graph, - &state.empty_paths_cache, - &mut paths, - ) - { - state.cheapest_paths_state = Some(next_cheapest_paths_state); - } else { - state.cheapest_paths_state = None; + while paths.is_empty() { + if let Some(next_cheapest_paths_state) = cheapest_paths_state + .compute_paths_of_next_lowest_cost( + &mut state.graph, + &state.empty_paths_cache, + &mut paths, + ) + { + cheapest_paths_state = next_cheapest_paths_state; + } else { + self.state = None; + return Ok(None); + } } + state.cheapest_paths_state = Some(cheapest_paths_state); - if paths.is_empty() { - self.state = None; - return Ok(None); - } - - G::log_state(&state.graph, &paths, logger); + G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger); let bucket = state.graph.resolve_paths( index, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index b3f5bbcce..b59b30e6e 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,6 +6,7 @@ use std::{io::Write, path::PathBuf}; use crate::new::QueryNode; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::ranking_rule_graph::{ paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph, @@ -36,6 +37,7 @@ pub enum SearchEvents { ProximityState { graph: RankingRuleGraph, paths: PathsMap, + empty_paths_cache: EmptyPathsCache, }, } @@ -107,16 +109,16 @@ impl SearchLogger for DetailedSearchLogger { universe: universe.clone(), }) } - fn add_to_results(&mut self, docids: &RoaringBitmap) { - self.events.push(SearchEvents::ExtendResults { new: docids.clone() }); + fn add_to_results(&mut self, docids: &mut dyn Iterator) { + self.events.push(SearchEvents::ExtendResults { new: docids.collect() }); } fn log_words_state(&mut self, query_graph: &QueryGraph) { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap,) { - self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone() }) + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, empty_paths_cache: &EmptyPathsCache) { + self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) } @@ -224,13 +226,13 @@ results.{random} {{ link: \"{id}.d2.svg\" }}").unwrap(); }, - SearchEvents::ProximityState { graph, paths } => { + SearchEvents::ProximityState { graph, paths, empty_paths_cache } => { let cur_ranking_rule = timestamp.len() - 1; let cur_activated_id = activated_id(×tamp); let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let mut new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::proximity_graph_d2_description(graph, paths, &mut new_file); + Self::proximity_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); writeln!( &mut file, "{id} {{ @@ -288,7 +290,7 @@ shape: class").unwrap(); } } } - fn proximity_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, file: &mut File) { + fn proximity_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -322,35 +324,48 @@ shape: class").unwrap(); writeln!(file, "Shortest Paths {{").unwrap(); Self::paths_d2_description(graph, "", paths, file); writeln!(file, "}}").unwrap(); - } - fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { + writeln!(file, "Empty Path Prefixes {{").unwrap(); + Self::paths_d2_description(graph, "", &empty_paths_cache.empty_prefixes, file); + writeln!(file, "}}").unwrap(); + + writeln!(file, "Removed Edges {{").unwrap(); + for edge_idx in empty_paths_cache.empty_edges.iter() { + writeln!(file, "{edge_idx}").unwrap(); + } + writeln!(file, "}}").unwrap(); + } + fn edge_d2_description(graph: &RankingRuleGraph, paths_idx: &str, edge_idx: u32, file: &mut File) -> String { + let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; + let from_node = &graph.query_graph.nodes[*from_node as usize]; + let from_node_desc = match from_node { + QueryNode::Term(term) => match &term.value { + QueryTerm::Phrase(_) => todo!(), + QueryTerm::Word { derivations } => derivations.original.clone(), + }, + QueryNode::Deleted => panic!(), + QueryNode::Start => "START".to_owned(), + QueryNode::End => "END".to_owned(), + }; + let to_node = &graph.query_graph.nodes[*to_node as usize]; + let to_node_desc = match to_node { + QueryNode::Term(term) => match &term.value { + QueryTerm::Phrase(_) => todo!(), + QueryTerm::Word { derivations } => derivations.original.clone(), + }, + QueryNode::Deleted => panic!(), + QueryNode::Start => "START".to_owned(), + QueryNode::End => "END".to_owned(), + }; + let edge_id = format!("{paths_idx}{edge_idx}"); + writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + shape: class + }}").unwrap(); + edge_id + } + fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { for (edge_idx, rest) in paths.nodes.iter() { - let Edge { from_node, to_node, cost, .. } = graph.all_edges[*edge_idx as usize].as_ref().unwrap() ; - let from_node = &graph.query_graph.nodes[*from_node as usize]; - let from_node_desc = match from_node { - QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase(_) => todo!(), - QueryTerm::Word { derivations } => derivations.original.clone(), - }, - QueryNode::Deleted => panic!(), - QueryNode::Start => "START".to_owned(), - QueryNode::End => "END".to_owned(), - }; - let to_node = &graph.query_graph.nodes[*to_node as usize]; - let to_node_desc = match to_node { - QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase(_) => todo!(), - QueryTerm::Word { derivations } => derivations.original.clone(), - }, - QueryNode::Deleted => panic!(), - QueryNode::Start => "START".to_owned(), - QueryNode::End => "END".to_owned(), - }; - let edge_id = format!("{paths_idx}{edge_idx}"); - writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ - shape: class - }}").unwrap(); + let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file); for (dest_edge_idx, _) in rest.nodes.iter() { let dest_edge_id = format!("{paths_idx}{edge_idx}{dest_edge_idx}"); writeln!(file, "{edge_id} -> {dest_edge_id}").unwrap(); diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index ccafc7f11..d1a94f7e5 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -5,7 +5,10 @@ use roaring::RoaringBitmap; use super::{ query_graph, - ranking_rule_graph::{paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph}, + ranking_rule_graph::{ + empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph, + RankingRuleGraph, + }, QueryGraph, RankingRule, RankingRuleQueryTrait, }; @@ -41,7 +44,7 @@ impl SearchLogger for DefaultSearchLogger { ) { } - fn add_to_results(&mut self, docids: &RoaringBitmap) {} + fn add_to_results(&mut self, docids: &mut dyn Iterator) {} fn log_words_state(&mut self, query_graph: &Q) {} @@ -49,6 +52,7 @@ impl SearchLogger for DefaultSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, + empty_paths_cache: &EmptyPathsCache, ) { } } @@ -78,7 +82,7 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, ); - fn add_to_results(&mut self, docids: &RoaringBitmap); + fn add_to_results(&mut self, docids: &mut dyn Iterator); fn log_words_state(&mut self, query_graph: &Q); @@ -86,5 +90,6 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &PathsMap, + empty_paths_cache: &EmptyPathsCache, ); } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 759780200..e58950c98 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -81,7 +81,9 @@ impl KCheapestPathsState { while self.kth_cheapest_path.cost <= cur_cost { if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) { self = next_self; - if self.kth_cheapest_path.cost == cur_cost { + if self.kth_cheapest_path.cost == cur_cost + && !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) + { into_map.add_path(&self.kth_cheapest_path); } else { break; @@ -148,7 +150,13 @@ impl KCheapestPathsState { while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() { assert_eq!(cost, cost2); - if empty_paths_cache.path_is_empty(&next_cheapest_path) { + // NOTE: it is important not to discard the paths that are forbidden due to a + // forbidden prefix, because the cheapest path algorithm (Dijkstra) cannot take + // this property into account. + if next_cheapest_path + .iter() + .any(|edge_index| graph.all_edges[*edge_index as usize].is_none()) + { continue; } else { self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost); diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index 5748dce3c..d8d645092 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -4,12 +4,16 @@ use roaring::RoaringBitmap; use super::paths_map::PathsMap; -#[derive(Default)] +#[derive(Default, Clone)] pub struct EmptyPathsCache { pub empty_edges: RoaringBitmap, pub empty_prefixes: PathsMap<()>, } impl EmptyPathsCache { + pub fn forbid_edge(&mut self, edge_idx: u32) { + self.empty_edges.insert(edge_idx); + self.empty_prefixes.remove_edge(&edge_idx); + } pub fn path_is_empty(&self, path: &[u32]) -> bool { for edge in path { if self.empty_edges.contains(*edge) { diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index e677be1d9..d939b6923 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -11,6 +11,7 @@ use std::ops::ControlFlow; use heed::RoTxn; use roaring::RoaringBitmap; +use self::empty_paths_cache::EmptyPathsCache; use self::paths_map::PathsMap; use super::db_cache::DatabaseCache; @@ -82,6 +83,7 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, paths: &PathsMap, + empty_paths_cache: &EmptyPathsCache, logger: &mut dyn SearchLogger, ); } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 66e6bad98..c823cbf9c 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -3,6 +3,7 @@ pub mod compute_docids; use heed::RoTxn; +use super::empty_paths_cache::EmptyPathsCache; use super::paths_map::PathsMap; use super::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; @@ -67,8 +68,9 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &super::RankingRuleGraph, paths: &PathsMap, + empty_paths_cache: &EmptyPathsCache, logger: &mut dyn crate::new::logger::SearchLogger, ) { - logger.log_proximity_state(graph, paths); + logger.log_proximity_state(graph, paths, empty_paths_cache); } } diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index d21ddcd86..1a97dc485 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -40,7 +40,7 @@ impl RankingRuleGraph { BitmapOrAllRef::Bitmap(edge_docids) => { if edge_docids.is_disjoint(universe) { // 1. Store in the cache that this edge is empty for this universe - empty_paths_cache.empty_edges.insert(edge_index); + empty_paths_cache.forbid_edge(edge_index); // 2. remove all the paths that contain this edge for this universe paths.remove_edge(&edge_index); // 3. remove this edge from the proximity graph diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 74ded6d97..70682a561 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -139,6 +139,7 @@ pub fn execute_search<'transaction>( candidates[0] = universe.clone(); let mut cur_ranking_rule_index = 0; + macro_rules! back { () => { logger.end_iteration_ranking_rule( @@ -157,13 +158,20 @@ pub fn execute_search<'transaction>( } let mut results = vec![]; + macro_rules! add_to_results { + ($candidates:expr) => { + logger.add_to_results(&mut $candidates.iter().take(20 - results.len())); + let iter = $candidates.iter().take(20 - results.len()); + results.extend(iter); + }; + } + // TODO: skip buckets when we want to start from an offset while results.len() < 20 { // The universe for this bucket is zero or one element, so we don't need to sort // anything, just extend the results and go back to the parent ranking rule. if candidates[cur_ranking_rule_index].len() <= 1 { - logger.add_to_results(&candidates[cur_ranking_rule_index]); - results.extend(&candidates[cur_ranking_rule_index]); + add_to_results!(candidates[cur_ranking_rule_index]); back!(); continue; } @@ -183,15 +191,12 @@ pub fn execute_search<'transaction>( if next_bucket.candidates.len() <= 1 { // Only zero or one candidate, no need to sort through the child ranking rule. - logger.add_to_results(&next_bucket.candidates); - results.extend(next_bucket.candidates); + add_to_results!(next_bucket.candidates); continue; } else { // many candidates, give to next ranking rule, if any if cur_ranking_rule_index == ranking_rules_len - 1 { - // TODO: don't extend too much, up to the limit only - logger.add_to_results(&next_bucket.candidates); - results.extend(next_bucket.candidates); + add_to_results!(next_bucket.candidates); } else { cur_ranking_rule_index += 1; candidates[cur_ranking_rule_index] = next_bucket.candidates.clone(); @@ -313,8 +318,7 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "the sun flower is facing the su") - .unwrap(); + make_query_graph(&index, &txn, &mut db_cache, "a a a a a a a a a a").unwrap(); // TODO: filters + maybe distinct attributes? let universe = get_start_universe(