diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ade6ee8bd..199221c7c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -70,7 +70,9 @@ pub mod update; #[macro_use] pub mod snapshot_tests; -pub use search::new::{execute_search, SearchContext}; +pub use search::new::DetailedSearchLogger; + +pub use search::new::{execute_search, DefaultSearchLogger, SearchContext}; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index bbe4099bc..edac0b15c 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -45,8 +45,8 @@ use super::interner::MappedInterner; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - ConditionDocIdsCache, DeadEndPathCache, ProximityGraph, RankingRuleGraph, - RankingRuleGraphTrait, TypoGraph, + ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, + TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -87,7 +87,7 @@ pub struct GraphBasedRankingRuleState { /// Cache to retrieve the docids associated with each edge conditions_cache: ConditionDocIdsCache, /// Cache used to optimistically discard paths that resolve to no documents. - dead_end_path_cache: DeadEndPathCache, + dead_end_path_cache: DeadEndsCache, /// A structure giving the list of possible costs from each node to the end node, /// along with a set of unavoidable edges that must be traversed to achieve that distance. all_distances: MappedInterner)>, QueryNode>, @@ -103,7 +103,7 @@ fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( graph: &mut RankingRuleGraph, condition_docids_cache: &mut ConditionDocIdsCache, universe: &RoaringBitmap, - dead_end_path_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndsCache, ) -> Result<()> { for edge_id in graph.edges_store.indexes() { let Some(edge) = graph.edges_store.get(edge_id).as_ref() else { @@ -113,9 +113,9 @@ fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( let docids = condition_docids_cache.get_condition_docids(ctx, condition, graph, universe)?; - if docids.is_disjoint(universe) { + if docids.is_empty() { graph.remove_edges_with_condition(condition); - dead_end_path_cache.add_condition(condition); + dead_end_path_cache.forbid_condition(condition); // add_condition(condition); condition_docids_cache.cache.remove(&condition); continue; } @@ -135,8 +135,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase query_graph: &QueryGraph, ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut condition_docids_cache = ConditionDocIdsCache::default(); - let mut dead_end_path_cache = DeadEndPathCache::new(&graph.conditions_interner); + let mut condition_docids_cache = ConditionDocIdsCache::new(universe); + let mut dead_end_path_cache = DeadEndsCache::new(&graph.conditions_interner); // First simplify the graph as much as possible, by computing the docids of all the conditions // within the rule's universe and removing the edges that have no associated docids. @@ -230,62 +230,79 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase graph.query_graph.root_node, cost, all_distances, + dead_end_path_cache.forbidden.clone(), + |condition, forbidden_conditions| {}, dead_end_path_cache, |path, graph, dead_end_path_cache| { // Accumulate the path for logging purposes only paths.push(path.to_vec()); + let mut path_docids = universe.clone(); // We store the edges and their docids in vectors in case the path turns out to be // empty and we need to figure out why it was empty. let mut visited_conditions = vec![]; let mut cached_condition_docids = vec![]; - // graph.conditions_interner.map(|_| RoaringBitmap::new()); - for &condition in path { - visited_conditions.push(condition); + for &latest_condition in path { + visited_conditions.push(latest_condition); - let condition_docids = condition_docids_cache - .get_condition_docids(ctx, condition, graph, &universe)?; + let condition_docids = condition_docids_cache.get_condition_docids( + ctx, + latest_condition, + graph, + &universe, + )?; - cached_condition_docids.push((condition, condition_docids.clone())); // .get_mut(condition) = condition_docids.clone(); + cached_condition_docids.push((latest_condition, condition_docids.clone())); // If the edge is empty, then the path will be empty as well, we update the graph // and caches accordingly and skip to the next candidate path. if condition_docids.is_disjoint(&universe) { // 1. Store in the cache that this edge is empty for this universe - dead_end_path_cache.add_condition(condition); - // 2. remove this edge from the ranking rule graph - // ouch, no! :( need to link a condition to one or more ranking rule edges - graph.remove_edges_with_condition(condition); + dead_end_path_cache.forbid_condition(latest_condition); + // 2. remove all the edges with this condition from the ranking rule graph + graph.remove_edges_with_condition(latest_condition); // 3. Also remove the entry from the condition_docids_cache, since we don't need it anymore - condition_docids_cache.cache.remove(&condition); + condition_docids_cache.cache.remove(&latest_condition); return Ok(ControlFlow::Continue(())); } - path_docids &= condition_docids; - // If the (sub)path is empty, we try to figure out why and update the caches accordingly. - if path_docids.is_disjoint(&universe) { + if path_docids.is_disjoint(condition_docids) { // First, we know that this path is empty, and thus any path // that is a superset of it will also be empty. - dead_end_path_cache.add_prefix(&visited_conditions); + dead_end_path_cache.forbid_condition_after_prefix( + visited_conditions[..visited_conditions.len() - 1].iter().copied(), + latest_condition, + ); + + let mut dead_end_cache_cursor = dead_end_path_cache; + // Second, if the intersection between this edge and any - // previous one is disjoint with the universe, - // then we also know that any path containing the same couple of - // edges will also be empty. - for (past_condition, condition_docids2) in cached_condition_docids.iter() { - if *past_condition == condition { + // previous prefix is disjoint with the universe, then... TODO + for (past_condition, past_condition_docids) in + cached_condition_docids.iter() + { + // TODO: should ensure that it is simply not possible to have twice + // the same condition in the cached_condition_docids. Maybe it is + // already the case? + dead_end_cache_cursor = + dead_end_cache_cursor.advance(*past_condition).unwrap(); + // TODO: check how that interacts with the dead end cache? + if *past_condition == latest_condition { + // TODO: should we break instead? + // Is it even possible? continue; }; - let intersection = condition_docids & condition_docids2; - if intersection.is_disjoint(&universe) { - dead_end_path_cache - .add_condition_couple(*past_condition, condition); + if condition_docids.is_disjoint(past_condition_docids) { + dead_end_cache_cursor.forbid_condition(latest_condition); } } // We should maybe instead try to compute: // 0th & nth & 1st & n-1th & 2nd & etc... return Ok(ControlFlow::Continue(())); + } else { + path_docids &= condition_docids; } } assert!(!path_docids.is_empty()); @@ -303,7 +320,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase } }, )?; - + // println!(" {} paths of cost {} in {}", paths.len(), cost, self.id); G::log_state( &original_graph, &paths, diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index 3b5544b68..ea8b987fd 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -152,7 +152,7 @@ impl Hash for Interned { } } -impl Ord for Interned { +impl Ord for Interned { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.idx.cmp(&other.idx) } diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 1ed992c56..2b5d31781 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -3,7 +3,7 @@ use std::io::Write; use std::path::PathBuf; use std::time::Instant; -use rand::random; +// use rand::random; use roaring::RoaringBitmap; use crate::search::new::interner::{Interned, MappedInterner}; @@ -323,12 +323,11 @@ impl DetailedSearchLogger { let cur_activated_id = activated_id(×tamp); let docids = new.iter().collect::>(); let len = new.len(); - let random = random::(); writeln!( &mut file, - "{cur_ranking_rule}.{cur_activated_id} -> results.{random} : \"add {len}\" -results.{random} {{ + "{cur_ranking_rule}.{cur_activated_id} -> results.{cur_ranking_rule}{cur_activated_id} : \"add {len}\" +results.{cur_ranking_rule}{cur_activated_id} {{ tooltip: \"{docids:?}\" style {{ fill: \"#B6E2D3\" @@ -572,17 +571,17 @@ shape: class" Self::paths_d2_description(ctx, graph, paths, file); writeln!(file, "}}").unwrap(); - writeln!(file, "Dead-end couples of conditions {{").unwrap(); - for (i, (e1, e2)) in dead_end_paths_cache.condition_couples.iter().enumerate() { - writeln!(file, "{i} : \"\" {{").unwrap(); - Self::condition_d2_description(ctx, graph, e1, file); - for e2 in e2.iter() { - Self::condition_d2_description(ctx, graph, e2, file); - writeln!(file, "{e1} -- {e2}").unwrap(); - } - writeln!(file, "}}").unwrap(); - } - writeln!(file, "}}").unwrap(); + // writeln!(file, "Dead-end couples of conditions {{").unwrap(); + // for (i, (e1, e2)) in dead_end_paths_cache.condition_couples.iter().enumerate() { + // writeln!(file, "{i} : \"\" {{").unwrap(); + // Self::condition_d2_description(ctx, graph, e1, file); + // for e2 in e2.iter() { + // Self::condition_d2_description(ctx, graph, e2, file); + // writeln!(file, "{e1} -- {e2}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); writeln!(file, "Dead-end edges {{").unwrap(); for condition in dead_end_paths_cache.conditions.iter() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index ba58d049f..f8ab89cbf 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,4 +1,4 @@ -#[cfg(test)] +// #[cfg(test)] pub mod detailed; use roaring::RoaringBitmap; @@ -6,8 +6,7 @@ use roaring::RoaringBitmap; use super::interner::{Interned, MappedInterner}; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - DeadEndPathCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoCondition, - TypoGraph, + DeadEndsCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoCondition, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{RankingRule, RankingRuleQueryTrait}; @@ -67,7 +66,7 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -78,7 +77,7 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, @@ -138,7 +137,7 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths_map: &[Vec>], - _dead_end_path_cache: &DeadEndPathCache, + _dead_end_path_cache: &DeadEndsCache, _universe: &RoaringBitmap, _distances: &MappedInterner)>, QueryNode>, _cost: u16, @@ -149,7 +148,7 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths: &[Vec>], - _dead_end_path_cache: &DeadEndPathCache, + _dead_end_path_cache: &DeadEndsCache, _universe: &RoaringBitmap, _distances: &MappedInterner)>, QueryNode>, _cost: u16, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 8ff832de4..29a6020aa 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -15,26 +15,26 @@ mod sort; // TODO: documentation + comments mod words; +// #[cfg(test)] +pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use std::collections::{BTreeSet, HashSet}; +use crate::{Filter, Index, MatchingWords, Result, Search, SearchResult, TermsMatchingStrategy}; use charabia::Tokenize; use db_cache::DatabaseCache; +use graph_based_ranking_rule::{Proximity, Typo}; use heed::RoTxn; -use query_graph::{QueryGraph, QueryNode}; -pub use ranking_rules::{bucket_sort, RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; +use interner::DedupInterner; +use query_graph::{QueryGraph, QueryNode, QueryNodeData}; +use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; +use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; use roaring::RoaringBitmap; +use words::Words; -use self::interner::DedupInterner; -use self::query_graph::QueryNodeData; -use self::query_term::{Phrase, QueryTerm}; -use self::ranking_rules::PlaceholderQuery; -use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; -use crate::search::new::graph_based_ranking_rule::{Proximity, Typo}; -use crate::search::new::query_term::located_query_terms_from_string; -use crate::search::new::words::Words; -use crate::{Filter, Index, Result, TermsMatchingStrategy}; +use self::ranking_rules::RankingRule; /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { @@ -231,12 +231,12 @@ pub fn execute_search<'ctx>( length: usize, placeholder_search_logger: &mut dyn SearchLogger, query_graph_logger: &mut dyn SearchLogger, -) -> Result> { +) -> Result { assert!(!query.is_empty()); let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?; let graph = QueryGraph::from_query(ctx, query_terms)?; - let universe = if let Some(filters) = filters { + let mut universe = if let Some(filters) = filters { filters.evaluate(ctx.txn, ctx.index)? } else { ctx.index.documents_ids(ctx.txn)? @@ -249,8 +249,8 @@ pub fn execute_search<'ctx>( // But in that case, we should return no results. // // The search is a placeholder search only if there are no tokens? - if graph.nodes.len() > 2 { - let universe = resolve_maximally_reduced_query_graph( + let documents_ids = if graph.nodes.len() > 2 { + universe = resolve_maximally_reduced_query_graph( ctx, &universe, &graph, @@ -259,7 +259,7 @@ pub fn execute_search<'ctx>( )?; let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?; - bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger) + bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?; bucket_sort( @@ -270,7 +270,22 @@ pub fn execute_search<'ctx>( from, length, placeholder_search_logger, - ) + )? + }; + + Ok(SearchResult { + // TODO: correct matching words + matching_words: MatchingWords::default(), + // TODO: candidates with distinct + candidates: universe, + documents_ids, + }) +} + +impl<'a> Search<'a> { + // TODO + pub fn execute_new(&self) -> Result { + todo!() } } @@ -329,7 +344,7 @@ mod tests { println!("{}us", elapsed.as_micros()); let _documents = index - .documents(&txn, results.iter().copied()) + .documents(&txn, results.documents_ids.iter().copied()) .unwrap() .into_iter() .map(|(id, obkv)| { diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index bc0e88326..a36c6943f 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -4,8 +4,7 @@ use std::collections::btree_map::Entry; use std::collections::{BTreeMap, VecDeque}; use std::ops::ControlFlow; -use super::dead_end_path_cache::DeadEndPathCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNode; use crate::search::new::small_bitmap::SmallBitmap; @@ -23,11 +22,11 @@ impl RankingRuleGraph { from: Interned, cost: u16, all_distances: &MappedInterner)>, QueryNode>, - dead_end_path_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndsCache, mut visit: impl FnMut( &[Interned], &mut Self, - &mut DeadEndPathCache, + &mut DeadEndsCache, ) -> Result>, ) -> Result<()> { let _ = self.visit_paths_of_cost_rec( @@ -38,7 +37,7 @@ impl RankingRuleGraph { &mut visit, &mut vec![], &mut SmallBitmap::for_interned_values_in(&self.conditions_interner), - &mut dead_end_path_cache.conditions.clone(), + &mut dead_end_path_cache.forbidden.clone(), )?; Ok(()) } @@ -47,11 +46,11 @@ impl RankingRuleGraph { from: Interned, cost: u16, all_distances: &MappedInterner)>, QueryNode>, - dead_end_path_cache: &mut DeadEndPathCache, + dead_end_path_cache: &mut DeadEndsCache, visit: &mut impl FnMut( &[Interned], &mut Self, - &mut DeadEndPathCache, + &mut DeadEndsCache, ) -> Result>, prev_conditions: &mut Vec>, cur_path: &mut SmallBitmap, @@ -74,7 +73,6 @@ impl RankingRuleGraph { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), } - true } else { self.visit_paths_of_cost_rec( edge.dest_node, @@ -85,7 +83,7 @@ impl RankingRuleGraph { prev_conditions, cur_path, forbidden_conditions, - )? + )?; } } Some(condition) => { @@ -101,24 +99,20 @@ impl RankingRuleGraph { } cur_path.insert(condition); prev_conditions.push(condition); - let mut new_forbidden_conditions = forbidden_conditions.clone(); - new_forbidden_conditions - .union(dead_end_path_cache.condition_couples.get(condition)); - dead_end_path_cache.prefixes.final_edges_after_prefix( - prev_conditions, - &mut |x| { - new_forbidden_conditions.insert(x); - }, - ); - let next_any_valid = if edge.dest_node == self.query_graph.end_node { + if let Some(next_forbidden) = + dead_end_path_cache.forbidden_conditions_after_prefix(&prev_conditions) + { + new_forbidden_conditions.union(&next_forbidden); + } + + if edge.dest_node == self.query_graph.end_node { any_valid = true; let control_flow = visit(prev_conditions, self, dead_end_path_cache)?; match control_flow { ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(true), } - true } else { self.visit_paths_of_cost_rec( edge.dest_node, @@ -129,28 +123,12 @@ impl RankingRuleGraph { prev_conditions, cur_path, &mut new_forbidden_conditions, - )? - }; + )?; + } cur_path.remove(condition); prev_conditions.pop(); - next_any_valid } }; - any_valid |= next_any_valid; - - if next_any_valid { - if dead_end_path_cache.path_is_dead_end(prev_conditions, cur_path) { - return Ok(any_valid); - } - forbidden_conditions.union(&dead_end_path_cache.conditions); - for prev_condition in prev_conditions.iter() { - forbidden_conditions - .union(dead_end_path_cache.condition_couples.get(*prev_condition)); - } - dead_end_path_cache.prefixes.final_edges_after_prefix(prev_conditions, &mut |x| { - forbidden_conditions.insert(x); - }); - } } Ok(any_valid) diff --git a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs index 9766cfaa3..367f36e6a 100644 --- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs @@ -12,11 +12,16 @@ use crate::Result; pub struct ConditionDocIdsCache { // TODO: should be FxHashMap, RoaringBitmap> pub cache: FxHashMap, RoaringBitmap>, + pub universe_length: u64, _phantom: PhantomData, } -impl Default for ConditionDocIdsCache { - fn default() -> Self { - Self { cache: Default::default(), _phantom: Default::default() } +impl ConditionDocIdsCache { + pub fn new(universe: &RoaringBitmap) -> Self { + Self { + cache: Default::default(), + _phantom: Default::default(), + universe_length: universe.len(), + } } } impl ConditionDocIdsCache { @@ -33,6 +38,9 @@ impl ConditionDocIdsCache { universe: &RoaringBitmap, ) -> Result<&'s RoaringBitmap> { if self.cache.contains_key(&interned_condition) { + // TODO compare length of universe compared to the one in self + // if it is smaller, then update the value + // TODO: should we update the bitmap in the cache if the new universe // reduces it? // TODO: maybe have a generation: u32 to track every time the universe was diff --git a/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs index b4af625d6..701421ea7 100644 --- a/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/dead_end_path_cache.rs @@ -1,84 +1,83 @@ -use super::{path_set::PathSet, RankingRuleGraphTrait}; -use crate::search::new::{ - interner::{FixedSizeInterner, Interned, MappedInterner}, - small_bitmap::SmallBitmap, -}; +// use super::{path_set::PathSet, RankingRuleGraphTrait}; +// use crate::search::new::{ +// interner::{FixedSizeInterner, Interned, MappedInterner}, +// small_bitmap::SmallBitmap, +// }; -/// A cache which stores sufficient conditions for a path -/// to resolve to an empty set of candidates within the current -/// universe. -pub struct DeadEndPathCache { - /// The set of edge conditions that resolve to no documents. - pub conditions: SmallBitmap, - /// A set of path prefixes that resolve to no documents. - pub prefixes: PathSet, - /// A set of empty couples of edge conditions that resolve to no documents. - pub condition_couples: MappedInterner, G::Condition>, -} -impl Clone for DeadEndPathCache { - fn clone(&self) -> Self { - Self { - conditions: self.conditions.clone(), - prefixes: self.prefixes.clone(), - condition_couples: self.condition_couples.clone(), - } - } -} +// /// A cache which stores sufficient conditions for a path +// /// to resolve to an empty set of candidates within the current +// /// universe. +// pub struct DeadEndPathCache { +// /// The set of edge conditions that resolve to no documents. +// pub conditions: SmallBitmap, +// /// A set of path prefixes that resolve to no documents. +// pub prefixes: PathSet, +// /// A set of empty couples of edge conditions that resolve to no documents. +// pub condition_couples: MappedInterner, G::Condition>, +// } +// impl Clone for DeadEndPathCache { +// fn clone(&self) -> Self { +// Self { +// conditions: self.conditions.clone(), +// prefixes: self.prefixes.clone(), +// condition_couples: self.condition_couples.clone(), +// } +// } +// } -impl DeadEndPathCache { - /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. - pub fn new(all_conditions: &FixedSizeInterner) -> Self { - Self { - conditions: SmallBitmap::for_interned_values_in(all_conditions), - prefixes: PathSet::default(), - condition_couples: all_conditions - .map(|_| SmallBitmap::for_interned_values_in(all_conditions)), - } - } +// impl DeadEndPathCache { +// /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. +// pub fn new(all_conditions: &FixedSizeInterner) -> Self { +// Self { +// conditions: SmallBitmap::for_interned_values_in(all_conditions), +// prefixes: PathSet::default(), +// condition_couples: all_conditions +// .map(|_| SmallBitmap::for_interned_values_in(all_conditions)), +// } +// } - /// Store in the cache that every path containing the given edge resolves to no documents. - pub fn add_condition(&mut self, condition: Interned) { - self.conditions.insert(condition); - self.condition_couples.get_mut(condition).clear(); - self.prefixes.remove_edge(condition); - for (_, edges2) in self.condition_couples.iter_mut() { - edges2.remove(condition); - } - } - /// Store in the cache that every path containing the given prefix resolves to no documents. - pub fn add_prefix(&mut self, prefix: &[Interned]) { - // TODO: typed PathSet - self.prefixes.insert(prefix.iter().copied()); - } +// /// Store in the cache that every path containing the given edge resolves to no documents. +// pub fn add_condition(&mut self, condition: Interned) { +// self.conditions.insert(condition); +// self.condition_couples.get_mut(condition).clear(); +// self.prefixes.remove_edge(condition); +// for (_, edges2) in self.condition_couples.iter_mut() { +// edges2.remove(condition); +// } +// } +// /// Store in the cache that every path containing the given prefix resolves to no documents. +// pub fn add_prefix(&mut self, prefix: &[Interned]) { +// // TODO: typed PathSet +// self.prefixes.insert(prefix.iter().copied()); +// } - /// Store in the cache that every path containing the two given edges resolves to no documents. - pub fn add_condition_couple( - &mut self, - edge1: Interned, - edge2: Interned, - ) { - self.condition_couples.get_mut(edge1).insert(edge2); - } +// /// Store in the cache that every path containing the two given edges resolves to no documents. +// pub fn add_condition_couple( +// &mut self, +// edge1: Interned, +// edge2: Interned, +// ) { +// self.condition_couples.get_mut(edge1).insert(edge2); +// } - /// Returns true if the cache can determine that the given path resolves to no documents. - pub fn path_is_dead_end( - &self, - path: &[Interned], - path_bitmap: &SmallBitmap, - ) -> bool { - if path_bitmap.intersects(&self.conditions) { - return true; - } - for condition in path.iter() { - // TODO: typed path - let forbidden_other_edges = self.condition_couples.get(*condition); - if path_bitmap.intersects(forbidden_other_edges) { - return true; - } - } - if self.prefixes.contains_prefix_of_path(path) { - return true; - } - false - } -} +// /// Returns true if the cache can determine that the given path resolves to no documents. +// pub fn path_is_dead_end( +// &self, +// path: &[Interned], +// path_bitmap: &SmallBitmap, +// ) -> bool { +// if path_bitmap.intersects(&self.conditions) { +// return true; +// } +// for condition in path.iter() { +// let forbidden_other_edges = self.condition_couples.get(*condition); +// if path_bitmap.intersects(forbidden_other_edges) { +// return true; +// } +// } +// if self.prefixes.contains_prefix_of_path(path) { +// return true; +// } +// false +// } +// } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 02a68b811..977d6c96b 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -20,7 +20,8 @@ use std::collections::HashSet; use std::hash::Hash; pub use condition_docids_cache::ConditionDocIdsCache; -pub use dead_end_path_cache::DeadEndPathCache; +// pub use dead_end_path_cache::DeadEndPathCache; +pub use path_set::DeadEndsCache; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; @@ -113,7 +114,7 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, diff --git a/milli/src/search/new/ranking_rule_graph/path_set.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs index 1a87c9604..0aa4472dc 100644 --- a/milli/src/search/new/ranking_rule_graph/path_set.rs +++ b/milli/src/search/new/ranking_rule_graph/path_set.rs @@ -2,104 +2,165 @@ // For the empty_prefixes field in the EmptyPathsCache only :/ // but it could be used for more, like efficient computing of a set of paths -use crate::search::new::interner::Interned; +use crate::search::new::{ + interner::{FixedSizeInterner, Interned}, + small_bitmap::SmallBitmap, +}; -/// A set of `Vec>` implemented as a prefix tree. -pub struct PathSet { +pub struct DeadEndsCache { nodes: Vec<(Interned, Self)>, - is_end: bool, + pub forbidden: SmallBitmap, } - -impl Clone for PathSet { - fn clone(&self) -> Self { - Self { nodes: self.nodes.clone(), is_end: self.is_end } +impl DeadEndsCache { + pub fn new(for_interner: &FixedSizeInterner) -> Self { + Self { nodes: vec![], forbidden: SmallBitmap::for_interned_values_in(for_interner) } } -} - -impl std::fmt::Debug for PathSet { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PathSet").field("nodes", &self.nodes).field("is_end", &self.is_end).finish() + pub fn forbid_condition(&mut self, condition: Interned) { + self.forbidden.insert(condition); } -} - -impl Default for PathSet { - fn default() -> Self { - Self { nodes: Default::default(), is_end: Default::default() } - } -} - -impl PathSet { - pub fn insert(&mut self, mut edges: impl Iterator>) { - match edges.next() { - None => { - self.is_end = true; - } - Some(first_edge) => { - for (edge, next_node) in &mut self.nodes { - if edge == &first_edge { - return next_node.insert(edges); - } - } - let mut rest = PathSet::default(); - rest.insert(edges); - self.nodes.push((first_edge, rest)); + fn advance(&mut self, condition: Interned) -> Option<&mut Self> { + for (e, next_node) in &mut self.nodes { + if condition == *e { + return Some(next_node); } } + None } - - pub fn remove_edge(&mut self, forbidden_edge: Interned) { - let mut i = 0; - while i < self.nodes.len() { - let should_remove = if self.nodes[i].0 == forbidden_edge { - true - } else if !self.nodes[i].1.nodes.is_empty() { - self.nodes[i].1.remove_edge(forbidden_edge); - self.nodes[i].1.nodes.is_empty() + pub fn forbidden_conditions_after_prefix( + &mut self, + mut prefix: &[Interned], + ) -> Option> { + let mut cursor = self; + for c in prefix.iter() { + if let Some(next) = cursor.advance(*c) { + cursor = next; } else { - false - }; - if should_remove { - self.nodes.remove(i); - } else { - i += 1; + return None; } } + Some(cursor.forbidden.clone()) } - - pub fn final_edges_after_prefix( - &self, - prefix: &[Interned], - visit: &mut impl FnMut(Interned), + pub fn forbid_condition_after_prefix( + &mut self, + mut prefix: impl Iterator>, + forbidden: Interned, ) { - let [first_edge, remaining_prefix @ ..] = prefix else { - for node in self.nodes.iter() { - if node.1.is_end { - visit(node.0) - } + match prefix.next() { + None => { + self.forbidden.insert(forbidden); } - return - }; - for (edge, rest) in self.nodes.iter() { - if edge == first_edge { - return rest.final_edges_after_prefix(remaining_prefix, visit); - } - } - } - - pub fn contains_prefix_of_path(&self, path: &[Interned]) -> bool { - if self.is_end { - return true; - } - match path { - [] => false, - [first_edge, remaining_path @ ..] => { - for (edge, rest) in self.nodes.iter() { - if edge == first_edge { - return rest.contains_prefix_of_path(remaining_path); + Some(first_condition) => { + for (condition, next_node) in &mut self.nodes { + if condition == &first_condition { + return next_node.forbid_condition_after_prefix(prefix, forbidden); } } - false + let mut rest = DeadEndsCache { + nodes: vec![], + forbidden: SmallBitmap::new(self.forbidden.universe_length()), + }; + rest.forbid_condition_after_prefix(prefix, forbidden); + self.nodes.push((first_condition, rest)); } } } } +// /// A set of `Vec>` implemented as a prefix tree. +// pub struct PathSet { +// nodes: Vec<(Interned, Self)>, +// is_end: bool, +// } + +// impl Clone for PathSet { +// fn clone(&self) -> Self { +// Self { nodes: self.nodes.clone(), is_end: self.is_end } +// } +// } + +// impl std::fmt::Debug for PathSet { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// f.debug_struct("PathSet").field("nodes", &self.nodes).field("is_end", &self.is_end).finish() +// } +// } + +// impl Default for PathSet { +// fn default() -> Self { +// Self { nodes: Default::default(), is_end: Default::default() } +// } +// } + +// impl PathSet { +// pub fn insert(&mut self, mut conditions: impl Iterator>) { +// match conditions.next() { +// None => { +// self.is_end = true; +// } +// Some(first_condition) => { +// for (condition, next_node) in &mut self.nodes { +// if condition == &first_condition { +// return next_node.insert(conditions); +// } +// } +// let mut rest = PathSet::default(); +// rest.insert(conditions); +// self.nodes.push((first_condition, rest)); +// } +// } +// } + +// pub fn remove_condition(&mut self, forbidden_condition: Interned) { +// let mut i = 0; +// while i < self.nodes.len() { +// let should_remove = if self.nodes[i].0 == forbidden_condition { +// true +// } else if !self.nodes[i].1.nodes.is_empty() { +// self.nodes[i].1.remove_condition(forbidden_condition); +// self.nodes[i].1.nodes.is_empty() +// } else { +// false +// }; +// if should_remove { +// self.nodes.remove(i); +// } else { +// i += 1; +// } +// } +// } + +// pub fn final_conditions_after_prefix( +// &self, +// prefix: &[Interned], +// visit: &mut impl FnMut(Interned), +// ) { +// let [first_condition, remaining_prefix @ ..] = prefix else { +// for node in self.nodes.iter() { +// if node.1.is_end { +// visit(node.0) +// } +// } +// return +// }; +// for (condition, rest) in self.nodes.iter() { +// if condition == first_condition { +// return rest.final_conditions_after_prefix(remaining_prefix, visit); +// } +// } +// } + +// pub fn contains_prefix_of_path(&self, path: &[Interned]) -> bool { +// if self.is_end { +// return true; +// } +// match path { +// [] => false, +// [first_condition, remaining_path @ ..] => { +// for (condition, rest) in self.nodes.iter() { +// if condition == first_condition { +// return rest.contains_prefix_of_path(remaining_path); +// } +// } +// false +// } +// } +// } +// } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 9a6080301..690200773 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -6,8 +6,7 @@ use std::iter::FromIterator; use roaring::RoaringBitmap; -use super::dead_end_path_cache::DeadEndPathCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{RankingRuleGraph, RankingRuleGraphTrait, DeadEndsCache}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::{Phrase, QueryTerm}; @@ -67,7 +66,7 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 0b81ec0ec..d6553a49d 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,7 +1,6 @@ use roaring::RoaringBitmap; -use super::dead_end_path_cache::DeadEndPathCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{RankingRuleGraph, RankingRuleGraphTrait, DeadEndsCache}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_graph::QueryNodeData; @@ -137,7 +136,7 @@ impl RankingRuleGraphTrait for TypoGraph { fn log_state( graph: &RankingRuleGraph, paths: &[Vec>], - dead_end_path_cache: &DeadEndPathCache, + dead_end_path_cache: &DeadEndsCache, universe: &RoaringBitmap, distances: &MappedInterner)>, QueryNode>, cost: u16, diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index 975c5343f..faf9c077c 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -28,6 +28,12 @@ impl SmallBitmap { } } } + pub fn universe_length(&self) -> u16 { + match &self.internal { + SmallBitmapInternal::Tiny(_) => 64, + SmallBitmapInternal::Small(xs) => 64 * xs.len() as u16, + } + } pub fn from_iter( xs: impl Iterator>, for_interner: &FixedSizeInterner,