diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 66dd33036..594405891 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -3,7 +3,53 @@ #[cfg(test)] #[global_allocator] -static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; +pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +// #[cfg(test)] +// pub mod allocator { +// use std::alloc::{GlobalAlloc, System}; +// use std::sync::atomic::{self, AtomicI64}; + +// #[global_allocator] +// pub static ALLOC: CountingAlloc = CountingAlloc { +// max_resident: AtomicI64::new(0), +// resident: AtomicI64::new(0), +// allocated: AtomicI64::new(0), +// }; + +// pub struct CountingAlloc { +// pub max_resident: AtomicI64, +// pub resident: AtomicI64, +// pub allocated: AtomicI64, +// } +// unsafe impl GlobalAlloc for CountingAlloc { +// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { +// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); +// let old_resident = +// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); + +// let resident = old_resident + layout.size() as i64; +// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst); + +// // if layout.size() > 1_000_000 { +// // eprintln!( +// // "allocating {} with new resident size: {resident}", +// // layout.size() / 1_000_000 +// // ); +// // // let trace = std::backtrace::Backtrace::capture(); +// // // let t = trace.to_string(); +// // // eprintln!("{t}"); +// // } + +// System.alloc(layout) +// } + +// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { +// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); +// System.dealloc(ptr, layout) +// } +// } +// } #[macro_use] pub mod documents; diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 264686b0a..e5a0fbad6 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -3,12 +3,11 @@ use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; use super::logger::SearchLogger; -use super::ranking_rule_graph::cheapest_paths::KCheapestPathsState; use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache; use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; -use super::ranking_rule_graph::paths_map::PathsMap; + use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; -use super::{QueryGraph, RankingRule, RankingRuleOutput}; +use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; use crate::{Index, Result}; @@ -24,9 +23,40 @@ impl GraphBasedRankingRule { pub struct GraphBasedRankingRuleState { graph: RankingRuleGraph, - cheapest_paths_state: Option, edge_docids_cache: EdgeDocidsCache, empty_paths_cache: EmptyPathsCache, + all_distances: Vec>, + cur_distance_idx: usize, +} + +fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>( + graph: &mut RankingRuleGraph, + edge_docids_cache: &mut EdgeDocidsCache, + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + empty_paths_cache: &mut EmptyPathsCache, +) -> Result<()> { + for edge_index in 0..graph.all_edges.len() as u32 { + if graph.all_edges[edge_index as usize].is_none() { + continue; + } + let docids = edge_docids_cache + .get_edge_docids(index, txn, db_cache, edge_index, &*graph, universe)?; + match docids { + BitmapOrAllRef::Bitmap(bitmap) => { + if bitmap.is_disjoint(universe) { + graph.remove_edge(edge_index); + empty_paths_cache.forbid_edge(edge_index); + edge_docids_cache.cache.remove(&edge_index); + continue; + } + } + BitmapOrAllRef::All => continue, + } + } + Ok(()) } impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph> @@ -41,18 +71,31 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, _logger: &mut dyn SearchLogger, - _universe: &RoaringBitmap, + universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { // TODO: update old state instead of starting from scratch - let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; + let mut graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; + let mut edge_docids_cache = EdgeDocidsCache::default(); + let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len()); + + remove_empty_edges( + &mut graph, + &mut edge_docids_cache, + index, + txn, + db_cache, + universe, + &mut empty_paths_cache, + )?; + let all_distances = graph.initialize_distances_cheapest(); - let cheapest_paths_state = KCheapestPathsState::new(&graph); let state = GraphBasedRankingRuleState { graph, - cheapest_paths_state, - edge_docids_cache: <_>::default(), - empty_paths_cache: <_>::default(), + edge_docids_cache, + empty_paths_cache, + all_distances, + cur_distance_idx: 0, }; self.state = Some(state); @@ -70,34 +113,42 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap ) -> Result>> { assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); - if state.cheapest_paths_state.is_none() { + remove_empty_edges( + &mut state.graph, + &mut state.edge_docids_cache, + index, + txn, + db_cache, + universe, + &mut state.empty_paths_cache, + )?; + + if state.cur_distance_idx + >= state.all_distances[state.graph.query_graph.root_node as usize].len() + { + self.state = None; return Ok(None); } + let cost = + state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; + state.cur_distance_idx += 1; - let mut paths = PathsMap::default(); + let paths = state.graph.paths_of_cost( + state.graph.query_graph.root_node as usize, + cost, + &state.all_distances, + &state.empty_paths_cache, + ); - while paths.is_empty() { - let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else { - break; - }; - if let Some(next_cheapest_paths_state) = cheapest_paths_state - .compute_paths_of_next_lowest_cost( - &mut state.graph, - &state.empty_paths_cache, - &mut paths, - ) - { - state.cheapest_paths_state = Some(next_cheapest_paths_state); - } else { - break; - } - } - - if paths.is_empty() && state.cheapest_paths_state.is_none() { - return Ok(None); - } - - G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger); + G::log_state( + &state.graph, + &paths, + &state.empty_paths_cache, + universe, + &state.all_distances, + cost, + logger, + ); let bucket = state.graph.resolve_paths( index, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index a7a3f8793..d2ce627dc 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -1,6 +1,8 @@ + use rand::random; use roaring::RoaringBitmap; use std::fs::File; +use std::time::Instant; use std::{io::Write, path::PathBuf}; use crate::new::ranking_rule_graph::typo::TypoGraph; @@ -9,7 +11,7 @@ use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::ranking_rule_graph::{ - paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph, + proximity::ProximityGraph, RankingRuleGraph, }; use super::{RankingRule, SearchLogger}; @@ -19,14 +21,18 @@ pub enum SearchEvents { ranking_rule_idx: usize, query: QueryGraph, universe: RoaringBitmap, + time: Instant, }, RankingRuleNextBucket { ranking_rule_idx: usize, universe: RoaringBitmap, + candidates: RoaringBitmap, + time: Instant, }, RankingRuleEndIteration { ranking_rule_idx: usize, universe: RoaringBitmap, + time: Instant, }, ExtendResults { new: Vec, @@ -36,20 +42,27 @@ pub enum SearchEvents { }, ProximityState { graph: RankingRuleGraph, - paths: PathsMap, + paths: Vec>, empty_paths_cache: EmptyPathsCache, + universe: RoaringBitmap, + distances: Vec>, + cost: u64, }, TypoState { graph: RankingRuleGraph, - paths: PathsMap, + paths: Vec>, empty_paths_cache: EmptyPathsCache, + universe: RoaringBitmap, + distances: Vec>, + cost: u64, }, - RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap }, + RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant, }, } pub struct DetailedSearchLogger { folder_path: PathBuf, initial_query: Option, + initial_query_time: Option, initial_universe: Option, ranking_rules_ids: Option>, events: Vec, @@ -58,17 +71,19 @@ impl DetailedSearchLogger { pub fn new(folder_path: &str) -> Self { Self { folder_path: PathBuf::new().join(folder_path), - initial_query: <_>::default(), - initial_universe: <_>::default(), - ranking_rules_ids: <_>::default(), - events: <_>::default(), + initial_query: None, + initial_query_time: None, + initial_universe: None, + ranking_rules_ids: None, + events: vec![], } } } impl SearchLogger for DetailedSearchLogger { - fn initial_query(&mut self, query: &QueryGraph) { + fn initial_query(&mut self, query: &QueryGraph, time: Instant) { self.initial_query = Some(query.clone()); + self.initial_query_time = Some(time); } fn initial_universe(&mut self, universe: &RoaringBitmap) { @@ -84,11 +99,13 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, query: &QueryGraph, universe: &RoaringBitmap, + time: Instant, ) { self.events.push(SearchEvents::RankingRuleStartIteration { ranking_rule_idx, query: query.clone(), universe: universe.clone(), + time, }) } @@ -97,10 +114,14 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, + candidates: &RoaringBitmap, + time: Instant, ) { self.events.push(SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe: universe.clone(), + candidates: candidates.clone(), + time, }) } fn skip_bucket_ranking_rule<'transaction>( @@ -108,10 +129,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, candidates: &RoaringBitmap, + time: Instant, ) { self.events.push(SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates: candidates.clone(), + time }) } @@ -120,10 +143,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, + time: Instant, ) { self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe: universe.clone(), + time }) } fn add_to_results(&mut self, docids: &[u32]) { @@ -134,18 +159,19 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, empty_paths_cache: &EmptyPathsCache) { - self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u64,) { + self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } - fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, empty_paths_cache: &EmptyPathsCache) { - self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) + fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u64,) { + self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } } impl DetailedSearchLogger { pub fn write_d2_description(&self) { + let mut prev_time = self.initial_query_time.unwrap(); let mut timestamp = vec![]; fn activated_id(timestamp: &[usize]) -> String { let mut s = String::new(); @@ -164,13 +190,16 @@ impl DetailedSearchLogger { writeln!(&mut file, "{idx}: {rr_id}").unwrap(); } writeln!(&mut file, "results").unwrap(); + // writeln!(&mut file, "time").unwrap(); for event in self.events.iter() { match event { - SearchEvents::RankingRuleStartIteration { ranking_rule_idx, .. } => { - + SearchEvents::RankingRuleStartIteration { ranking_rule_idx, time, .. } => { + let elapsed = time.duration_since(prev_time); + prev_time = *time; let parent_activated_id = activated_id(×tamp); timestamp.push(0); let self_activated_id = activated_id(×tamp); + // writeln!(&mut file, "time.{self_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); if *ranking_rule_idx != 0 { let parent_ranking_rule_idx = ranking_rule_idx - 1; writeln!( @@ -186,16 +215,22 @@ impl DetailedSearchLogger { }} }}").unwrap(); } - SearchEvents::RankingRuleNextBucket { ranking_rule_idx, .. } => { + SearchEvents::RankingRuleNextBucket { ranking_rule_idx, time, universe, candidates } => { + let elapsed = time.duration_since(prev_time); + prev_time = *time; let old_activated_id = activated_id(×tamp); + // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); *timestamp.last_mut().unwrap() += 1; let next_activated_id = activated_id(×tamp); writeln!(&mut file, - "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket",) + "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket {}/{}", candidates.len(), universe.len()) .unwrap(); } - SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates } => { + SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates, time } => { + let elapsed = time.duration_since(prev_time); + prev_time = *time; let old_activated_id = activated_id(×tamp); + // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); *timestamp.last_mut().unwrap() += 1; let next_activated_id = activated_id(×tamp); let len = candidates.len(); @@ -203,8 +238,12 @@ impl DetailedSearchLogger { "{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : skip bucket ({len})",) .unwrap(); } - SearchEvents::RankingRuleEndIteration { ranking_rule_idx, .. } => { + SearchEvents::RankingRuleEndIteration { ranking_rule_idx, time, .. } => { + let elapsed = time.duration_since(prev_time); + prev_time = *time; let cur_activated_id = activated_id(×tamp); + // writeln!(&mut file, "time.{cur_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); + timestamp.pop(); let parent_activated_id = activated_id(×tamp); let parent_ranking_rule = if *ranking_rule_idx == 0 { @@ -254,43 +293,48 @@ results.{random} {{ link: \"{id}.d2.svg\" }}").unwrap(); }, - SearchEvents::ProximityState { graph, paths, empty_paths_cache } => { + SearchEvents::ProximityState { graph, paths, empty_paths_cache, universe, distances, cost } => { let cur_ranking_rule = timestamp.len() - 1; let cur_activated_id = activated_id(×tamp); let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); + Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file); writeln!( &mut file, "{id} {{ link: \"{id}.d2.svg\" -}}").unwrap(); + tooltip: \"cost {cost}, universe len: {}\" +}}", universe.len()).unwrap(); }, - SearchEvents::TypoState { graph, paths, empty_paths_cache } => { + SearchEvents::TypoState { graph, paths, empty_paths_cache, universe, distances, cost } => { let cur_ranking_rule = timestamp.len() - 1; let cur_activated_id = activated_id(×tamp); let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); + Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file); writeln!( &mut file, "{id} {{ link: \"{id}.d2.svg\" -}}").unwrap(); + tooltip: \"cost {cost}, universe len: {}\" +}}", universe.len()).unwrap(); }, } } writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(node_idx: usize, node: &QueryNode, file: &mut File) { + fn query_node_d2_desc(node_idx: usize, node: &QueryNode, distances: &[u64], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. }) => { match value { - QueryTerm::Phrase(_) => todo!(), - QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db } } => { + QueryTerm::Phrase { phrase } => { + let phrase_str = phrase.description(); + writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap(); + }, + QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => { writeln!(file,"{node_idx} : \"{original}\" {{ shape: class").unwrap(); for w in zero_typo { @@ -302,9 +346,19 @@ shape: class").unwrap(); for w in two_typos { writeln!(file, "\"{w}\" : 2").unwrap(); } + if let Some((left, right)) = split_words { + writeln!(file, "\"{left} {right}\" : split_words").unwrap(); + } + for synonym in synonyms { + writeln!(file, "\"{}\" : synonym", synonym.description()).unwrap(); + } if *use_prefix_db { writeln!(file, "use prefix DB : true").unwrap(); } + // for (i, d) in distances.iter().enumerate() { + // writeln!(file, "\"distances\" : {d}").unwrap(); + // } + writeln!(file, "}}").unwrap(); }, } @@ -324,14 +378,14 @@ shape: class").unwrap(); if matches!(query_graph.nodes[node], QueryNode::Deleted) { continue; } - Self::query_node_d2_desc(node, &query_graph.nodes[node], file); + Self::query_node_d2_desc(node, &query_graph.nodes[node], &[], file); for edge in query_graph.edges[node].successors.iter() { writeln!(file, "{node} -> {edge};\n").unwrap(); } } } - fn ranking_rule_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, file: &mut File) { + fn ranking_rule_graph_d2_description(graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -339,7 +393,8 @@ shape: class").unwrap(); if matches!(node, QueryNode::Deleted) { continue; } - Self::query_node_d2_desc(node_idx, node, file); + let distances = &distances[node_idx]; + Self::query_node_d2_desc(node_idx, node, distances.as_slice(), file); } for edge in graph.all_edges.iter().flatten() { let Edge { from_node, to_node, details, .. } = edge; @@ -362,26 +417,39 @@ shape: class").unwrap(); } writeln!(file, "}}").unwrap(); + // writeln!(file, "Distances {{").unwrap(); + // Self::paths_d2_description(graph, paths, file); + // writeln!(file, "}}").unwrap(); + + writeln!(file, "Shortest Paths {{").unwrap(); - Self::paths_d2_description(graph, "", paths, file); + Self::paths_d2_description(graph, paths, file); writeln!(file, "}}").unwrap(); - writeln!(file, "Empty Path Prefixes {{").unwrap(); - Self::paths_d2_description(graph, "", &empty_paths_cache.empty_prefixes, file); - writeln!(file, "}}").unwrap(); + // writeln!(file, "Empty Edge Couples {{").unwrap(); + // for (i, (e1, e2)) in empty_paths_cache.empty_couple_edges.iter().enumerate() { + // writeln!(file, "{i} : \"\" {{").unwrap(); + // Self::edge_d2_description(graph, *e1, file); + // Self::edge_d2_description(graph, *e2, file); + // writeln!(file, "{e1} -- {e2}").unwrap(); + // writeln!(file, "}}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); - writeln!(file, "Removed Edges {{").unwrap(); - for edge_idx in empty_paths_cache.empty_edges.iter() { - writeln!(file, "{edge_idx}").unwrap(); - } - writeln!(file, "}}").unwrap(); + // writeln!(file, "Removed Edges {{").unwrap(); + // for edge_idx in empty_paths_cache.empty_edges.iter() { + // writeln!(file, "{edge_idx}").unwrap(); + // } + // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(graph: &RankingRuleGraph, paths_idx: &str, edge_idx: u32, file: &mut File) -> String { + fn edge_d2_description(graph: &RankingRuleGraph, edge_idx: u32, file: &mut File) { let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase(_) => todo!(), + QueryTerm::Phrase { phrase } => { + phrase.description() + }, QueryTerm::Word { derivations } => derivations.original.clone(), }, QueryNode::Deleted => panic!(), @@ -391,27 +459,28 @@ shape: class").unwrap(); let to_node = &graph.query_graph.nodes[*to_node as usize]; let to_node_desc = match to_node { QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase(_) => todo!(), + QueryTerm::Phrase { phrase } => phrase.description(), QueryTerm::Word { derivations } => derivations.original.clone(), }, QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), QueryNode::End => "END".to_owned(), }; - let edge_id = format!("{paths_idx}{edge_idx}"); - writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ + writeln!(file, "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{ shape: class }}").unwrap(); - edge_id } - fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { - for (edge_idx, rest) in paths.nodes.iter() { - let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file); - for (dest_edge_idx, _) in rest.nodes.iter() { - let dest_edge_id = format!("{paths_idx}{edge_idx}{dest_edge_idx}"); - writeln!(file, "{edge_id} -> {dest_edge_id}").unwrap(); + fn paths_d2_description(graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { + for (path_idx, edge_indexes) in paths.iter().enumerate() { + writeln!(file, "{path_idx} {{").unwrap(); + for edge_idx in edge_indexes.iter() { + Self::edge_d2_description(graph, *edge_idx, file); } - Self::paths_d2_description(graph, &format!("{paths_idx}{edge_idx}"), rest, file); + for couple_edges in edge_indexes.windows(2) { + let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; + writeln!(file, "{src_edge_idx} -> {dest_edge_idx}").unwrap(); + } + writeln!(file, "}}").unwrap(); } } } diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 4e119ae42..079bb892c 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -2,28 +2,31 @@ pub mod detailed; use roaring::RoaringBitmap; +use std::time::Instant; use super::{ ranking_rule_graph::{ - empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph, - typo::TypoGraph, RankingRuleGraph, + empty_paths_cache::EmptyPathsCache, proximity::ProximityGraph, typo::TypoGraph, + RankingRuleGraph, }, RankingRule, RankingRuleQueryTrait, }; pub struct DefaultSearchLogger; impl SearchLogger for DefaultSearchLogger { - fn initial_query(&mut self, _query: &Q) {} + fn initial_query(&mut self, _query: &Q, _time: Instant) {} fn initial_universe(&mut self, _universe: &RoaringBitmap) {} fn ranking_rules(&mut self, _rr: &[&mut dyn RankingRule]) {} + fn start_iteration_ranking_rule<'transaction>( &mut self, _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _query: &Q, _universe: &RoaringBitmap, + _time: Instant, ) { } @@ -32,6 +35,8 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, + _candidates: &RoaringBitmap, + _time: Instant, ) { } fn skip_bucket_ranking_rule<'transaction>( @@ -39,6 +44,7 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _candidates: &RoaringBitmap, + _time: Instant, ) { } @@ -47,6 +53,7 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, + _time: Instant, ) { } @@ -57,22 +64,28 @@ impl SearchLogger for DefaultSearchLogger { fn log_proximity_state( &mut self, _query_graph: &RankingRuleGraph, - _paths_map: &PathsMap, + _paths_map: &[Vec], _empty_paths_cache: &EmptyPathsCache, + _universe: &RoaringBitmap, + _distances: Vec>, + _cost: u64, ) { } fn log_typo_state( &mut self, - query_graph: &RankingRuleGraph, - paths: &PathsMap, - empty_paths_cache: &EmptyPathsCache, + _query_graph: &RankingRuleGraph, + _paths: &[Vec], + _empty_paths_cache: &EmptyPathsCache, + _universe: &RoaringBitmap, + _distances: Vec>, + _cost: u64, ) { } } pub trait SearchLogger { - fn initial_query(&mut self, query: &Q); + fn initial_query(&mut self, query: &Q, time: Instant); fn initial_universe(&mut self, universe: &RoaringBitmap); fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]); @@ -83,24 +96,29 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, query: &Q, universe: &RoaringBitmap, + time: Instant, ); fn next_bucket_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, + candidates: &RoaringBitmap, + time: Instant, ); fn skip_bucket_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, candidates: &RoaringBitmap, + time: Instant, ); fn end_iteration_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, + time: Instant, ); fn add_to_results(&mut self, docids: &[u32]); @@ -109,14 +127,20 @@ pub trait SearchLogger { fn log_proximity_state( &mut self, query_graph: &RankingRuleGraph, - paths: &PathsMap, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + _distances: Vec>, + cost: u64, ); fn log_typo_state( &mut self, query_graph: &RankingRuleGraph, - paths: &PathsMap, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + _distances: Vec>, + cost: u64, ); } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index e46f6ce66..2377f1c84 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -1,10 +1,8 @@ -use std::collections::{BTreeMap, HashSet}; - -use roaring::RoaringBitmap; +#![allow(clippy::too_many_arguments)] use super::empty_paths_cache::EmptyPathsCache; -use super::paths_map::PathsMap; -use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use std::collections::VecDeque; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { @@ -12,226 +10,119 @@ pub struct Path { pub cost: u64, } -struct DijkstraState { - unvisited: RoaringBitmap, // should be a small bitset? - distances: Vec, // or binary heap, or btreemap? (f64, usize) - edges: Vec, - edge_costs: Vec, - paths: Vec>, -} - -pub struct KCheapestPathsState { - cheapest_paths: PathsMap, - potential_cheapest_paths: BTreeMap>, - pub kth_cheapest_path: Path, -} - -impl KCheapestPathsState { - pub fn next_cost(&self) -> u64 { - self.kth_cheapest_path.cost - } - - pub fn new( - graph: &RankingRuleGraph, - ) -> Option { - let Some(cheapest_path) = graph.cheapest_path_to_end(graph.query_graph.root_node) else { - return None - }; - let cheapest_paths = PathsMap::from_paths(&[cheapest_path.clone()]); - let potential_cheapest_paths = BTreeMap::new(); - Some(KCheapestPathsState { - cheapest_paths, - potential_cheapest_paths, - kth_cheapest_path: cheapest_path, - }) - } - - pub fn remove_empty_paths(mut self, empty_paths_cache: &EmptyPathsCache) -> Option { - self.cheapest_paths.remove_edges(&empty_paths_cache.empty_edges); - self.cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes); - - let mut costs_to_delete = HashSet::new(); - for (cost, potential_cheapest_paths) in self.potential_cheapest_paths.iter_mut() { - potential_cheapest_paths.remove_edges(&empty_paths_cache.empty_edges); - potential_cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes); - if potential_cheapest_paths.is_empty() { - costs_to_delete.insert(*cost); - } - } - for cost in costs_to_delete { - self.potential_cheapest_paths.remove(&cost); - } - - if self.cheapest_paths.is_empty() {} - - todo!() - } - - pub fn compute_paths_of_next_lowest_cost( - mut self, - graph: &mut RankingRuleGraph, +impl RankingRuleGraph { + pub fn paths_of_cost( + &self, + from: usize, + cost: u64, + all_distances: &[Vec], empty_paths_cache: &EmptyPathsCache, - into_map: &mut PathsMap, - ) -> Option { - if !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) { - into_map.add_path(&self.kth_cheapest_path); + ) -> Vec> { + let mut paths = vec![]; + self.paths_of_cost_rec( + from, + all_distances, + cost, + &mut vec![], + &mut paths, + &vec![false; self.all_edges.len()], + empty_paths_cache, + ); + paths + } + pub fn paths_of_cost_rec( + &self, + from: usize, + all_distances: &[Vec], + cost: u64, + prev_edges: &mut Vec, + paths: &mut Vec>, + forbidden_edges: &[bool], + empty_paths_cache: &EmptyPathsCache, + ) { + let distances = &all_distances[from]; + if !distances.contains(&cost) { + panic!(); } - let cur_cost = self.kth_cheapest_path.cost; - while self.kth_cheapest_path.cost <= cur_cost { - if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) { - self = next_self; - if self.kth_cheapest_path.cost == cur_cost - && !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) + let tos = &self.query_graph.edges[from].successors; + let mut valid_edges = vec![]; + for to in tos { + self.visit_edges::<()>(from as u32, to, |edge_idx, edge| { + if cost >= edge.cost as u64 + && all_distances[to as usize].contains(&(cost - edge.cost as u64)) + && !forbidden_edges[edge_idx as usize] { - into_map.add_path(&self.kth_cheapest_path); - } else { - break; + valid_edges.push((edge_idx, edge.cost, to)); } - } else { - return None; - } + std::ops::ControlFlow::Continue(()) + }); } - Some(self) - } - fn compute_next_cheapest_paths( - mut self, - graph: &mut RankingRuleGraph, - empty_paths_cache: &EmptyPathsCache, - ) -> Option { - // for all nodes in the last cheapest path (called spur_node), except last one... - for (i, edge_idx) in self.kth_cheapest_path.edges[..self.kth_cheapest_path.edges.len() - 1] - .iter() - .enumerate() - { - let Some(edge) = graph.all_edges[*edge_idx as usize].as_ref() else { continue; }; - let Edge { from_node: spur_node, .. } = edge; - - let root_path = &self.kth_cheapest_path.edges[..i]; - if empty_paths_cache.path_is_empty(root_path) { + for (edge_idx, edge_cost, to) in valid_edges { + prev_edges.push(edge_idx); + if empty_paths_cache.empty_prefixes.contains_prefix_of_path(prev_edges) { continue; } - - let root_cost = root_path.iter().fold(0, |sum, next| { - sum + graph.all_edges[*next as usize].as_ref().unwrap().cost as u64 - }); - - let mut tmp_removed_edges = vec![]; - // for all the paths already found that share a common prefix with the root path - // we delete the edge from the spur node to the next one - for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) { - let was_removed = - graph.node_edges[*spur_node as usize].remove(edge_index_to_remove); - if was_removed { - tmp_removed_edges.push(edge_index_to_remove); - } + let mut new_forbidden_edges = forbidden_edges.to_vec(); + for edge_idx in empty_paths_cache.empty_couple_edges[edge_idx as usize].iter() { + new_forbidden_edges[*edge_idx as usize] = true; + } + for edge_idx in empty_paths_cache.empty_prefixes.final_edges_ater_prefix(prev_edges) { + new_forbidden_edges[edge_idx as usize] = true; } - // Compute the cheapest path from the spur node to the destination - // we will combine it with the root path to get a potential kth cheapest path - let spur_path = graph.cheapest_path_to_end(*spur_node); - // restore the temporarily removed edges - graph.node_edges[*spur_node as usize].extend(tmp_removed_edges); - - let Some(spur_path) = spur_path else { continue; }; - let total_cost = root_cost + spur_path.cost; - let total_path = Path { - edges: root_path.iter().chain(spur_path.edges.iter()).cloned().collect(), - cost: total_cost, - }; - let entry = self.potential_cheapest_paths.entry(total_cost).or_default(); - entry.add_path(&total_path); + if to == self.query_graph.end_node { + paths.push(prev_edges.clone()); + } else { + self.paths_of_cost_rec( + to as usize, + all_distances, + cost - edge_cost as u64, + prev_edges, + paths, + &new_forbidden_edges, + empty_paths_cache, + ) + } + prev_edges.pop(); } - while let Some(mut next_cheapest_paths_entry) = self.potential_cheapest_paths.first_entry() + } + + pub fn initialize_distances_cheapest(&self) -> Vec> { + let mut distances_to_end: Vec> = vec![vec![]; self.query_graph.nodes.len()]; + let mut enqueued = vec![false; self.query_graph.nodes.len()]; + + let mut node_stack = VecDeque::new(); + + distances_to_end[self.query_graph.end_node as usize] = vec![0]; + for prev_node in + self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() { - let cost = *next_cheapest_paths_entry.key(); - let next_cheapest_paths = next_cheapest_paths_entry.get_mut(); + node_stack.push_back(prev_node as usize); + enqueued[prev_node as usize] = true; + } - while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() { - assert_eq!(cost, cost2); - // NOTE: it is important not to discard the paths that are forbidden due to a - // forbidden prefix, because the cheapest path algorithm (Dijkstra) cannot take - // this property into account. - if next_cheapest_path - .iter() - .any(|edge_index| graph.all_edges[*edge_index as usize].is_none()) - { - continue; - } else { - self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost); - - if next_cheapest_paths.is_empty() { - next_cheapest_paths_entry.remove(); + while let Some(cur_node) = node_stack.pop_front() { + let mut self_distances = vec![]; + for succ_node in self.query_graph.edges[cur_node].successors.iter() { + let succ_distances = &distances_to_end[succ_node as usize]; + let _ = self.visit_edges::<()>(cur_node as u32, succ_node, |_, edge| { + for succ_distance in succ_distances { + self_distances.push(edge.cost as u64 + succ_distance); } - self.kth_cheapest_path = Path { edges: next_cheapest_path, cost }; - - return Some(self); + std::ops::ControlFlow::Continue(()) + }); + } + self_distances.sort_unstable(); + self_distances.dedup(); + distances_to_end[cur_node] = self_distances; + for prev_node in self.query_graph.edges[cur_node].predecessors.iter() { + if !enqueued[prev_node as usize] { + node_stack.push_back(prev_node as usize); + enqueued[prev_node as usize] = true; } } - let _ = next_cheapest_paths_entry.remove_entry(); } - None - } -} - -impl RankingRuleGraph { - fn cheapest_path_to_end(&self, from: u32) -> Option { - let mut dijkstra = DijkstraState { - unvisited: (0..self.query_graph.nodes.len() as u32).collect(), - distances: vec![u64::MAX; self.query_graph.nodes.len()], - edges: vec![u32::MAX; self.query_graph.nodes.len()], - edge_costs: vec![u8::MAX; self.query_graph.nodes.len()], - paths: vec![None; self.query_graph.nodes.len()], - }; - dijkstra.distances[from as usize] = 0; - - // TODO: could use a binary heap here to store the distances, or a btreemap - while let Some(cur_node) = - dijkstra.unvisited.iter().min_by_key(|&n| dijkstra.distances[n as usize]) - { - let cur_node_dist = dijkstra.distances[cur_node as usize]; - if cur_node_dist == u64::MAX { - return None; - } - if cur_node == self.query_graph.end_node { - break; - } - - let succ_cur_node = &self.successors[cur_node as usize]; - let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited; - for succ in unvisited_succ_cur_node { - let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else { - continue - }; - - let old_dist_succ = &mut dijkstra.distances[succ as usize]; - let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64; - if new_potential_distance < *old_dist_succ { - *old_dist_succ = new_potential_distance; - dijkstra.edges[succ as usize] = cheapest_edge; - dijkstra.edge_costs[succ as usize] = cheapest_edge_cost; - dijkstra.paths[succ as usize] = Some(cur_node); - } - } - dijkstra.unvisited.remove(cur_node); - } - - let mut cur = self.query_graph.end_node; - let mut path_edges = vec![]; - while let Some(n) = dijkstra.paths[cur as usize] { - path_edges.push(dijkstra.edges[cur as usize]); - cur = n; - } - path_edges.reverse(); - Some(Path { - edges: path_edges, - cost: dijkstra.distances[self.query_graph.end_node as usize], - }) - } - - pub fn cheapest_edge(&self, cur_node: u32, succ: u32) -> Option<(u32, u8)> { - self.visit_edges(cur_node, succ, |edge_idx, edge| { - std::ops::ControlFlow::Break((edge_idx, edge.cost)) - }) + distances_to_end } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index cb3e3da38..ef2eba895 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -32,16 +32,19 @@ impl EdgeDocidsCache { db_cache: &mut DatabaseCache<'transaction>, edge_index: u32, graph: &RankingRuleGraph, + // TODO: maybe universe doesn't belong here + universe: &RoaringBitmap, ) -> Result> { - if self.cache.contains_key(&edge_index) { - return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); - } let edge = graph.all_edges[edge_index as usize].as_ref().unwrap(); match &edge.details { EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All), EdgeDetails::Data(details) => { - let docids = G::compute_docids(index, txn, db_cache, details)?; + if self.cache.contains_key(&edge_index) { + return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); + } + // TODO: maybe universe doesn't belong here + let docids = universe & G::compute_docids(index, txn, db_cache, details)?; let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index db68838b5..bbfe2eedd 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,26 +1,60 @@ -use roaring::RoaringBitmap; - use super::paths_map::PathsMap; -#[derive(Default, Clone)] +#[derive(Clone)] pub struct EmptyPathsCache { - pub empty_edges: RoaringBitmap, + pub empty_edges: Vec, pub empty_prefixes: PathsMap<()>, + pub empty_couple_edges: Vec>, } impl EmptyPathsCache { + pub fn new(all_edges_len: usize) -> Self { + Self { + empty_edges: vec![false; all_edges_len], + empty_prefixes: PathsMap::default(), + empty_couple_edges: vec![vec![]; all_edges_len], + } + } pub fn forbid_edge(&mut self, edge_idx: u32) { - self.empty_edges.insert(edge_idx); + self.empty_edges[edge_idx as usize] = true; + self.empty_couple_edges[edge_idx as usize] = vec![]; self.empty_prefixes.remove_edge(&edge_idx); + for edges2 in self.empty_couple_edges.iter_mut() { + if let Some(edge2_pos) = edges2.iter().position(|e| *e == edge_idx) { + edges2.swap_remove(edge2_pos); + } + } + } + pub fn forbid_prefix(&mut self, prefix: &[u32]) { + self.empty_prefixes.insert(prefix.iter().copied(), ()); + } + pub fn forbid_couple_edges(&mut self, edge1: u32, edge2: u32) { + assert!(!self.empty_couple_edges[edge1 as usize].contains(&edge2)); + self.empty_couple_edges[edge1 as usize].push(edge2); } pub fn path_is_empty(&self, path: &[u32]) -> bool { for edge in path { - if self.empty_edges.contains(*edge) { + if self.empty_edges[*edge as usize] { return true; } } if self.empty_prefixes.contains_prefix_of_path(path) { return true; } + for (edge1, edges2) in self.empty_couple_edges.iter().enumerate() { + if let Some(pos_edge1) = path.iter().position(|e| *e == edge1 as u32) { + if path[pos_edge1..].iter().any(|e| edges2.contains(e)) { + return true; + } + } + } + // for (edge1, edge2) in self.empty_couple_edges.iter() { + // if path.contains(edge1) && path.contains(edge2) { + // return true; + // } + // } + // if self.empty_prefixes.contains_prefix_of_path(path) { + // return true; + // } false } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index b1adb80fc..ac5e1f46b 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -13,7 +13,6 @@ use heed::RoTxn; use roaring::RoaringBitmap; use self::empty_paths_cache::EmptyPathsCache; -use self::paths_map::PathsMap; use super::db_cache::DatabaseCache; use super::logger::SearchLogger; @@ -83,8 +82,11 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, - paths: &PathsMap, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, + universe: &RoaringBitmap, + distances: &[Vec], + cost: u64, logger: &mut dyn SearchLogger, ); } @@ -135,7 +137,7 @@ impl RankingRuleGraph { None } - fn remove_edge(&mut self, edge_index: u32) { + pub fn remove_edge(&mut self, edge_index: u32) { let edge_opt = &mut self.all_edges[edge_index as usize]; let Some(edge) = &edge_opt else { return }; let (from_node, _to_node) = (edge.from_node, edge.to_node); @@ -151,44 +153,4 @@ impl RankingRuleGraph { } self.successors[from_node as usize] = new_successors_from_node; } - - pub fn graphviz(&self) -> String { - let mut desc = String::new(); - desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n"); - - for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { - if matches!(node, QueryNode::Deleted) { - continue; - } - desc.push_str(&format!("{node_idx} [label = {:?}]", node)); - if node_idx == self.query_graph.root_node as usize { - desc.push_str("[color = blue]"); - } else if node_idx == self.query_graph.end_node as usize { - desc.push_str("[color = red]"); - } - desc.push_str(";\n"); - } - for edge in self.all_edges.iter().flatten() { - let Edge { from_node, to_node, details, .. } = edge; - - match &details { - EdgeDetails::Unconditional => { - desc.push_str(&format!( - "{from_node} -> {to_node} [label = \"always cost {cost}\"];\n", - cost = edge.cost, - )); - } - EdgeDetails::Data(details) => { - desc.push_str(&format!( - "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n", - cost = edge.cost, - edge_label = G::graphviz_edge_details_label(details) - )); - } - } - } - - desc.push('}'); - desc - } } diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 111b55140..3b01508c9 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -1,12 +1,11 @@ -use std::collections::hash_map::DefaultHasher; -use std::fmt::Write; -use std::hash::{Hash, Hasher}; + + + use roaring::RoaringBitmap; use super::cheapest_paths::Path; -use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::QueryNode; + #[derive(Debug, Clone)] pub struct PathsMap { @@ -157,6 +156,24 @@ impl PathsMap { } } + pub fn final_edges_ater_prefix(&self, prefix: &[u32]) -> Vec { + let [first_edge, remaining_prefix @ ..] = prefix else { + return self.nodes.iter().filter_map(|n| { + if n.1.value.is_some() { + Some(n.0) + } else { + None + } + }).collect(); + }; + for (edge, rest) in self.nodes.iter() { + if edge == first_edge { + return rest.final_edges_ater_prefix(remaining_prefix); + } + } + vec![] + } + pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec { let [first_edge, remaining_prefix @ ..] = prefix else { return self.nodes.iter().map(|n| n.0).collect(); @@ -185,88 +202,4 @@ impl PathsMap { } } } - - pub fn graphviz(&self, graph: &RankingRuleGraph) -> String { - let mut desc = String::new(); - desc.push_str("digraph G {\n"); - self.graphviz_rec(&mut desc, vec![], graph); - desc.push_str("\n}\n"); - desc - } - fn graphviz_rec( - &self, - desc: &mut String, - path_from: Vec, - graph: &RankingRuleGraph, - ) { - let id_from = { - let mut h = DefaultHasher::new(); - path_from.hash(&mut h); - h.finish() - }; - for (edge_idx, rest) in self.nodes.iter() { - let Some(Edge { from_node, to_node, cost, .. }) = graph.all_edges[*edge_idx as usize].as_ref() else { - continue; - }; - let mut path_to = path_from.clone(); - path_to.push({ - let mut h = DefaultHasher::new(); - edge_idx.hash(&mut h); - h.finish() - }); - let id_to = { - let mut h = DefaultHasher::new(); - path_to.hash(&mut h); - h.finish() - }; - writeln!(desc, "{id_to} [label = \"{from_node}→{to_node} [{cost}]\"];").unwrap(); - writeln!(desc, "{id_from} -> {id_to};").unwrap(); - - rest.graphviz_rec(desc, path_to, graph); - } - } -} - -impl RankingRuleGraph { - pub fn graphviz_with_path(&self, path: &Path) -> String { - let mut desc = String::new(); - desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n"); - - for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { - if matches!(node, QueryNode::Deleted) { - continue; - } - desc.push_str(&format!("{node_idx} [label = {:?}]", node)); - if node_idx == self.query_graph.root_node as usize { - desc.push_str("[color = blue]"); - } else if node_idx == self.query_graph.end_node as usize { - desc.push_str("[color = red]"); - } - desc.push_str(";\n"); - } - - for (edge_idx, edge) in self.all_edges.iter().enumerate() { - let Some(edge) = edge else { continue }; - let Edge { from_node, to_node, .. } = edge; - let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" }; - match &edge.details { - EdgeDetails::Unconditional => { - desc.push_str(&format!( - "{from_node} -> {to_node} [label = \"cost {cost}\", color = {color}];\n", - cost = edge.cost, - )); - } - EdgeDetails::Data(details) => { - desc.push_str(&format!( - "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n", - cost = edge.cost, - edge_label = G::graphviz_edge_details_label(details), - )); - } - } - } - - desc.push('}'); - desc - } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 6d2fefa65..9b4fa8edf 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -16,9 +16,9 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result { match value1 { QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), - QueryTerm::Phrase(phrase1) => { + QueryTerm::Phrase { phrase: phrase1 } => { // TODO: remove second unwrap - let original = phrase1.last().unwrap().as_ref().unwrap().clone(); + let original = phrase1.words.last().unwrap().as_ref().unwrap().clone(); ( WordDerivations { original: original.clone(), @@ -26,6 +26,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result Result( let (derivations2, pos2, ngram_len2) = match value2 { QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()), - QueryTerm::Phrase(phrase2) => { + QueryTerm::Phrase { phrase: phrase2 } => { // TODO: remove second unwrap - let original = phrase2.last().unwrap().as_ref().unwrap().clone(); + let original = phrase2.words.last().unwrap().as_ref().unwrap().clone(); ( WordDerivations { original: original.clone(), @@ -73,6 +77,8 @@ pub fn visit_to_node<'transaction, 'from_data>( one_typo: vec![], two_typos: vec![], use_prefix_db: false, + synonyms: vec![], + split_words: None, }, *pos2.start(), 1, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index fc1a44310..5b3869ea8 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -2,18 +2,21 @@ pub mod build; pub mod compute_docids; use heed::RoTxn; +use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; -use super::paths_map::PathsMap; + use super::{EdgeDetails, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; +use crate::new::logger::SearchLogger; use crate::new::query_term::WordDerivations; -use crate::new::QueryNode; +use crate::new::{QueryGraph, QueryNode}; use crate::{Index, Result}; +// TODO: intern the strings, refer to them by their pointer? + #[derive(Debug, Clone)] pub enum WordPair { - // TODO: add WordsSwapped and WordPrefixSwapped case Words { left: String, right: String }, WordsSwapped { left: String, right: String }, WordPrefix { left: String, right_prefix: String }, @@ -22,6 +25,7 @@ pub enum WordPair { #[derive(Clone)] pub struct ProximityEdge { + // TODO: use a list of pointers to the word pairs instead? pairs: Vec, proximity: u8, } @@ -67,10 +71,20 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &super::RankingRuleGraph, - paths: &PathsMap, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, - logger: &mut dyn crate::new::logger::SearchLogger, + universe: &RoaringBitmap, + distances: &[Vec], + cost: u64, + logger: &mut dyn SearchLogger, ) { - logger.log_proximity_state(graph, paths, empty_paths_cache); + logger.log_proximity_state( + graph, + paths, + empty_paths_cache, + universe, + distances.to_vec(), + cost, + ); } } diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index 90650340f..f3394206b 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -5,7 +5,7 @@ use roaring::{MultiOps, RoaringBitmap}; use super::edge_docids_cache::EdgeDocidsCache; use super::empty_paths_cache::EmptyPathsCache; -use super::paths_map::PathsMap; + use super::{RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; @@ -21,44 +21,65 @@ impl RankingRuleGraph { edge_docids_cache: &mut EdgeDocidsCache, empty_paths_cache: &mut EmptyPathsCache, universe: &RoaringBitmap, - mut paths: PathsMap, + mut paths: Vec>, ) -> Result { + paths.sort_unstable(); + let mut needs_filtering = false; let mut path_bitmaps = vec![]; + 'path_loop: loop { + if needs_filtering { + for path in paths.iter_mut() { + if empty_paths_cache.path_is_empty(path) { + path.clear(); + } + } + needs_filtering = false; + } + let Some(edge_indexes) = paths.pop() else { + break; + }; - paths.remove_edges(&empty_paths_cache.empty_edges); - paths.remove_prefixes(&empty_paths_cache.empty_prefixes); + if edge_indexes.is_empty() { + continue; + } - 'path_loop: while let Some((edge_indexes, _)) = paths.remove_first() { - // if path is excluded, continue... - let mut processed_edges = vec![]; let mut path_bitmap = universe.clone(); + let mut visited_edges = vec![]; + let mut cached_edge_docids = vec![]; 'edge_loop: for edge_index in edge_indexes { - processed_edges.push(edge_index); - let edge_docids = - edge_docids_cache.get_edge_docids(index, txn, db_cache, edge_index, self)?; + visited_edges.push(edge_index); + let edge_docids = edge_docids_cache + .get_edge_docids(index, txn, db_cache, edge_index, self, universe)?; match edge_docids { BitmapOrAllRef::Bitmap(edge_docids) => { + cached_edge_docids.push((edge_index, edge_docids.clone())); + let (_, edge_docids) = cached_edge_docids.last().unwrap(); if edge_docids.is_disjoint(universe) { // 1. Store in the cache that this edge is empty for this universe empty_paths_cache.forbid_edge(edge_index); - // 2. remove all the paths that contain this edge for this universe - paths.remove_edge(&edge_index); - // 3. remove this edge from the proximity graph - + // 2. remove this edge from the proximity graph self.remove_edge(edge_index); - - // 4. continue executing this function again on the remaining paths + edge_docids_cache.cache.remove(&edge_index); + needs_filtering = true; + // 3. continue executing this function again on the remaining paths continue 'path_loop; } else { path_bitmap &= edge_docids; if path_bitmap.is_disjoint(universe) { - // 1. Store in the cache that this prefix is empty for this universe - empty_paths_cache - .empty_prefixes - .insert(processed_edges.iter().copied(), ()); - // 2. remove all the paths beginning with this prefix - paths.remove_prefix(&processed_edges); - // 3. continue executing this function again on the remaining paths? + needs_filtering = true; + empty_paths_cache.forbid_prefix(&visited_edges); + // if the intersection between this edge and any + // previous one is disjoint with the universe, + // then we add these two edges to the empty_path_cache + for (edge_index2, edge_docids2) in + cached_edge_docids[..cached_edge_docids.len() - 1].iter() + { + let intersection = edge_docids & edge_docids2; + if intersection.is_disjoint(universe) { + empty_paths_cache + .forbid_couple_edges(*edge_index2, edge_index); + } + } continue 'path_loop; } } @@ -68,6 +89,7 @@ impl RankingRuleGraph { } path_bitmaps.push(path_bitmap); } + Ok(MultiOps::union(path_bitmaps)) } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 55a45e3c3..c9ca7c229 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -2,16 +2,18 @@ use heed::{BytesDecode, RoTxn}; use roaring::RoaringBitmap; use super::empty_paths_cache::EmptyPathsCache; -use super::paths_map::PathsMap; -use super::{EdgeDetails, RankingRuleGraphTrait}; + +use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; -use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::new::QueryNode; +use crate::new::logger::SearchLogger; +use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; +use crate::new::resolve_query_graph::resolve_phrase; +use crate::new::{QueryGraph, QueryNode}; use crate::{Index, Result, RoaringBitmapCodec}; #[derive(Clone)] pub enum TypoEdge { - Phrase, + Phrase { phrase: Phrase }, Word { derivations: WordDerivations, nbr_typos: u8 }, } @@ -23,7 +25,7 @@ impl RankingRuleGraphTrait for TypoGraph { fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String { match edge { - TypoEdge::Phrase => format!(", 0 typos"), + TypoEdge::Phrase { .. } => ", 0 typos".to_owned(), TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"), } } @@ -33,9 +35,9 @@ impl RankingRuleGraphTrait for TypoGraph { txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, edge: &Self::EdgeDetails, - ) -> Result { + ) -> Result { match edge { - TypoEdge::Phrase => todo!(), + TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase), TypoEdge::Word { derivations, nbr_typos } => { let words = match nbr_typos { 0 => &derivations.zero_typo, @@ -68,21 +70,23 @@ impl RankingRuleGraphTrait for TypoGraph { _index: &Index, _txn: &'transaction RoTxn, _db_cache: &mut DatabaseCache<'transaction>, - from_node: &QueryNode, + _from_node: &QueryNode, ) -> Result> { Ok(Some(())) } fn build_visit_to_node<'from_data, 'transaction: 'from_data>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, to_node: &QueryNode, - from_node_data: &'from_data Self::BuildVisitedFromNode, + _from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { match to_node { QueryNode::Term(LocatedQueryTerm { value, .. }) => match value { - QueryTerm::Phrase(_) => Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase))]), + QueryTerm::Phrase { phrase } => { + Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))]) + } QueryTerm::Word { derivations } => { let mut edges = vec![]; if !derivations.zero_typo.is_empty() || derivations.use_prefix_db { @@ -121,11 +125,14 @@ impl RankingRuleGraphTrait for TypoGraph { } fn log_state( - graph: &super::RankingRuleGraph, - paths: &PathsMap, + graph: &RankingRuleGraph, + paths: &[Vec], empty_paths_cache: &EmptyPathsCache, - logger: &mut dyn crate::new::logger::SearchLogger, + universe: &RoaringBitmap, + distances: &[Vec], + cost: u64, + logger: &mut dyn SearchLogger, ) { - logger.log_typo_state(graph, paths, empty_paths_cache); + logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost); } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index f023f94d1..9b3bcb38c 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -1,3 +1,5 @@ +use std::time::Instant; + use heed::RoTxn; use roaring::RoaringBitmap; @@ -9,7 +11,7 @@ use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; use crate::new::ranking_rule_graph::proximity::ProximityGraph; use crate::new::ranking_rule_graph::typo::TypoGraph; use crate::new::words::Words; -use crate::search::new::sort::Sort; +// use crate::search::new::sort::Sort; use crate::{Filter, Index, Result, TermsMatchingStrategy}; pub trait RankingRuleOutputIter<'transaction, Query> { @@ -123,13 +125,14 @@ pub fn execute_search<'transaction>( length: usize, logger: &mut dyn SearchLogger, ) -> Result> { + logger.initial_query(query_graph, Instant::now()); let words = &mut Words::new(TermsMatchingStrategy::Last); - let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; + // let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); let typo = &mut GraphBasedRankingRule::::new("typo".to_owned()); // TODO: ranking rules given as argument let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> = - vec![words, typo, proximity, sort]; + vec![words, typo, proximity /*sort*/]; logger.ranking_rules(&ranking_rules); @@ -144,7 +147,13 @@ pub fn execute_search<'transaction>( } let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, &universe); + logger.start_iteration_ranking_rule( + 0, + ranking_rules[0], + query_graph, + &universe, + Instant::now(), + ); ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?; let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; @@ -154,11 +163,12 @@ pub fn execute_search<'transaction>( macro_rules! back { () => { - // assert!(candidates[cur_ranking_rule_index].is_empty()); + assert!(candidates[cur_ranking_rule_index].is_empty()); logger.end_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates[cur_ranking_rule_index], + Instant::now(), ); candidates[cur_ranking_rule_index].clear(); ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger); @@ -187,6 +197,7 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates, + Instant::now(), ); } else { let all_candidates = candidates.iter().collect::>(); @@ -196,6 +207,7 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &skipped_candidates.into_iter().collect(), + Instant::now(), ); let candidates = candidates .iter() @@ -219,24 +231,26 @@ pub fn execute_search<'transaction>( // The universe for this bucket is zero or one element, so we don't need to sort // anything, just extend the results and go back to the parent ranking rule. if candidates[cur_ranking_rule_index].len() <= 1 { - candidates[cur_ranking_rule_index].clear(); maybe_add_to_results!(&candidates[cur_ranking_rule_index]); + candidates[cur_ranking_rule_index].clear(); back!(); continue; } - logger.next_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index], - &candidates[cur_ranking_rule_index], - ); - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else { // TODO: add remaining candidates automatically here? back!(); continue; }; + logger.next_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index], + &candidates[cur_ranking_rule_index], + &next_bucket.candidates, + Instant::now(), + ); + assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); candidates[cur_ranking_rule_index] -= &next_bucket.candidates; @@ -255,6 +269,7 @@ pub fn execute_search<'transaction>( ranking_rules[cur_ranking_rule_index], &next_bucket.query, &candidates[cur_ranking_rule_index], + Instant::now(), ); ranking_rules[cur_ranking_rule_index].start_iteration( index, @@ -271,17 +286,18 @@ pub fn execute_search<'transaction>( #[cfg(test)] mod tests { - use std::fs::File; - use std::io::{BufRead, BufReader, Cursor, Seek}; - use std::time::Instant; - - use heed::EnvOpenOptions; - use super::execute_search; + // use crate::allocator::ALLOC; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::new::db_cache::DatabaseCache; - use crate::new::logger::detailed::DetailedSearchLogger; + use big_s::S; + use heed::EnvOpenOptions; + use maplit::hashset; + use std::fs::File; + use std::io::{BufRead, BufReader, Cursor, Seek}; + use std::time::Instant; + // use crate::new::logger::detailed::DetailedSearchLogger; use crate::new::logger::{DefaultSearchLogger, SearchLogger}; use crate::new::make_query_graph; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; @@ -323,16 +339,119 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap(); - println!("{}", query_graph.graphviz()); - logger.initial_query(&query_graph); + make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") + .unwrap(); + logger.initial_query(&query_graph, Instant::now()); let results = - execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 20, &mut logger) + execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger) .unwrap(); println!("{results:?}") } + #[test] + fn search_wiki_new() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + let txn = index.read_txn().unwrap(); + + println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); + + let primary_key = index.primary_key(&txn).unwrap().unwrap(); + let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + // loop { + let start = Instant::now(); + + let mut db_cache = DatabaseCache::default(); + + let query_graph = make_query_graph( + &index, + &txn, + &mut db_cache, + "which a the releases from poison by the government", + ) + .unwrap(); + + // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + + let results = execute_search( + &index, + &txn, + &mut db_cache, + &query_graph, + None, + 0, + 20, + &mut DefaultSearchLogger, + // &mut logger, + ) + .unwrap(); + + // logger.write_d2_description(); + + let elapsed = start.elapsed(); + + let ids = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|x| { + let obkv = &x.1; + let id = obkv.get(primary_key).unwrap(); + let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + id.as_str().unwrap().to_owned() + }) + .collect::>(); + + println!("{}us: {results:?}", elapsed.as_micros()); + println!("external ids: {ids:?}"); + // println!("max_resident: {}", ALLOC.max_resident.load(std::sync::atomic::Ordering::SeqCst)); + // println!("allocated: {}", ALLOC.allocated.load(std::sync::atomic::Ordering::SeqCst)); + // } + } + + #[test] + fn search_wiki_old() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + + let txn = index.read_txn().unwrap(); + + let rr = index.criteria(&txn).unwrap(); + println!("{rr:?}"); + + let primary_key = index.primary_key(&txn).unwrap().unwrap(); + let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + + let start = Instant::now(); + + let mut s = Search::new(&txn, &index); + s.query("releases from poison by the government"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); + let docs = s.execute().unwrap(); + + let elapsed = start.elapsed(); + + let ids = index + .documents(&txn, docs.documents_ids.iter().copied()) + .unwrap() + .into_iter() + .map(|x| { + let obkv = &x.1; + let id = obkv.get(primary_key).unwrap(); + let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + id.as_str().unwrap().to_owned() + }) + .collect::>(); + + println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + println!("external ids: {ids:?}"); + } #[test] fn search_movies_new() { let mut options = EnvOpenOptions::new(); @@ -343,7 +462,7 @@ mod tests { let primary_key = index.primary_key(&txn).unwrap().unwrap(); let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - + // loop { let start = Instant::now(); let mut db_cache = DatabaseCache::default(); @@ -352,7 +471,7 @@ mod tests { make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") .unwrap(); - let mut logger = DetailedSearchLogger::new("log"); + let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, @@ -360,9 +479,10 @@ mod tests { &mut db_cache, &query_graph, None, - 5, + 0, 20, - &mut logger, //&mut DefaultSearchLogger, + // &mut DefaultSearchLogger, + &mut logger, ) .unwrap(); @@ -384,6 +504,7 @@ mod tests { println!("{}us: {results:?}", elapsed.as_micros()); println!("external ids: {ids:?}"); + // } } #[test] @@ -392,19 +513,39 @@ mod tests { options.map_size(100 * 1024 * 1024 * 1024); // 100 GB let index = Index::new(options, "data_movies").unwrap(); + let txn = index.read_txn().unwrap(); + let rr = index.criteria(&txn).unwrap(); + println!("{rr:?}"); + + let primary_key = index.primary_key(&txn).unwrap().unwrap(); + let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("b b b b b b b b b b"); + s.query("releases from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); let elapsed = start.elapsed(); + let ids = index + .documents(&txn, docs.documents_ids.iter().copied()) + .unwrap() + .into_iter() + .map(|x| { + let obkv = &x.1; + let id = obkv.get(primary_key).unwrap(); + let id: serde_json::Value = serde_json::from_slice(id).unwrap(); + id.as_str().unwrap().to_owned() + }) + .collect::>(); + println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids); + println!("external ids: {ids:?}"); } #[test] @@ -420,10 +561,16 @@ mod tests { builder.set_min_word_len_one_typo(5); builder.set_min_word_len_two_typos(100); - - builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + builder.set_sortable_fields(hashset! { S("release_date") }); + builder.set_criteria(vec![ + Criterion::Words, + Criterion::Typo, + Criterion::Proximity, + Criterion::Asc("release_date".to_owned()), + ]); builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); } #[test] @@ -445,6 +592,7 @@ mod tests { builder.set_searchable_fields(searchable_fields); let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); builder.set_filterable_fields(filterable_fields); + builder.set_min_word_len_one_typo(5); builder.set_min_word_len_two_typos(100); builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); @@ -467,6 +615,48 @@ mod tests { index.prepare_for_closing().wait(); } + #[test] + fn _index_wiki() { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + + let index = Index::new(options, "data_wiki").unwrap(); + let mut wtxn = index.write_txn().unwrap(); + + // let primary_key = "id"; + let searchable_fields = vec!["body", "title", "url"]; + // let filterable_fields = vec![]; + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + // builder.set_primary_key(primary_key.to_owned()); + let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + // builder.set_filterable_fields(filterable_fields); + + // builder.set_min_word_len_one_typo(5); + // builder.set_min_word_len_two_typos(100); + builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); + builder.execute(|_| (), || false).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) + .unwrap(); + + let documents = documents_from( + "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv", + "csv", + ); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + } fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { let reader = File::open(filename)