mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Rewrite cheapest path algorithm and empty path cache
It is now much simpler and has much better performance.
This commit is contained in:
parent
caa1e1b923
commit
c27ea2677f
@ -3,7 +3,53 @@
|
||||
|
||||
#[cfg(test)]
|
||||
#[global_allocator]
|
||||
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
|
||||
// #[cfg(test)]
|
||||
// pub mod allocator {
|
||||
// use std::alloc::{GlobalAlloc, System};
|
||||
// use std::sync::atomic::{self, AtomicI64};
|
||||
|
||||
// #[global_allocator]
|
||||
// pub static ALLOC: CountingAlloc = CountingAlloc {
|
||||
// max_resident: AtomicI64::new(0),
|
||||
// resident: AtomicI64::new(0),
|
||||
// allocated: AtomicI64::new(0),
|
||||
// };
|
||||
|
||||
// pub struct CountingAlloc {
|
||||
// pub max_resident: AtomicI64,
|
||||
// pub resident: AtomicI64,
|
||||
// pub allocated: AtomicI64,
|
||||
// }
|
||||
// unsafe impl GlobalAlloc for CountingAlloc {
|
||||
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
|
||||
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
|
||||
// let old_resident =
|
||||
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
|
||||
|
||||
// let resident = old_resident + layout.size() as i64;
|
||||
// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst);
|
||||
|
||||
// // if layout.size() > 1_000_000 {
|
||||
// // eprintln!(
|
||||
// // "allocating {} with new resident size: {resident}",
|
||||
// // layout.size() / 1_000_000
|
||||
// // );
|
||||
// // // let trace = std::backtrace::Backtrace::capture();
|
||||
// // // let t = trace.to_string();
|
||||
// // // eprintln!("{t}");
|
||||
// // }
|
||||
|
||||
// System.alloc(layout)
|
||||
// }
|
||||
|
||||
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
|
||||
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
|
||||
// System.dealloc(ptr, layout)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
#[macro_use]
|
||||
pub mod documents;
|
||||
|
@ -3,12 +3,11 @@ use roaring::RoaringBitmap;
|
||||
|
||||
use super::db_cache::DatabaseCache;
|
||||
use super::logger::SearchLogger;
|
||||
use super::ranking_rule_graph::cheapest_paths::KCheapestPathsState;
|
||||
use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache;
|
||||
use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
|
||||
use super::ranking_rule_graph::paths_map::PathsMap;
|
||||
|
||||
use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use super::{QueryGraph, RankingRule, RankingRuleOutput};
|
||||
use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput};
|
||||
|
||||
use crate::{Index, Result};
|
||||
|
||||
@ -24,9 +23,40 @@ impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
|
||||
|
||||
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
|
||||
graph: RankingRuleGraph<G>,
|
||||
cheapest_paths_state: Option<KCheapestPathsState>,
|
||||
edge_docids_cache: EdgeDocidsCache<G>,
|
||||
empty_paths_cache: EmptyPathsCache,
|
||||
all_distances: Vec<Vec<u64>>,
|
||||
cur_distance_idx: usize,
|
||||
}
|
||||
|
||||
fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>(
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
edge_docids_cache: &mut EdgeDocidsCache<G>,
|
||||
index: &Index,
|
||||
txn: &'transaction RoTxn,
|
||||
db_cache: &mut DatabaseCache<'transaction>,
|
||||
universe: &RoaringBitmap,
|
||||
empty_paths_cache: &mut EmptyPathsCache,
|
||||
) -> Result<()> {
|
||||
for edge_index in 0..graph.all_edges.len() as u32 {
|
||||
if graph.all_edges[edge_index as usize].is_none() {
|
||||
continue;
|
||||
}
|
||||
let docids = edge_docids_cache
|
||||
.get_edge_docids(index, txn, db_cache, edge_index, &*graph, universe)?;
|
||||
match docids {
|
||||
BitmapOrAllRef::Bitmap(bitmap) => {
|
||||
if bitmap.is_disjoint(universe) {
|
||||
graph.remove_edge(edge_index);
|
||||
empty_paths_cache.forbid_edge(edge_index);
|
||||
edge_docids_cache.cache.remove(&edge_index);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
BitmapOrAllRef::All => continue,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph>
|
||||
@ -41,18 +71,31 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
|
||||
txn: &'transaction RoTxn,
|
||||
db_cache: &mut DatabaseCache<'transaction>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
_universe: &RoaringBitmap,
|
||||
universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
) -> Result<()> {
|
||||
// TODO: update old state instead of starting from scratch
|
||||
let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?;
|
||||
let mut graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?;
|
||||
let mut edge_docids_cache = EdgeDocidsCache::default();
|
||||
let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len());
|
||||
|
||||
remove_empty_edges(
|
||||
&mut graph,
|
||||
&mut edge_docids_cache,
|
||||
index,
|
||||
txn,
|
||||
db_cache,
|
||||
universe,
|
||||
&mut empty_paths_cache,
|
||||
)?;
|
||||
let all_distances = graph.initialize_distances_cheapest();
|
||||
|
||||
let cheapest_paths_state = KCheapestPathsState::new(&graph);
|
||||
let state = GraphBasedRankingRuleState {
|
||||
graph,
|
||||
cheapest_paths_state,
|
||||
edge_docids_cache: <_>::default(),
|
||||
empty_paths_cache: <_>::default(),
|
||||
edge_docids_cache,
|
||||
empty_paths_cache,
|
||||
all_distances,
|
||||
cur_distance_idx: 0,
|
||||
};
|
||||
|
||||
self.state = Some(state);
|
||||
@ -70,34 +113,42 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
|
||||
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||
assert!(universe.len() > 1);
|
||||
let mut state = self.state.take().unwrap();
|
||||
if state.cheapest_paths_state.is_none() {
|
||||
remove_empty_edges(
|
||||
&mut state.graph,
|
||||
&mut state.edge_docids_cache,
|
||||
index,
|
||||
txn,
|
||||
db_cache,
|
||||
universe,
|
||||
&mut state.empty_paths_cache,
|
||||
)?;
|
||||
|
||||
if state.cur_distance_idx
|
||||
>= state.all_distances[state.graph.query_graph.root_node as usize].len()
|
||||
{
|
||||
self.state = None;
|
||||
return Ok(None);
|
||||
}
|
||||
let cost =
|
||||
state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];
|
||||
state.cur_distance_idx += 1;
|
||||
|
||||
let mut paths = PathsMap::default();
|
||||
let paths = state.graph.paths_of_cost(
|
||||
state.graph.query_graph.root_node as usize,
|
||||
cost,
|
||||
&state.all_distances,
|
||||
&state.empty_paths_cache,
|
||||
);
|
||||
|
||||
while paths.is_empty() {
|
||||
let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else {
|
||||
break;
|
||||
};
|
||||
if let Some(next_cheapest_paths_state) = cheapest_paths_state
|
||||
.compute_paths_of_next_lowest_cost(
|
||||
&mut state.graph,
|
||||
&state.empty_paths_cache,
|
||||
&mut paths,
|
||||
)
|
||||
{
|
||||
state.cheapest_paths_state = Some(next_cheapest_paths_state);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if paths.is_empty() && state.cheapest_paths_state.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger);
|
||||
G::log_state(
|
||||
&state.graph,
|
||||
&paths,
|
||||
&state.empty_paths_cache,
|
||||
universe,
|
||||
&state.all_distances,
|
||||
cost,
|
||||
logger,
|
||||
);
|
||||
|
||||
let bucket = state.graph.resolve_paths(
|
||||
index,
|
||||
|
@ -1,6 +1,8 @@
|
||||
|
||||
use rand::random;
|
||||
use roaring::RoaringBitmap;
|
||||
use std::fs::File;
|
||||
use std::time::Instant;
|
||||
use std::{io::Write, path::PathBuf};
|
||||
|
||||
use crate::new::ranking_rule_graph::typo::TypoGraph;
|
||||
@ -9,7 +11,7 @@ use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
||||
use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
|
||||
use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait};
|
||||
use crate::new::ranking_rule_graph::{
|
||||
paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph,
|
||||
proximity::ProximityGraph, RankingRuleGraph,
|
||||
};
|
||||
|
||||
use super::{RankingRule, SearchLogger};
|
||||
@ -19,14 +21,18 @@ pub enum SearchEvents {
|
||||
ranking_rule_idx: usize,
|
||||
query: QueryGraph,
|
||||
universe: RoaringBitmap,
|
||||
time: Instant,
|
||||
},
|
||||
RankingRuleNextBucket {
|
||||
ranking_rule_idx: usize,
|
||||
universe: RoaringBitmap,
|
||||
candidates: RoaringBitmap,
|
||||
time: Instant,
|
||||
},
|
||||
RankingRuleEndIteration {
|
||||
ranking_rule_idx: usize,
|
||||
universe: RoaringBitmap,
|
||||
time: Instant,
|
||||
},
|
||||
ExtendResults {
|
||||
new: Vec<u32>,
|
||||
@ -36,20 +42,27 @@ pub enum SearchEvents {
|
||||
},
|
||||
ProximityState {
|
||||
graph: RankingRuleGraph<ProximityGraph>,
|
||||
paths: PathsMap<u64>,
|
||||
paths: Vec<Vec<u32>>,
|
||||
empty_paths_cache: EmptyPathsCache,
|
||||
universe: RoaringBitmap,
|
||||
distances: Vec<Vec<u64>>,
|
||||
cost: u64,
|
||||
},
|
||||
TypoState {
|
||||
graph: RankingRuleGraph<TypoGraph>,
|
||||
paths: PathsMap<u64>,
|
||||
paths: Vec<Vec<u32>>,
|
||||
empty_paths_cache: EmptyPathsCache,
|
||||
universe: RoaringBitmap,
|
||||
distances: Vec<Vec<u64>>,
|
||||
cost: u64,
|
||||
},
|
||||
RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap },
|
||||
RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant, },
|
||||
}
|
||||
|
||||
pub struct DetailedSearchLogger {
|
||||
folder_path: PathBuf,
|
||||
initial_query: Option<QueryGraph>,
|
||||
initial_query_time: Option<Instant>,
|
||||
initial_universe: Option<RoaringBitmap>,
|
||||
ranking_rules_ids: Option<Vec<String>>,
|
||||
events: Vec<SearchEvents>,
|
||||
@ -58,17 +71,19 @@ impl DetailedSearchLogger {
|
||||
pub fn new(folder_path: &str) -> Self {
|
||||
Self {
|
||||
folder_path: PathBuf::new().join(folder_path),
|
||||
initial_query: <_>::default(),
|
||||
initial_universe: <_>::default(),
|
||||
ranking_rules_ids: <_>::default(),
|
||||
events: <_>::default(),
|
||||
initial_query: None,
|
||||
initial_query_time: None,
|
||||
initial_universe: None,
|
||||
ranking_rules_ids: None,
|
||||
events: vec![],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SearchLogger<QueryGraph> for DetailedSearchLogger {
|
||||
fn initial_query(&mut self, query: &QueryGraph) {
|
||||
fn initial_query(&mut self, query: &QueryGraph, time: Instant) {
|
||||
self.initial_query = Some(query.clone());
|
||||
self.initial_query_time = Some(time);
|
||||
}
|
||||
|
||||
fn initial_universe(&mut self, universe: &RoaringBitmap) {
|
||||
@ -84,11 +99,13 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
|
||||
_ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
|
||||
query: &QueryGraph,
|
||||
universe: &RoaringBitmap,
|
||||
time: Instant,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleStartIteration {
|
||||
ranking_rule_idx,
|
||||
query: query.clone(),
|
||||
universe: universe.clone(),
|
||||
time,
|
||||
})
|
||||
}
|
||||
|
||||
@ -97,10 +114,14 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
candidates: &RoaringBitmap,
|
||||
time: Instant,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleNextBucket {
|
||||
ranking_rule_idx,
|
||||
universe: universe.clone(),
|
||||
candidates: candidates.clone(),
|
||||
time,
|
||||
})
|
||||
}
|
||||
fn skip_bucket_ranking_rule<'transaction>(
|
||||
@ -108,10 +129,12 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
|
||||
candidates: &RoaringBitmap,
|
||||
time: Instant,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleSkipBucket {
|
||||
ranking_rule_idx,
|
||||
candidates: candidates.clone(),
|
||||
time
|
||||
})
|
||||
}
|
||||
|
||||
@ -120,10 +143,12 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
time: Instant,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleEndIteration {
|
||||
ranking_rule_idx,
|
||||
universe: universe.clone(),
|
||||
time
|
||||
})
|
||||
}
|
||||
fn add_to_results(&mut self, docids: &[u32]) {
|
||||
@ -134,18 +159,19 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
|
||||
self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() });
|
||||
}
|
||||
|
||||
fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph<ProximityGraph>, paths_map: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache) {
|
||||
self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() })
|
||||
fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph<ProximityGraph>, paths_map: &[Vec<u32>], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec<Vec<u64>>, cost: u64,) {
|
||||
self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost })
|
||||
}
|
||||
|
||||
fn log_typo_state(&mut self, query_graph: &RankingRuleGraph<TypoGraph>, paths_map: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache) {
|
||||
self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() })
|
||||
fn log_typo_state(&mut self, query_graph: &RankingRuleGraph<TypoGraph>, paths_map: &[Vec<u32>], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec<Vec<u64>>, cost: u64,) {
|
||||
self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost })
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl DetailedSearchLogger {
|
||||
pub fn write_d2_description(&self) {
|
||||
let mut prev_time = self.initial_query_time.unwrap();
|
||||
let mut timestamp = vec![];
|
||||
fn activated_id(timestamp: &[usize]) -> String {
|
||||
let mut s = String::new();
|
||||
@ -164,13 +190,16 @@ impl DetailedSearchLogger {
|
||||
writeln!(&mut file, "{idx}: {rr_id}").unwrap();
|
||||
}
|
||||
writeln!(&mut file, "results").unwrap();
|
||||
// writeln!(&mut file, "time").unwrap();
|
||||
for event in self.events.iter() {
|
||||
match event {
|
||||
SearchEvents::RankingRuleStartIteration { ranking_rule_idx, .. } => {
|
||||
|
||||
SearchEvents::RankingRuleStartIteration { ranking_rule_idx, time, .. } => {
|
||||
let elapsed = time.duration_since(prev_time);
|
||||
prev_time = *time;
|
||||
let parent_activated_id = activated_id(&timestamp);
|
||||
timestamp.push(0);
|
||||
let self_activated_id = activated_id(&timestamp);
|
||||
// writeln!(&mut file, "time.{self_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
|
||||
if *ranking_rule_idx != 0 {
|
||||
let parent_ranking_rule_idx = ranking_rule_idx - 1;
|
||||
writeln!(
|
||||
@ -186,16 +215,22 @@ impl DetailedSearchLogger {
|
||||
}}
|
||||
}}").unwrap();
|
||||
}
|
||||
SearchEvents::RankingRuleNextBucket { ranking_rule_idx, .. } => {
|
||||
SearchEvents::RankingRuleNextBucket { ranking_rule_idx, time, universe, candidates } => {
|
||||
let elapsed = time.duration_since(prev_time);
|
||||
prev_time = *time;
|
||||
let old_activated_id = activated_id(&timestamp);
|
||||
// writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
|
||||
*timestamp.last_mut().unwrap() += 1;
|
||||
let next_activated_id = activated_id(&timestamp);
|
||||
writeln!(&mut file,
|
||||
"{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket",)
|
||||
"{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket {}/{}", candidates.len(), universe.len())
|
||||
.unwrap();
|
||||
}
|
||||
SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates } => {
|
||||
SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates, time } => {
|
||||
let elapsed = time.duration_since(prev_time);
|
||||
prev_time = *time;
|
||||
let old_activated_id = activated_id(&timestamp);
|
||||
// writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
|
||||
*timestamp.last_mut().unwrap() += 1;
|
||||
let next_activated_id = activated_id(&timestamp);
|
||||
let len = candidates.len();
|
||||
@ -203,8 +238,12 @@ impl DetailedSearchLogger {
|
||||
"{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : skip bucket ({len})",)
|
||||
.unwrap();
|
||||
}
|
||||
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, .. } => {
|
||||
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, time, .. } => {
|
||||
let elapsed = time.duration_since(prev_time);
|
||||
prev_time = *time;
|
||||
let cur_activated_id = activated_id(&timestamp);
|
||||
// writeln!(&mut file, "time.{cur_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
|
||||
|
||||
timestamp.pop();
|
||||
let parent_activated_id = activated_id(&timestamp);
|
||||
let parent_ranking_rule = if *ranking_rule_idx == 0 {
|
||||
@ -254,43 +293,48 @@ results.{random} {{
|
||||
link: \"{id}.d2.svg\"
|
||||
}}").unwrap();
|
||||
},
|
||||
SearchEvents::ProximityState { graph, paths, empty_paths_cache } => {
|
||||
SearchEvents::ProximityState { graph, paths, empty_paths_cache, universe, distances, cost } => {
|
||||
let cur_ranking_rule = timestamp.len() - 1;
|
||||
let cur_activated_id = activated_id(&timestamp);
|
||||
let id = format!("{cur_ranking_rule}.{cur_activated_id}");
|
||||
let new_file_path = self.folder_path.join(format!("{id}.d2"));
|
||||
let mut new_file = std::fs::File::create(new_file_path).unwrap();
|
||||
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
|
||||
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
|
||||
writeln!(
|
||||
&mut file,
|
||||
"{id} {{
|
||||
link: \"{id}.d2.svg\"
|
||||
}}").unwrap();
|
||||
tooltip: \"cost {cost}, universe len: {}\"
|
||||
}}", universe.len()).unwrap();
|
||||
},
|
||||
SearchEvents::TypoState { graph, paths, empty_paths_cache } => {
|
||||
SearchEvents::TypoState { graph, paths, empty_paths_cache, universe, distances, cost } => {
|
||||
let cur_ranking_rule = timestamp.len() - 1;
|
||||
let cur_activated_id = activated_id(&timestamp);
|
||||
let id = format!("{cur_ranking_rule}.{cur_activated_id}");
|
||||
let new_file_path = self.folder_path.join(format!("{id}.d2"));
|
||||
let mut new_file = std::fs::File::create(new_file_path).unwrap();
|
||||
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
|
||||
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
|
||||
writeln!(
|
||||
&mut file,
|
||||
"{id} {{
|
||||
link: \"{id}.d2.svg\"
|
||||
}}").unwrap();
|
||||
tooltip: \"cost {cost}, universe len: {}\"
|
||||
}}", universe.len()).unwrap();
|
||||
},
|
||||
}
|
||||
}
|
||||
writeln!(&mut file, "}}").unwrap();
|
||||
}
|
||||
|
||||
fn query_node_d2_desc(node_idx: usize, node: &QueryNode, file: &mut File) {
|
||||
fn query_node_d2_desc(node_idx: usize, node: &QueryNode, distances: &[u64], file: &mut File) {
|
||||
match &node {
|
||||
QueryNode::Term(LocatedQueryTerm { value, .. }) => {
|
||||
match value {
|
||||
QueryTerm::Phrase(_) => todo!(),
|
||||
QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db } } => {
|
||||
QueryTerm::Phrase { phrase } => {
|
||||
let phrase_str = phrase.description();
|
||||
writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap();
|
||||
},
|
||||
QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => {
|
||||
writeln!(file,"{node_idx} : \"{original}\" {{
|
||||
shape: class").unwrap();
|
||||
for w in zero_typo {
|
||||
@ -302,9 +346,19 @@ shape: class").unwrap();
|
||||
for w in two_typos {
|
||||
writeln!(file, "\"{w}\" : 2").unwrap();
|
||||
}
|
||||
if let Some((left, right)) = split_words {
|
||||
writeln!(file, "\"{left} {right}\" : split_words").unwrap();
|
||||
}
|
||||
for synonym in synonyms {
|
||||
writeln!(file, "\"{}\" : synonym", synonym.description()).unwrap();
|
||||
}
|
||||
if *use_prefix_db {
|
||||
writeln!(file, "use prefix DB : true").unwrap();
|
||||
}
|
||||
// for (i, d) in distances.iter().enumerate() {
|
||||
// writeln!(file, "\"distances\" : {d}").unwrap();
|
||||
// }
|
||||
|
||||
writeln!(file, "}}").unwrap();
|
||||
},
|
||||
}
|
||||
@ -324,14 +378,14 @@ shape: class").unwrap();
|
||||
if matches!(query_graph.nodes[node], QueryNode::Deleted) {
|
||||
continue;
|
||||
}
|
||||
Self::query_node_d2_desc(node, &query_graph.nodes[node], file);
|
||||
Self::query_node_d2_desc(node, &query_graph.nodes[node], &[], file);
|
||||
|
||||
for edge in query_graph.edges[node].successors.iter() {
|
||||
writeln!(file, "{node} -> {edge};\n").unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache, file: &mut File) {
|
||||
fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], _empty_paths_cache: &EmptyPathsCache, distances: Vec<Vec<u64>>, file: &mut File) {
|
||||
writeln!(file,"direction: right").unwrap();
|
||||
|
||||
writeln!(file, "Proximity Graph {{").unwrap();
|
||||
@ -339,7 +393,8 @@ shape: class").unwrap();
|
||||
if matches!(node, QueryNode::Deleted) {
|
||||
continue;
|
||||
}
|
||||
Self::query_node_d2_desc(node_idx, node, file);
|
||||
let distances = &distances[node_idx];
|
||||
Self::query_node_d2_desc(node_idx, node, distances.as_slice(), file);
|
||||
}
|
||||
for edge in graph.all_edges.iter().flatten() {
|
||||
let Edge { from_node, to_node, details, .. } = edge;
|
||||
@ -362,26 +417,39 @@ shape: class").unwrap();
|
||||
}
|
||||
writeln!(file, "}}").unwrap();
|
||||
|
||||
// writeln!(file, "Distances {{").unwrap();
|
||||
// Self::paths_d2_description(graph, paths, file);
|
||||
// writeln!(file, "}}").unwrap();
|
||||
|
||||
|
||||
writeln!(file, "Shortest Paths {{").unwrap();
|
||||
Self::paths_d2_description(graph, "", paths, file);
|
||||
Self::paths_d2_description(graph, paths, file);
|
||||
writeln!(file, "}}").unwrap();
|
||||
|
||||
writeln!(file, "Empty Path Prefixes {{").unwrap();
|
||||
Self::paths_d2_description(graph, "", &empty_paths_cache.empty_prefixes, file);
|
||||
writeln!(file, "}}").unwrap();
|
||||
// writeln!(file, "Empty Edge Couples {{").unwrap();
|
||||
// for (i, (e1, e2)) in empty_paths_cache.empty_couple_edges.iter().enumerate() {
|
||||
// writeln!(file, "{i} : \"\" {{").unwrap();
|
||||
// Self::edge_d2_description(graph, *e1, file);
|
||||
// Self::edge_d2_description(graph, *e2, file);
|
||||
// writeln!(file, "{e1} -- {e2}").unwrap();
|
||||
// writeln!(file, "}}").unwrap();
|
||||
// }
|
||||
// writeln!(file, "}}").unwrap();
|
||||
|
||||
writeln!(file, "Removed Edges {{").unwrap();
|
||||
for edge_idx in empty_paths_cache.empty_edges.iter() {
|
||||
writeln!(file, "{edge_idx}").unwrap();
|
||||
}
|
||||
writeln!(file, "}}").unwrap();
|
||||
// writeln!(file, "Removed Edges {{").unwrap();
|
||||
// for edge_idx in empty_paths_cache.empty_edges.iter() {
|
||||
// writeln!(file, "{edge_idx}").unwrap();
|
||||
// }
|
||||
// writeln!(file, "}}").unwrap();
|
||||
}
|
||||
fn edge_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths_idx: &str, edge_idx: u32, file: &mut File) -> String {
|
||||
fn edge_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, edge_idx: u32, file: &mut File) {
|
||||
let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ;
|
||||
let from_node = &graph.query_graph.nodes[*from_node as usize];
|
||||
let from_node_desc = match from_node {
|
||||
QueryNode::Term(term) => match &term.value {
|
||||
QueryTerm::Phrase(_) => todo!(),
|
||||
QueryTerm::Phrase { phrase } => {
|
||||
phrase.description()
|
||||
},
|
||||
QueryTerm::Word { derivations } => derivations.original.clone(),
|
||||
},
|
||||
QueryNode::Deleted => panic!(),
|
||||
@ -391,27 +459,28 @@ shape: class").unwrap();
|
||||
let to_node = &graph.query_graph.nodes[*to_node as usize];
|
||||
let to_node_desc = match to_node {
|
||||
QueryNode::Term(term) => match &term.value {
|
||||
QueryTerm::Phrase(_) => todo!(),
|
||||
QueryTerm::Phrase { phrase } => phrase.description(),
|
||||
QueryTerm::Word { derivations } => derivations.original.clone(),
|
||||
},
|
||||
QueryNode::Deleted => panic!(),
|
||||
QueryNode::Start => "START".to_owned(),
|
||||
QueryNode::End => "END".to_owned(),
|
||||
};
|
||||
let edge_id = format!("{paths_idx}{edge_idx}");
|
||||
writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{
|
||||
writeln!(file, "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{
|
||||
shape: class
|
||||
}}").unwrap();
|
||||
edge_id
|
||||
}
|
||||
fn paths_d2_description<R: RankingRuleGraphTrait, T>(graph: &RankingRuleGraph<R>, paths_idx: &str, paths: &PathsMap<T>, file: &mut File) {
|
||||
for (edge_idx, rest) in paths.nodes.iter() {
|
||||
let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file);
|
||||
for (dest_edge_idx, _) in rest.nodes.iter() {
|
||||
let dest_edge_id = format!("{paths_idx}{edge_idx}{dest_edge_idx}");
|
||||
writeln!(file, "{edge_id} -> {dest_edge_id}").unwrap();
|
||||
fn paths_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], file: &mut File) {
|
||||
for (path_idx, edge_indexes) in paths.iter().enumerate() {
|
||||
writeln!(file, "{path_idx} {{").unwrap();
|
||||
for edge_idx in edge_indexes.iter() {
|
||||
Self::edge_d2_description(graph, *edge_idx, file);
|
||||
}
|
||||
Self::paths_d2_description(graph, &format!("{paths_idx}{edge_idx}"), rest, file);
|
||||
for couple_edges in edge_indexes.windows(2) {
|
||||
let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() };
|
||||
writeln!(file, "{src_edge_idx} -> {dest_edge_idx}").unwrap();
|
||||
}
|
||||
writeln!(file, "}}").unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,28 +2,31 @@
|
||||
pub mod detailed;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
use std::time::Instant;
|
||||
|
||||
use super::{
|
||||
ranking_rule_graph::{
|
||||
empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph,
|
||||
typo::TypoGraph, RankingRuleGraph,
|
||||
empty_paths_cache::EmptyPathsCache, proximity::ProximityGraph, typo::TypoGraph,
|
||||
RankingRuleGraph,
|
||||
},
|
||||
RankingRule, RankingRuleQueryTrait,
|
||||
};
|
||||
|
||||
pub struct DefaultSearchLogger;
|
||||
impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
fn initial_query(&mut self, _query: &Q) {}
|
||||
fn initial_query(&mut self, _query: &Q, _time: Instant) {}
|
||||
|
||||
fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
|
||||
|
||||
fn ranking_rules(&mut self, _rr: &[&mut dyn RankingRule<Q>]) {}
|
||||
|
||||
fn start_iteration_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
_query: &Q,
|
||||
_universe: &RoaringBitmap,
|
||||
_time: Instant,
|
||||
) {
|
||||
}
|
||||
|
||||
@ -32,6 +35,8 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
_universe: &RoaringBitmap,
|
||||
_candidates: &RoaringBitmap,
|
||||
_time: Instant,
|
||||
) {
|
||||
}
|
||||
fn skip_bucket_ranking_rule<'transaction>(
|
||||
@ -39,6 +44,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
_candidates: &RoaringBitmap,
|
||||
_time: Instant,
|
||||
) {
|
||||
}
|
||||
|
||||
@ -47,6 +53,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
_universe: &RoaringBitmap,
|
||||
_time: Instant,
|
||||
) {
|
||||
}
|
||||
|
||||
@ -57,22 +64,28 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
fn log_proximity_state(
|
||||
&mut self,
|
||||
_query_graph: &RankingRuleGraph<ProximityGraph>,
|
||||
_paths_map: &PathsMap<u64>,
|
||||
_paths_map: &[Vec<u32>],
|
||||
_empty_paths_cache: &EmptyPathsCache,
|
||||
_universe: &RoaringBitmap,
|
||||
_distances: Vec<Vec<u64>>,
|
||||
_cost: u64,
|
||||
) {
|
||||
}
|
||||
|
||||
fn log_typo_state(
|
||||
&mut self,
|
||||
query_graph: &RankingRuleGraph<TypoGraph>,
|
||||
paths: &PathsMap<u64>,
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
_query_graph: &RankingRuleGraph<TypoGraph>,
|
||||
_paths: &[Vec<u32>],
|
||||
_empty_paths_cache: &EmptyPathsCache,
|
||||
_universe: &RoaringBitmap,
|
||||
_distances: Vec<Vec<u64>>,
|
||||
_cost: u64,
|
||||
) {
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
fn initial_query(&mut self, query: &Q);
|
||||
fn initial_query(&mut self, query: &Q, time: Instant);
|
||||
fn initial_universe(&mut self, universe: &RoaringBitmap);
|
||||
|
||||
fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
|
||||
@ -83,24 +96,29 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
query: &Q,
|
||||
universe: &RoaringBitmap,
|
||||
time: Instant,
|
||||
);
|
||||
fn next_bucket_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
universe: &RoaringBitmap,
|
||||
candidates: &RoaringBitmap,
|
||||
time: Instant,
|
||||
);
|
||||
fn skip_bucket_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
candidates: &RoaringBitmap,
|
||||
time: Instant,
|
||||
);
|
||||
fn end_iteration_ranking_rule<'transaction>(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'transaction, Q>,
|
||||
universe: &RoaringBitmap,
|
||||
time: Instant,
|
||||
);
|
||||
fn add_to_results(&mut self, docids: &[u32]);
|
||||
|
||||
@ -109,14 +127,20 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
fn log_proximity_state(
|
||||
&mut self,
|
||||
query_graph: &RankingRuleGraph<ProximityGraph>,
|
||||
paths: &PathsMap<u64>,
|
||||
paths: &[Vec<u32>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
_distances: Vec<Vec<u64>>,
|
||||
cost: u64,
|
||||
);
|
||||
|
||||
fn log_typo_state(
|
||||
&mut self,
|
||||
query_graph: &RankingRuleGraph<TypoGraph>,
|
||||
paths: &PathsMap<u64>,
|
||||
paths: &[Vec<u32>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
_distances: Vec<Vec<u64>>,
|
||||
cost: u64,
|
||||
);
|
||||
}
|
||||
|
@ -1,10 +1,8 @@
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
#![allow(clippy::too_many_arguments)]
|
||||
|
||||
use super::empty_paths_cache::EmptyPathsCache;
|
||||
use super::paths_map::PathsMap;
|
||||
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use super::{RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use std::collections::VecDeque;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct Path {
|
||||
@ -12,226 +10,119 @@ pub struct Path {
|
||||
pub cost: u64,
|
||||
}
|
||||
|
||||
/// Working state of the Dijkstra search run by `cheapest_path_to_end`.
///
/// Every vector is created with one slot per node of the query graph and is
/// indexed by node id.
struct DijkstraState {
|
||||
/// Nodes that have not been settled yet; starts as every node id.
unvisited: RoaringBitmap, // should be a small bitset?
|
||||
/// Best known distance from the start node; `u64::MAX` means "not reached".
distances: Vec<u64>, // or binary heap, or btreemap? (f64, usize)
|
||||
/// Index of the edge used to reach each node on its best known path
/// (`u32::MAX` until one is recorded).
edges: Vec<u32>,
|
||||
/// Cost of the edge stored in `edges` for the same node.
edge_costs: Vec<u8>,
|
||||
/// Predecessor node on the best known path, `None` until one is recorded.
paths: Vec<Option<u32>>,
|
||||
}
|
||||
|
||||
/// State of the incremental search for the successive cheapest paths of a
/// ranking rule graph.
pub struct KCheapestPathsState {
|
||||
/// Paths already selected as cheapest, each stored together with its cost.
cheapest_paths: PathsMap<u64>,
|
||||
/// Candidate paths that have not been selected yet, grouped by total cost.
potential_cheapest_paths: BTreeMap<u64, PathsMap<u64>>,
|
||||
/// The most recently selected path; `next_cost` returns its cost.
pub kth_cheapest_path: Path,
|
||||
}
|
||||
|
||||
impl KCheapestPathsState {
|
||||
pub fn next_cost(&self) -> u64 {
|
||||
self.kth_cheapest_path.cost
|
||||
}
|
||||
|
||||
pub fn new<G: RankingRuleGraphTrait>(
|
||||
graph: &RankingRuleGraph<G>,
|
||||
) -> Option<KCheapestPathsState> {
|
||||
let Some(cheapest_path) = graph.cheapest_path_to_end(graph.query_graph.root_node) else {
|
||||
return None
|
||||
};
|
||||
let cheapest_paths = PathsMap::from_paths(&[cheapest_path.clone()]);
|
||||
let potential_cheapest_paths = BTreeMap::new();
|
||||
Some(KCheapestPathsState {
|
||||
cheapest_paths,
|
||||
potential_cheapest_paths,
|
||||
kth_cheapest_path: cheapest_path,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn remove_empty_paths(mut self, empty_paths_cache: &EmptyPathsCache) -> Option<Self> {
|
||||
self.cheapest_paths.remove_edges(&empty_paths_cache.empty_edges);
|
||||
self.cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
|
||||
|
||||
let mut costs_to_delete = HashSet::new();
|
||||
for (cost, potential_cheapest_paths) in self.potential_cheapest_paths.iter_mut() {
|
||||
potential_cheapest_paths.remove_edges(&empty_paths_cache.empty_edges);
|
||||
potential_cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
|
||||
if potential_cheapest_paths.is_empty() {
|
||||
costs_to_delete.insert(*cost);
|
||||
}
|
||||
}
|
||||
for cost in costs_to_delete {
|
||||
self.potential_cheapest_paths.remove(&cost);
|
||||
}
|
||||
|
||||
if self.cheapest_paths.is_empty() {}
|
||||
|
||||
todo!()
|
||||
}
|
||||
|
||||
pub fn compute_paths_of_next_lowest_cost<G: RankingRuleGraphTrait>(
|
||||
mut self,
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
pub fn paths_of_cost(
|
||||
&self,
|
||||
from: usize,
|
||||
cost: u64,
|
||||
all_distances: &[Vec<u64>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
into_map: &mut PathsMap<u64>,
|
||||
) -> Option<Self> {
|
||||
if !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) {
|
||||
into_map.add_path(&self.kth_cheapest_path);
|
||||
) -> Vec<Vec<u32>> {
|
||||
let mut paths = vec![];
|
||||
self.paths_of_cost_rec(
|
||||
from,
|
||||
all_distances,
|
||||
cost,
|
||||
&mut vec![],
|
||||
&mut paths,
|
||||
&vec![false; self.all_edges.len()],
|
||||
empty_paths_cache,
|
||||
);
|
||||
paths
|
||||
}
|
||||
pub fn paths_of_cost_rec(
|
||||
&self,
|
||||
from: usize,
|
||||
all_distances: &[Vec<u64>],
|
||||
cost: u64,
|
||||
prev_edges: &mut Vec<u32>,
|
||||
paths: &mut Vec<Vec<u32>>,
|
||||
forbidden_edges: &[bool],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
) {
|
||||
let distances = &all_distances[from];
|
||||
if !distances.contains(&cost) {
|
||||
panic!();
|
||||
}
|
||||
let cur_cost = self.kth_cheapest_path.cost;
|
||||
while self.kth_cheapest_path.cost <= cur_cost {
|
||||
if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) {
|
||||
self = next_self;
|
||||
if self.kth_cheapest_path.cost == cur_cost
|
||||
&& !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges)
|
||||
let tos = &self.query_graph.edges[from].successors;
|
||||
let mut valid_edges = vec![];
|
||||
for to in tos {
|
||||
self.visit_edges::<()>(from as u32, to, |edge_idx, edge| {
|
||||
if cost >= edge.cost as u64
|
||||
&& all_distances[to as usize].contains(&(cost - edge.cost as u64))
|
||||
&& !forbidden_edges[edge_idx as usize]
|
||||
{
|
||||
into_map.add_path(&self.kth_cheapest_path);
|
||||
} else {
|
||||
break;
|
||||
valid_edges.push((edge_idx, edge.cost, to));
|
||||
}
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
std::ops::ControlFlow::Continue(())
|
||||
});
|
||||
}
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn compute_next_cheapest_paths<G: RankingRuleGraphTrait>(
|
||||
mut self,
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
) -> Option<KCheapestPathsState> {
|
||||
// for all nodes in the last cheapest path (called spur_node), except last one...
|
||||
for (i, edge_idx) in self.kth_cheapest_path.edges[..self.kth_cheapest_path.edges.len() - 1]
|
||||
.iter()
|
||||
.enumerate()
|
||||
{
|
||||
let Some(edge) = graph.all_edges[*edge_idx as usize].as_ref() else { continue; };
|
||||
let Edge { from_node: spur_node, .. } = edge;
|
||||
|
||||
let root_path = &self.kth_cheapest_path.edges[..i];
|
||||
if empty_paths_cache.path_is_empty(root_path) {
|
||||
for (edge_idx, edge_cost, to) in valid_edges {
|
||||
prev_edges.push(edge_idx);
|
||||
if empty_paths_cache.empty_prefixes.contains_prefix_of_path(prev_edges) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let root_cost = root_path.iter().fold(0, |sum, next| {
|
||||
sum + graph.all_edges[*next as usize].as_ref().unwrap().cost as u64
|
||||
});
|
||||
|
||||
let mut tmp_removed_edges = vec![];
|
||||
// for all the paths already found that share a common prefix with the root path
|
||||
// we delete the edge from the spur node to the next one
|
||||
for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) {
|
||||
let was_removed =
|
||||
graph.node_edges[*spur_node as usize].remove(edge_index_to_remove);
|
||||
if was_removed {
|
||||
tmp_removed_edges.push(edge_index_to_remove);
|
||||
}
|
||||
let mut new_forbidden_edges = forbidden_edges.to_vec();
|
||||
for edge_idx in empty_paths_cache.empty_couple_edges[edge_idx as usize].iter() {
|
||||
new_forbidden_edges[*edge_idx as usize] = true;
|
||||
}
|
||||
for edge_idx in empty_paths_cache.empty_prefixes.final_edges_ater_prefix(prev_edges) {
|
||||
new_forbidden_edges[edge_idx as usize] = true;
|
||||
}
|
||||
|
||||
// Compute the cheapest path from the spur node to the destination
|
||||
// we will combine it with the root path to get a potential kth cheapest path
|
||||
let spur_path = graph.cheapest_path_to_end(*spur_node);
|
||||
// restore the temporarily removed edges
|
||||
graph.node_edges[*spur_node as usize].extend(tmp_removed_edges);
|
||||
|
||||
let Some(spur_path) = spur_path else { continue; };
|
||||
let total_cost = root_cost + spur_path.cost;
|
||||
let total_path = Path {
|
||||
edges: root_path.iter().chain(spur_path.edges.iter()).cloned().collect(),
|
||||
cost: total_cost,
|
||||
};
|
||||
let entry = self.potential_cheapest_paths.entry(total_cost).or_default();
|
||||
entry.add_path(&total_path);
|
||||
if to == self.query_graph.end_node {
|
||||
paths.push(prev_edges.clone());
|
||||
} else {
|
||||
self.paths_of_cost_rec(
|
||||
to as usize,
|
||||
all_distances,
|
||||
cost - edge_cost as u64,
|
||||
prev_edges,
|
||||
paths,
|
||||
&new_forbidden_edges,
|
||||
empty_paths_cache,
|
||||
)
|
||||
}
|
||||
prev_edges.pop();
|
||||
}
|
||||
while let Some(mut next_cheapest_paths_entry) = self.potential_cheapest_paths.first_entry()
|
||||
}
|
||||
|
||||
pub fn initialize_distances_cheapest(&self) -> Vec<Vec<u64>> {
|
||||
let mut distances_to_end: Vec<Vec<u64>> = vec![vec![]; self.query_graph.nodes.len()];
|
||||
let mut enqueued = vec![false; self.query_graph.nodes.len()];
|
||||
|
||||
let mut node_stack = VecDeque::new();
|
||||
|
||||
distances_to_end[self.query_graph.end_node as usize] = vec![0];
|
||||
for prev_node in
|
||||
self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter()
|
||||
{
|
||||
let cost = *next_cheapest_paths_entry.key();
|
||||
let next_cheapest_paths = next_cheapest_paths_entry.get_mut();
|
||||
node_stack.push_back(prev_node as usize);
|
||||
enqueued[prev_node as usize] = true;
|
||||
}
|
||||
|
||||
while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() {
|
||||
assert_eq!(cost, cost2);
|
||||
// NOTE: it is important not to discard the paths that are forbidden due to a
|
||||
// forbidden prefix, because the cheapest path algorithm (Dijkstra) cannot take
|
||||
// this property into account.
|
||||
if next_cheapest_path
|
||||
.iter()
|
||||
.any(|edge_index| graph.all_edges[*edge_index as usize].is_none())
|
||||
{
|
||||
continue;
|
||||
} else {
|
||||
self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost);
|
||||
|
||||
if next_cheapest_paths.is_empty() {
|
||||
next_cheapest_paths_entry.remove();
|
||||
while let Some(cur_node) = node_stack.pop_front() {
|
||||
let mut self_distances = vec![];
|
||||
for succ_node in self.query_graph.edges[cur_node].successors.iter() {
|
||||
let succ_distances = &distances_to_end[succ_node as usize];
|
||||
let _ = self.visit_edges::<()>(cur_node as u32, succ_node, |_, edge| {
|
||||
for succ_distance in succ_distances {
|
||||
self_distances.push(edge.cost as u64 + succ_distance);
|
||||
}
|
||||
self.kth_cheapest_path = Path { edges: next_cheapest_path, cost };
|
||||
|
||||
return Some(self);
|
||||
std::ops::ControlFlow::Continue(())
|
||||
});
|
||||
}
|
||||
self_distances.sort_unstable();
|
||||
self_distances.dedup();
|
||||
distances_to_end[cur_node] = self_distances;
|
||||
for prev_node in self.query_graph.edges[cur_node].predecessors.iter() {
|
||||
if !enqueued[prev_node as usize] {
|
||||
node_stack.push_back(prev_node as usize);
|
||||
enqueued[prev_node as usize] = true;
|
||||
}
|
||||
}
|
||||
let _ = next_cheapest_paths_entry.remove_entry();
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
fn cheapest_path_to_end(&self, from: u32) -> Option<Path> {
|
||||
let mut dijkstra = DijkstraState {
|
||||
unvisited: (0..self.query_graph.nodes.len() as u32).collect(),
|
||||
distances: vec![u64::MAX; self.query_graph.nodes.len()],
|
||||
edges: vec![u32::MAX; self.query_graph.nodes.len()],
|
||||
edge_costs: vec![u8::MAX; self.query_graph.nodes.len()],
|
||||
paths: vec![None; self.query_graph.nodes.len()],
|
||||
};
|
||||
dijkstra.distances[from as usize] = 0;
|
||||
|
||||
// TODO: could use a binary heap here to store the distances, or a btreemap
|
||||
while let Some(cur_node) =
|
||||
dijkstra.unvisited.iter().min_by_key(|&n| dijkstra.distances[n as usize])
|
||||
{
|
||||
let cur_node_dist = dijkstra.distances[cur_node as usize];
|
||||
if cur_node_dist == u64::MAX {
|
||||
return None;
|
||||
}
|
||||
if cur_node == self.query_graph.end_node {
|
||||
break;
|
||||
}
|
||||
|
||||
let succ_cur_node = &self.successors[cur_node as usize];
|
||||
let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited;
|
||||
for succ in unvisited_succ_cur_node {
|
||||
let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else {
|
||||
continue
|
||||
};
|
||||
|
||||
let old_dist_succ = &mut dijkstra.distances[succ as usize];
|
||||
let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64;
|
||||
if new_potential_distance < *old_dist_succ {
|
||||
*old_dist_succ = new_potential_distance;
|
||||
dijkstra.edges[succ as usize] = cheapest_edge;
|
||||
dijkstra.edge_costs[succ as usize] = cheapest_edge_cost;
|
||||
dijkstra.paths[succ as usize] = Some(cur_node);
|
||||
}
|
||||
}
|
||||
dijkstra.unvisited.remove(cur_node);
|
||||
}
|
||||
|
||||
let mut cur = self.query_graph.end_node;
|
||||
let mut path_edges = vec![];
|
||||
while let Some(n) = dijkstra.paths[cur as usize] {
|
||||
path_edges.push(dijkstra.edges[cur as usize]);
|
||||
cur = n;
|
||||
}
|
||||
path_edges.reverse();
|
||||
Some(Path {
|
||||
edges: path_edges,
|
||||
cost: dijkstra.distances[self.query_graph.end_node as usize],
|
||||
})
|
||||
}
|
||||
|
||||
pub fn cheapest_edge(&self, cur_node: u32, succ: u32) -> Option<(u32, u8)> {
|
||||
self.visit_edges(cur_node, succ, |edge_idx, edge| {
|
||||
std::ops::ControlFlow::Break((edge_idx, edge.cost))
|
||||
})
|
||||
distances_to_end
|
||||
}
|
||||
}
|
||||
|
@ -32,16 +32,19 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
|
||||
db_cache: &mut DatabaseCache<'transaction>,
|
||||
edge_index: u32,
|
||||
graph: &RankingRuleGraph<G>,
|
||||
// TODO: maybe universe doesn't belong here
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<BitmapOrAllRef<'s>> {
|
||||
if self.cache.contains_key(&edge_index) {
|
||||
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
|
||||
}
|
||||
let edge = graph.all_edges[edge_index as usize].as_ref().unwrap();
|
||||
|
||||
match &edge.details {
|
||||
EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All),
|
||||
EdgeDetails::Data(details) => {
|
||||
let docids = G::compute_docids(index, txn, db_cache, details)?;
|
||||
if self.cache.contains_key(&edge_index) {
|
||||
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
|
||||
}
|
||||
// TODO: maybe universe doesn't belong here
|
||||
let docids = universe & G::compute_docids(index, txn, db_cache, details)?;
|
||||
|
||||
let _ = self.cache.insert(edge_index, docids);
|
||||
let docids = &self.cache[&edge_index];
|
||||
|
@ -1,26 +1,60 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::paths_map::PathsMap;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
#[derive(Clone)]
|
||||
pub struct EmptyPathsCache {
|
||||
pub empty_edges: RoaringBitmap,
|
||||
pub empty_edges: Vec<bool>,
|
||||
pub empty_prefixes: PathsMap<()>,
|
||||
pub empty_couple_edges: Vec<Vec<u32>>,
|
||||
}
|
||||
impl EmptyPathsCache {
|
||||
/// Creates a cache in which nothing is known to be empty yet.
///
/// `all_edges_len` sizes the per-edge tables: `empty_edges` gets one `false`
/// flag per edge and `empty_couple_edges` one empty list per edge.
pub fn new(all_edges_len: usize) -> Self {
|
||||
Self {
|
||||
empty_edges: vec![false; all_edges_len],
|
||||
empty_prefixes: PathsMap::default(),
|
||||
empty_couple_edges: vec![vec![]; all_edges_len],
|
||||
}
|
||||
}
|
||||
pub fn forbid_edge(&mut self, edge_idx: u32) {
|
||||
self.empty_edges.insert(edge_idx);
|
||||
self.empty_edges[edge_idx as usize] = true;
|
||||
self.empty_couple_edges[edge_idx as usize] = vec![];
|
||||
self.empty_prefixes.remove_edge(&edge_idx);
|
||||
for edges2 in self.empty_couple_edges.iter_mut() {
|
||||
if let Some(edge2_pos) = edges2.iter().position(|e| *e == edge_idx) {
|
||||
edges2.swap_remove(edge2_pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
/// Records `prefix` (a sequence of edge indices) in `empty_prefixes`, so that
/// `path_is_empty` reports any path starting with it as empty.
pub fn forbid_prefix(&mut self, prefix: &[u32]) {
|
||||
self.empty_prefixes.insert(prefix.iter().copied(), ());
|
||||
}
|
||||
/// Records that `edge2` must not be combined with `edge1`.
///
/// The couple is stored under `edge1` only.
///
/// # Panics
///
/// Panics if `edge2` is already recorded for `edge1`, or if `edge1` is out
/// of bounds for the table sized in `new`.
pub fn forbid_couple_edges(&mut self, edge1: u32, edge2: u32) {
|
||||
// Registering the same couple twice is treated as a caller bug.
assert!(!self.empty_couple_edges[edge1 as usize].contains(&edge2));
|
||||
self.empty_couple_edges[edge1 as usize].push(edge2);
|
||||
}
|
||||
pub fn path_is_empty(&self, path: &[u32]) -> bool {
|
||||
for edge in path {
|
||||
if self.empty_edges.contains(*edge) {
|
||||
if self.empty_edges[*edge as usize] {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if self.empty_prefixes.contains_prefix_of_path(path) {
|
||||
return true;
|
||||
}
|
||||
for (edge1, edges2) in self.empty_couple_edges.iter().enumerate() {
|
||||
if let Some(pos_edge1) = path.iter().position(|e| *e == edge1 as u32) {
|
||||
if path[pos_edge1..].iter().any(|e| edges2.contains(e)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// for (edge1, edge2) in self.empty_couple_edges.iter() {
|
||||
// if path.contains(edge1) && path.contains(edge2) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// if self.empty_prefixes.contains_prefix_of_path(path) {
|
||||
// return true;
|
||||
// }
|
||||
false
|
||||
}
|
||||
}
|
||||
|
@ -13,7 +13,6 @@ use heed::RoTxn;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use self::empty_paths_cache::EmptyPathsCache;
|
||||
use self::paths_map::PathsMap;
|
||||
|
||||
use super::db_cache::DatabaseCache;
|
||||
use super::logger::SearchLogger;
|
||||
@ -83,8 +82,11 @@ pub trait RankingRuleGraphTrait: Sized {
|
||||
|
||||
fn log_state(
|
||||
graph: &RankingRuleGraph<Self>,
|
||||
paths: &PathsMap<u64>,
|
||||
paths: &[Vec<u32>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &[Vec<u64>],
|
||||
cost: u64,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
);
|
||||
}
|
||||
@ -135,7 +137,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
None
|
||||
}
|
||||
|
||||
fn remove_edge(&mut self, edge_index: u32) {
|
||||
pub fn remove_edge(&mut self, edge_index: u32) {
|
||||
let edge_opt = &mut self.all_edges[edge_index as usize];
|
||||
let Some(edge) = &edge_opt else { return };
|
||||
let (from_node, _to_node) = (edge.from_node, edge.to_node);
|
||||
@ -151,44 +153,4 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
}
|
||||
self.successors[from_node as usize] = new_successors_from_node;
|
||||
}
|
||||
|
||||
/// Renders the ranking rule graph as a Graphviz DOT document.
///
/// One DOT node is written per non-deleted query node (the root is coloured
/// blue and the end node red), then one DOT edge per present entry of
/// `all_edges`, labelled with its cost.
pub fn graphviz(&self) -> String {
|
||||
let mut desc = String::new();
|
||||
desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");
|
||||
|
||||
for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
|
||||
// Deleted nodes are left out of the drawing.
if matches!(node, QueryNode::Deleted) {
|
||||
continue;
|
||||
}
|
||||
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
|
||||
if node_idx == self.query_graph.root_node as usize {
|
||||
desc.push_str("[color = blue]");
|
||||
} else if node_idx == self.query_graph.end_node as usize {
|
||||
desc.push_str("[color = red]");
|
||||
}
|
||||
desc.push_str(";\n");
|
||||
}
|
||||
// `flatten` skips the `None` slots of `all_edges`.
for edge in self.all_edges.iter().flatten() {
|
||||
let Edge { from_node, to_node, details, .. } = edge;
|
||||
|
||||
match &details {
|
||||
EdgeDetails::Unconditional => {
|
||||
desc.push_str(&format!(
|
||||
"{from_node} -> {to_node} [label = \"always cost {cost}\"];\n",
|
||||
cost = edge.cost,
|
||||
));
|
||||
}
|
||||
EdgeDetails::Data(details) => {
|
||||
desc.push_str(&format!(
|
||||
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n",
|
||||
cost = edge.cost,
|
||||
edge_label = G::graphviz_edge_details_label(details)
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
desc.push('}');
|
||||
desc
|
||||
}
|
||||
}
|
||||
|
@ -1,12 +1,11 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::fmt::Write;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
|
||||
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::cheapest_paths::Path;
|
||||
use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::new::QueryNode;
|
||||
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PathsMap<V> {
|
||||
@ -157,6 +156,24 @@ impl<V> PathsMap<V> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the edge indices that, appended to `prefix`, complete a path
/// stored in this map (i.e. a child node that carries a value).
///
/// Returns an empty vector when `prefix` is not present in the map.
///
/// NOTE(review): the name is presumably a typo for
/// `final_edges_after_prefix`; renaming it requires updating every caller.
pub fn final_edges_ater_prefix(&self, prefix: &[u32]) -> Vec<u32> {
|
||||
// Empty prefix: we are at the node the prefix leads to, so collect the
// children whose own value is set.
let [first_edge, remaining_prefix @ ..] = prefix else {
|
||||
return self.nodes.iter().filter_map(|n| {
|
||||
if n.1.value.is_some() {
|
||||
Some(n.0)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}).collect();
|
||||
};
|
||||
// Otherwise descend into the child reached through the first edge.
for (edge, rest) in self.nodes.iter() {
|
||||
if edge == first_edge {
|
||||
return rest.final_edges_ater_prefix(remaining_prefix);
|
||||
}
|
||||
}
|
||||
vec![]
|
||||
}
|
||||
|
||||
pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec<u32> {
|
||||
let [first_edge, remaining_prefix @ ..] = prefix else {
|
||||
return self.nodes.iter().map(|n| n.0).collect();
|
||||
@ -185,88 +202,4 @@ impl<V> PathsMap<V> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Renders the paths stored in this map as a Graphviz DOT document.
///
/// `graph` is passed on to `graphviz_rec`, which uses it to label each edge.
pub fn graphviz<G: RankingRuleGraphTrait>(&self, graph: &RankingRuleGraph<G>) -> String {
|
||||
let mut desc = String::new();
|
||||
desc.push_str("digraph G {\n");
|
||||
// Start the recursion at the root of the map, i.e. with an empty path.
self.graphviz_rec(&mut desc, vec![], graph);
|
||||
desc.push_str("\n}\n");
|
||||
desc
|
||||
}
|
||||
/// Appends to `desc` the DOT nodes and edges for every path stored below
/// this node of the map.
///
/// `path_from` holds one hash per edge taken from the root of the map to
/// reach this node; DOT node ids are obtained by hashing that sequence.
/// Edges whose slot in `graph.all_edges` is `None` are skipped together with
/// everything stored below them.
fn graphviz_rec<G: RankingRuleGraphTrait>(
|
||||
&self,
|
||||
desc: &mut String,
|
||||
path_from: Vec<u64>,
|
||||
graph: &RankingRuleGraph<G>,
|
||||
) {
|
||||
// DOT id of the node we are coming from.
let id_from = {
|
||||
let mut h = DefaultHasher::new();
|
||||
path_from.hash(&mut h);
|
||||
h.finish()
|
||||
};
|
||||
for (edge_idx, rest) in self.nodes.iter() {
|
||||
let Some(Edge { from_node, to_node, cost, .. }) = graph.all_edges[*edge_idx as usize].as_ref() else {
|
||||
continue;
|
||||
};
|
||||
// Extend the path with the hash of the edge being followed.
let mut path_to = path_from.clone();
|
||||
path_to.push({
|
||||
let mut h = DefaultHasher::new();
|
||||
edge_idx.hash(&mut h);
|
||||
h.finish()
|
||||
});
|
||||
// DOT id of the node reached through that edge.
let id_to = {
|
||||
let mut h = DefaultHasher::new();
|
||||
path_to.hash(&mut h);
|
||||
h.finish()
|
||||
};
|
||||
writeln!(desc, "{id_to} [label = \"{from_node}→{to_node} [{cost}]\"];").unwrap();
|
||||
writeln!(desc, "{id_from} -> {id_to};").unwrap();
|
||||
|
||||
rest.graphviz_rec(desc, path_to, graph);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
/// Renders the ranking rule graph as a Graphviz DOT document, highlighting
/// one path.
///
/// Nodes are written as in a plain rendering (non-deleted nodes only, root
/// in blue, end node in red). Each present edge is drawn in red when its
/// index appears in `path.edges` and in green otherwise.
pub fn graphviz_with_path(&self, path: &Path) -> String {
|
||||
let mut desc = String::new();
|
||||
desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");
|
||||
|
||||
for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
|
||||
// Deleted nodes are left out of the drawing.
if matches!(node, QueryNode::Deleted) {
|
||||
continue;
|
||||
}
|
||||
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
|
||||
if node_idx == self.query_graph.root_node as usize {
|
||||
desc.push_str("[color = blue]");
|
||||
} else if node_idx == self.query_graph.end_node as usize {
|
||||
desc.push_str("[color = red]");
|
||||
}
|
||||
desc.push_str(";\n");
|
||||
}
|
||||
|
||||
for (edge_idx, edge) in self.all_edges.iter().enumerate() {
|
||||
// Empty slots of `all_edges` have nothing to draw.
let Some(edge) = edge else { continue };
|
||||
let Edge { from_node, to_node, .. } = edge;
|
||||
// Edges belonging to the highlighted path are red, the others green.
let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" };
|
||||
match &edge.details {
|
||||
EdgeDetails::Unconditional => {
|
||||
desc.push_str(&format!(
|
||||
"{from_node} -> {to_node} [label = \"cost {cost}\", color = {color}];\n",
|
||||
cost = edge.cost,
|
||||
));
|
||||
}
|
||||
EdgeDetails::Data(details) => {
|
||||
desc.push_str(&format!(
|
||||
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n",
|
||||
cost = edge.cost,
|
||||
edge_label = G::graphviz_edge_details_label(details),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
desc.push('}');
|
||||
desc
|
||||
}
|
||||
}
|
||||
|
@ -16,9 +16,9 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
|
||||
QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => {
|
||||
match value1 {
|
||||
QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),
|
||||
QueryTerm::Phrase(phrase1) => {
|
||||
QueryTerm::Phrase { phrase: phrase1 } => {
|
||||
// TODO: remove second unwrap
|
||||
let original = phrase1.last().unwrap().as_ref().unwrap().clone();
|
||||
let original = phrase1.words.last().unwrap().as_ref().unwrap().clone();
|
||||
(
|
||||
WordDerivations {
|
||||
original: original.clone(),
|
||||
@ -26,6 +26,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
|
||||
one_typo: vec![],
|
||||
two_typos: vec![],
|
||||
use_prefix_db: false,
|
||||
synonyms: vec![],
|
||||
split_words: None,
|
||||
},
|
||||
*pos1.end(),
|
||||
)
|
||||
@ -39,6 +41,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
|
||||
one_typo: vec![],
|
||||
two_typos: vec![],
|
||||
use_prefix_db: false,
|
||||
synonyms: vec![],
|
||||
split_words: None,
|
||||
},
|
||||
-100,
|
||||
),
|
||||
@ -63,9 +67,9 @@ pub fn visit_to_node<'transaction, 'from_data>(
|
||||
|
||||
let (derivations2, pos2, ngram_len2) = match value2 {
|
||||
QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
|
||||
QueryTerm::Phrase(phrase2) => {
|
||||
QueryTerm::Phrase { phrase: phrase2 } => {
|
||||
// TODO: remove second unwrap
|
||||
let original = phrase2.last().unwrap().as_ref().unwrap().clone();
|
||||
let original = phrase2.words.last().unwrap().as_ref().unwrap().clone();
|
||||
(
|
||||
WordDerivations {
|
||||
original: original.clone(),
|
||||
@ -73,6 +77,8 @@ pub fn visit_to_node<'transaction, 'from_data>(
|
||||
one_typo: vec![],
|
||||
two_typos: vec![],
|
||||
use_prefix_db: false,
|
||||
synonyms: vec![],
|
||||
split_words: None,
|
||||
},
|
||||
*pos2.start(),
|
||||
1,
|
||||
|
@ -2,18 +2,21 @@ pub mod build;
|
||||
pub mod compute_docids;
|
||||
|
||||
use heed::RoTxn;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::empty_paths_cache::EmptyPathsCache;
|
||||
use super::paths_map::PathsMap;
|
||||
|
||||
use super::{EdgeDetails, RankingRuleGraphTrait};
|
||||
use crate::new::db_cache::DatabaseCache;
|
||||
use crate::new::logger::SearchLogger;
|
||||
use crate::new::query_term::WordDerivations;
|
||||
use crate::new::QueryNode;
|
||||
use crate::new::{QueryGraph, QueryNode};
|
||||
use crate::{Index, Result};
|
||||
|
||||
// TODO: intern the strings, refer to them by their pointer?
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum WordPair {
|
||||
// TODO: add WordsSwapped and WordPrefixSwapped case
|
||||
Words { left: String, right: String },
|
||||
WordsSwapped { left: String, right: String },
|
||||
WordPrefix { left: String, right_prefix: String },
|
||||
@ -22,6 +25,7 @@ pub enum WordPair {
|
||||
|
||||
#[derive(Clone)]
|
||||
/// A list of word pairs together with a proximity value.
///
/// NOTE(review): presumably the edge payload of `ProximityGraph`; confirm
/// against its `RankingRuleGraphTrait` implementation.
pub struct ProximityEdge {
|
||||
// TODO: use a list of pointers to the word pairs instead?
|
||||
pairs: Vec<WordPair>,
|
||||
proximity: u8,
|
||||
}
|
||||
@ -67,10 +71,20 @@ impl RankingRuleGraphTrait for ProximityGraph {
|
||||
|
||||
fn log_state(
|
||||
graph: &super::RankingRuleGraph<Self>,
|
||||
paths: &PathsMap<u64>,
|
||||
paths: &[Vec<u32>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
logger: &mut dyn crate::new::logger::SearchLogger<crate::new::QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &[Vec<u64>],
|
||||
cost: u64,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
logger.log_proximity_state(graph, paths, empty_paths_cache);
|
||||
logger.log_proximity_state(
|
||||
graph,
|
||||
paths,
|
||||
empty_paths_cache,
|
||||
universe,
|
||||
distances.to_vec(),
|
||||
cost,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -5,7 +5,7 @@ use roaring::{MultiOps, RoaringBitmap};
|
||||
|
||||
use super::edge_docids_cache::EdgeDocidsCache;
|
||||
use super::empty_paths_cache::EmptyPathsCache;
|
||||
use super::paths_map::PathsMap;
|
||||
|
||||
use super::{RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::new::db_cache::DatabaseCache;
|
||||
|
||||
@ -21,44 +21,65 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
edge_docids_cache: &mut EdgeDocidsCache<G>,
|
||||
empty_paths_cache: &mut EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
mut paths: PathsMap<u64>,
|
||||
mut paths: Vec<Vec<u32>>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
paths.sort_unstable();
|
||||
let mut needs_filtering = false;
|
||||
let mut path_bitmaps = vec![];
|
||||
'path_loop: loop {
|
||||
if needs_filtering {
|
||||
for path in paths.iter_mut() {
|
||||
if empty_paths_cache.path_is_empty(path) {
|
||||
path.clear();
|
||||
}
|
||||
}
|
||||
needs_filtering = false;
|
||||
}
|
||||
let Some(edge_indexes) = paths.pop() else {
|
||||
break;
|
||||
};
|
||||
|
||||
paths.remove_edges(&empty_paths_cache.empty_edges);
|
||||
paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
|
||||
if edge_indexes.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
'path_loop: while let Some((edge_indexes, _)) = paths.remove_first() {
|
||||
// if path is excluded, continue...
|
||||
let mut processed_edges = vec![];
|
||||
let mut path_bitmap = universe.clone();
|
||||
let mut visited_edges = vec![];
|
||||
let mut cached_edge_docids = vec![];
|
||||
'edge_loop: for edge_index in edge_indexes {
|
||||
processed_edges.push(edge_index);
|
||||
let edge_docids =
|
||||
edge_docids_cache.get_edge_docids(index, txn, db_cache, edge_index, self)?;
|
||||
visited_edges.push(edge_index);
|
||||
let edge_docids = edge_docids_cache
|
||||
.get_edge_docids(index, txn, db_cache, edge_index, self, universe)?;
|
||||
match edge_docids {
|
||||
BitmapOrAllRef::Bitmap(edge_docids) => {
|
||||
cached_edge_docids.push((edge_index, edge_docids.clone()));
|
||||
let (_, edge_docids) = cached_edge_docids.last().unwrap();
|
||||
if edge_docids.is_disjoint(universe) {
|
||||
// 1. Store in the cache that this edge is empty for this universe
|
||||
empty_paths_cache.forbid_edge(edge_index);
|
||||
// 2. remove all the paths that contain this edge for this universe
|
||||
paths.remove_edge(&edge_index);
|
||||
// 3. remove this edge from the proximity graph
|
||||
|
||||
// 2. remove this edge from the proximity graph
|
||||
self.remove_edge(edge_index);
|
||||
|
||||
// 4. continue executing this function again on the remaining paths
|
||||
edge_docids_cache.cache.remove(&edge_index);
|
||||
needs_filtering = true;
|
||||
// 3. continue executing this function again on the remaining paths
|
||||
continue 'path_loop;
|
||||
} else {
|
||||
path_bitmap &= edge_docids;
|
||||
if path_bitmap.is_disjoint(universe) {
|
||||
// 1. Store in the cache that this prefix is empty for this universe
|
||||
empty_paths_cache
|
||||
.empty_prefixes
|
||||
.insert(processed_edges.iter().copied(), ());
|
||||
// 2. remove all the paths beginning with this prefix
|
||||
paths.remove_prefix(&processed_edges);
|
||||
// 3. continue executing this function again on the remaining paths?
|
||||
needs_filtering = true;
|
||||
empty_paths_cache.forbid_prefix(&visited_edges);
|
||||
// if the intersection between this edge and any
|
||||
// previous one is disjoint with the universe,
|
||||
// then we add these two edges to the empty_path_cache
|
||||
for (edge_index2, edge_docids2) in
|
||||
cached_edge_docids[..cached_edge_docids.len() - 1].iter()
|
||||
{
|
||||
let intersection = edge_docids & edge_docids2;
|
||||
if intersection.is_disjoint(universe) {
|
||||
empty_paths_cache
|
||||
.forbid_couple_edges(*edge_index2, edge_index);
|
||||
}
|
||||
}
|
||||
continue 'path_loop;
|
||||
}
|
||||
}
|
||||
@ -68,6 +89,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
}
|
||||
path_bitmaps.push(path_bitmap);
|
||||
}
|
||||
|
||||
Ok(MultiOps::union(path_bitmaps))
|
||||
}
|
||||
}
|
||||
|
@ -2,16 +2,18 @@ use heed::{BytesDecode, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::empty_paths_cache::EmptyPathsCache;
|
||||
use super::paths_map::PathsMap;
|
||||
use super::{EdgeDetails, RankingRuleGraphTrait};
|
||||
|
||||
use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::new::db_cache::DatabaseCache;
|
||||
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
||||
use crate::new::QueryNode;
|
||||
use crate::new::logger::SearchLogger;
|
||||
use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
|
||||
use crate::new::resolve_query_graph::resolve_phrase;
|
||||
use crate::new::{QueryGraph, QueryNode};
|
||||
use crate::{Index, Result, RoaringBitmapCodec};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum TypoEdge {
|
||||
Phrase,
|
||||
Phrase { phrase: Phrase },
|
||||
Word { derivations: WordDerivations, nbr_typos: u8 },
|
||||
}
|
||||
|
||||
@ -23,7 +25,7 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
|
||||
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
|
||||
match edge {
|
||||
TypoEdge::Phrase => format!(", 0 typos"),
|
||||
TypoEdge::Phrase { .. } => ", 0 typos".to_owned(),
|
||||
TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"),
|
||||
}
|
||||
}
|
||||
@ -33,9 +35,9 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
txn: &'transaction RoTxn,
|
||||
db_cache: &mut DatabaseCache<'transaction>,
|
||||
edge: &Self::EdgeDetails,
|
||||
) -> Result<roaring::RoaringBitmap> {
|
||||
) -> Result<RoaringBitmap> {
|
||||
match edge {
|
||||
TypoEdge::Phrase => todo!(),
|
||||
TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase),
|
||||
TypoEdge::Word { derivations, nbr_typos } => {
|
||||
let words = match nbr_typos {
|
||||
0 => &derivations.zero_typo,
|
||||
@ -68,21 +70,23 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
_index: &Index,
|
||||
_txn: &'transaction RoTxn,
|
||||
_db_cache: &mut DatabaseCache<'transaction>,
|
||||
from_node: &QueryNode,
|
||||
_from_node: &QueryNode,
|
||||
) -> Result<Option<Self::BuildVisitedFromNode>> {
|
||||
Ok(Some(()))
|
||||
}
|
||||
|
||||
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
|
||||
index: &Index,
|
||||
txn: &'transaction RoTxn,
|
||||
db_cache: &mut DatabaseCache<'transaction>,
|
||||
_index: &Index,
|
||||
_txn: &'transaction RoTxn,
|
||||
_db_cache: &mut DatabaseCache<'transaction>,
|
||||
to_node: &QueryNode,
|
||||
from_node_data: &'from_data Self::BuildVisitedFromNode,
|
||||
_from_node_data: &'from_data Self::BuildVisitedFromNode,
|
||||
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
|
||||
match to_node {
|
||||
QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
|
||||
QueryTerm::Phrase(_) => Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase))]),
|
||||
QueryTerm::Phrase { phrase } => {
|
||||
Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))])
|
||||
}
|
||||
QueryTerm::Word { derivations } => {
|
||||
let mut edges = vec![];
|
||||
if !derivations.zero_typo.is_empty() || derivations.use_prefix_db {
|
||||
@ -121,11 +125,14 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
}
|
||||
|
||||
fn log_state(
|
||||
graph: &super::RankingRuleGraph<Self>,
|
||||
paths: &PathsMap<u64>,
|
||||
graph: &RankingRuleGraph<Self>,
|
||||
paths: &[Vec<u32>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
logger: &mut dyn crate::new::logger::SearchLogger<crate::new::QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &[Vec<u64>],
|
||||
cost: u64,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
logger.log_typo_state(graph, paths, empty_paths_cache);
|
||||
logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost);
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,5 @@
|
||||
use std::time::Instant;
|
||||
|
||||
use heed::RoTxn;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
@ -9,7 +11,7 @@ use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
|
||||
use crate::new::ranking_rule_graph::proximity::ProximityGraph;
|
||||
use crate::new::ranking_rule_graph::typo::TypoGraph;
|
||||
use crate::new::words::Words;
|
||||
use crate::search::new::sort::Sort;
|
||||
// use crate::search::new::sort::Sort;
|
||||
use crate::{Filter, Index, Result, TermsMatchingStrategy};
|
||||
|
||||
pub trait RankingRuleOutputIter<'transaction, Query> {
|
||||
@ -123,13 +125,14 @@ pub fn execute_search<'transaction>(
|
||||
length: usize,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) -> Result<Vec<u32>> {
|
||||
logger.initial_query(query_graph, Instant::now());
|
||||
let words = &mut Words::new(TermsMatchingStrategy::Last);
|
||||
let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
|
||||
// let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
|
||||
let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
|
||||
let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());
|
||||
// TODO: ranking rules given as argument
|
||||
let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
|
||||
vec![words, typo, proximity, sort];
|
||||
vec![words, typo, proximity /*sort*/];
|
||||
|
||||
logger.ranking_rules(&ranking_rules);
|
||||
|
||||
@ -144,7 +147,13 @@ pub fn execute_search<'transaction>(
|
||||
}
|
||||
|
||||
let ranking_rules_len = ranking_rules.len();
|
||||
logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, &universe);
|
||||
logger.start_iteration_ranking_rule(
|
||||
0,
|
||||
ranking_rules[0],
|
||||
query_graph,
|
||||
&universe,
|
||||
Instant::now(),
|
||||
);
|
||||
ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?;
|
||||
|
||||
let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
|
||||
@ -154,11 +163,12 @@ pub fn execute_search<'transaction>(
|
||||
|
||||
macro_rules! back {
|
||||
() => {
|
||||
// assert!(candidates[cur_ranking_rule_index].is_empty());
|
||||
assert!(candidates[cur_ranking_rule_index].is_empty());
|
||||
logger.end_iteration_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index],
|
||||
&candidates[cur_ranking_rule_index],
|
||||
Instant::now(),
|
||||
);
|
||||
candidates[cur_ranking_rule_index].clear();
|
||||
ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger);
|
||||
@ -187,6 +197,7 @@ pub fn execute_search<'transaction>(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index],
|
||||
&candidates,
|
||||
Instant::now(),
|
||||
);
|
||||
} else {
|
||||
let all_candidates = candidates.iter().collect::<Vec<_>>();
|
||||
@ -196,6 +207,7 @@ pub fn execute_search<'transaction>(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index],
|
||||
&skipped_candidates.into_iter().collect(),
|
||||
Instant::now(),
|
||||
);
|
||||
let candidates = candidates
|
||||
.iter()
|
||||
@ -219,24 +231,26 @@ pub fn execute_search<'transaction>(
|
||||
// The universe for this bucket is zero or one element, so we don't need to sort
|
||||
// anything, just extend the results and go back to the parent ranking rule.
|
||||
if candidates[cur_ranking_rule_index].len() <= 1 {
|
||||
candidates[cur_ranking_rule_index].clear();
|
||||
maybe_add_to_results!(&candidates[cur_ranking_rule_index]);
|
||||
candidates[cur_ranking_rule_index].clear();
|
||||
back!();
|
||||
continue;
|
||||
}
|
||||
|
||||
logger.next_bucket_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index],
|
||||
&candidates[cur_ranking_rule_index],
|
||||
);
|
||||
|
||||
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else {
|
||||
// TODO: add remaining candidates automatically here?
|
||||
back!();
|
||||
continue;
|
||||
};
|
||||
|
||||
logger.next_bucket_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index],
|
||||
&candidates[cur_ranking_rule_index],
|
||||
&next_bucket.candidates,
|
||||
Instant::now(),
|
||||
);
|
||||
|
||||
assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates));
|
||||
candidates[cur_ranking_rule_index] -= &next_bucket.candidates;
|
||||
|
||||
@ -255,6 +269,7 @@ pub fn execute_search<'transaction>(
|
||||
ranking_rules[cur_ranking_rule_index],
|
||||
&next_bucket.query,
|
||||
&candidates[cur_ranking_rule_index],
|
||||
Instant::now(),
|
||||
);
|
||||
ranking_rules[cur_ranking_rule_index].start_iteration(
|
||||
index,
|
||||
@ -271,17 +286,18 @@ pub fn execute_search<'transaction>(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Cursor, Seek};
|
||||
use std::time::Instant;
|
||||
|
||||
use heed::EnvOpenOptions;
|
||||
|
||||
use super::execute_search;
|
||||
// use crate::allocator::ALLOC;
|
||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::new::db_cache::DatabaseCache;
|
||||
use crate::new::logger::detailed::DetailedSearchLogger;
|
||||
use big_s::S;
|
||||
use heed::EnvOpenOptions;
|
||||
use maplit::hashset;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Cursor, Seek};
|
||||
use std::time::Instant;
|
||||
// use crate::new::logger::detailed::DetailedSearchLogger;
|
||||
use crate::new::logger::{DefaultSearchLogger, SearchLogger};
|
||||
use crate::new::make_query_graph;
|
||||
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||
@ -323,16 +339,119 @@ mod tests {
|
||||
let mut db_cache = DatabaseCache::default();
|
||||
|
||||
let query_graph =
|
||||
make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap();
|
||||
println!("{}", query_graph.graphviz());
|
||||
logger.initial_query(&query_graph);
|
||||
make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
|
||||
.unwrap();
|
||||
logger.initial_query(&query_graph, Instant::now());
|
||||
|
||||
let results =
|
||||
execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 20, &mut logger)
|
||||
execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger)
|
||||
.unwrap();
|
||||
println!("{results:?}")
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_wiki_new() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_wiki").unwrap();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
|
||||
|
||||
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
||||
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
||||
// loop {
|
||||
let start = Instant::now();
|
||||
|
||||
let mut db_cache = DatabaseCache::default();
|
||||
|
||||
let query_graph = make_query_graph(
|
||||
&index,
|
||||
&txn,
|
||||
&mut db_cache,
|
||||
"which a the releases from poison by the government",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
|
||||
|
||||
let results = execute_search(
|
||||
&index,
|
||||
&txn,
|
||||
&mut db_cache,
|
||||
&query_graph,
|
||||
None,
|
||||
0,
|
||||
20,
|
||||
&mut DefaultSearchLogger,
|
||||
// &mut logger,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// logger.write_d2_description();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
|
||||
let ids = index
|
||||
.documents(&txn, results.iter().copied())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|x| {
|
||||
let obkv = &x.1;
|
||||
let id = obkv.get(primary_key).unwrap();
|
||||
let id: serde_json::Value = serde_json::from_slice(id).unwrap();
|
||||
id.as_str().unwrap().to_owned()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
println!("{}us: {results:?}", elapsed.as_micros());
|
||||
println!("external ids: {ids:?}");
|
||||
// println!("max_resident: {}", ALLOC.max_resident.load(std::sync::atomic::Ordering::SeqCst));
|
||||
// println!("allocated: {}", ALLOC.allocated.load(std::sync::atomic::Ordering::SeqCst));
|
||||
// }
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_wiki_old() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_wiki").unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let rr = index.criteria(&txn).unwrap();
|
||||
println!("{rr:?}");
|
||||
|
||||
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
||||
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.query("releases from poison by the government");
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
|
||||
let docs = s.execute().unwrap();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
|
||||
let ids = index
|
||||
.documents(&txn, docs.documents_ids.iter().copied())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|x| {
|
||||
let obkv = &x.1;
|
||||
let id = obkv.get(primary_key).unwrap();
|
||||
let id: serde_json::Value = serde_json::from_slice(id).unwrap();
|
||||
id.as_str().unwrap().to_owned()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||
println!("external ids: {ids:?}");
|
||||
}
|
||||
#[test]
|
||||
fn search_movies_new() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
@ -343,7 +462,7 @@ mod tests {
|
||||
|
||||
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
||||
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
||||
|
||||
// loop {
|
||||
let start = Instant::now();
|
||||
|
||||
let mut db_cache = DatabaseCache::default();
|
||||
@ -352,7 +471,7 @@ mod tests {
|
||||
make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
|
||||
.unwrap();
|
||||
|
||||
let mut logger = DetailedSearchLogger::new("log");
|
||||
let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
|
||||
|
||||
let results = execute_search(
|
||||
&index,
|
||||
@ -360,9 +479,10 @@ mod tests {
|
||||
&mut db_cache,
|
||||
&query_graph,
|
||||
None,
|
||||
5,
|
||||
0,
|
||||
20,
|
||||
&mut logger, //&mut DefaultSearchLogger,
|
||||
// &mut DefaultSearchLogger,
|
||||
&mut logger,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@ -384,6 +504,7 @@ mod tests {
|
||||
|
||||
println!("{}us: {results:?}", elapsed.as_micros());
|
||||
println!("external ids: {ids:?}");
|
||||
// }
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -392,19 +513,39 @@ mod tests {
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_movies").unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let rr = index.criteria(&txn).unwrap();
|
||||
println!("{rr:?}");
|
||||
|
||||
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
||||
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.query("b b b b b b b b b b");
|
||||
s.query("releases from poison by the government");
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
|
||||
let docs = s.execute().unwrap();
|
||||
|
||||
let elapsed = start.elapsed();
|
||||
|
||||
let ids = index
|
||||
.documents(&txn, docs.documents_ids.iter().copied())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|x| {
|
||||
let obkv = &x.1;
|
||||
let id = obkv.get(primary_key).unwrap();
|
||||
let id: serde_json::Value = serde_json::from_slice(id).unwrap();
|
||||
id.as_str().unwrap().to_owned()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||
println!("external ids: {ids:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -420,10 +561,16 @@ mod tests {
|
||||
|
||||
builder.set_min_word_len_one_typo(5);
|
||||
builder.set_min_word_len_two_typos(100);
|
||||
|
||||
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||
builder.set_sortable_fields(hashset! { S("release_date") });
|
||||
builder.set_criteria(vec![
|
||||
Criterion::Words,
|
||||
Criterion::Typo,
|
||||
Criterion::Proximity,
|
||||
Criterion::Asc("release_date".to_owned()),
|
||||
]);
|
||||
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -445,6 +592,7 @@ mod tests {
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
||||
builder.set_filterable_fields(filterable_fields);
|
||||
|
||||
builder.set_min_word_len_one_typo(5);
|
||||
builder.set_min_word_len_two_typos(100);
|
||||
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||
@ -467,6 +615,48 @@ mod tests {
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
}
|
||||
#[test]
|
||||
fn _index_wiki() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_wiki").unwrap();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
// let primary_key = "id";
|
||||
let searchable_fields = vec!["body", "title", "url"];
|
||||
// let filterable_fields = vec![];
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||
// builder.set_primary_key(primary_key.to_owned());
|
||||
let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
// let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
||||
// builder.set_filterable_fields(filterable_fields);
|
||||
|
||||
// builder.set_min_word_len_one_typo(5);
|
||||
// builder.set_min_word_len_two_typos(100);
|
||||
builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let indexing_config =
|
||||
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
||||
let builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
|
||||
.unwrap();
|
||||
|
||||
let documents = documents_from(
|
||||
"/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
|
||||
"csv",
|
||||
);
|
||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||
user_error.unwrap();
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
}
|
||||
|
||||
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
|
||||
let reader = File::open(filename)
|
||||
|
Loading…
x
Reference in New Issue
Block a user