Rewrite cheapest path algorithm and empty path cache

It is now much simpler and has much better performance.
This commit is contained in:
Loïc Lecrenier 2023-03-02 21:27:42 +01:00
parent caa1e1b923
commit c27ea2677f
14 changed files with 782 additions and 530 deletions

View File

@ -3,7 +3,53 @@
#[cfg(test)]
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
// #[cfg(test)]
// pub mod allocator {
// use std::alloc::{GlobalAlloc, System};
// use std::sync::atomic::{self, AtomicI64};
// #[global_allocator]
// pub static ALLOC: CountingAlloc = CountingAlloc {
// max_resident: AtomicI64::new(0),
// resident: AtomicI64::new(0),
// allocated: AtomicI64::new(0),
// };
// pub struct CountingAlloc {
// pub max_resident: AtomicI64,
// pub resident: AtomicI64,
// pub allocated: AtomicI64,
// }
// unsafe impl GlobalAlloc for CountingAlloc {
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
// let old_resident =
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
// let resident = old_resident + layout.size() as i64;
// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst);
// // if layout.size() > 1_000_000 {
// // eprintln!(
// // "allocating {} with new resident size: {resident}",
// // layout.size() / 1_000_000
// // );
// // // let trace = std::backtrace::Backtrace::capture();
// // // let t = trace.to_string();
// // // eprintln!("{t}");
// // }
// System.alloc(layout)
// }
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
// System.dealloc(ptr, layout)
// }
// }
// }
#[macro_use]
pub mod documents;

View File

@ -3,12 +3,11 @@ use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache;
use super::logger::SearchLogger;
use super::ranking_rule_graph::cheapest_paths::KCheapestPathsState;
use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache;
use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
use super::ranking_rule_graph::paths_map::PathsMap;
use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait};
use super::{QueryGraph, RankingRule, RankingRuleOutput};
use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput};
use crate::{Index, Result};
@ -24,9 +23,40 @@ impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
/// Per-query state kept by a graph-based ranking rule between bucket requests.
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
/// The ranking rule graph built from the query graph; edges found to be
/// empty are removed from it as the universe shrinks.
graph: RankingRuleGraph<G>,
cheapest_paths_state: Option<KCheapestPathsState>,
/// Cache of the document ids associated with each edge of `graph`.
edge_docids_cache: EdgeDocidsCache<G>,
/// Records the edges (and presumably edge combinations) known to yield no
/// documents, so that paths going through them can be skipped.
empty_paths_cache: EmptyPathsCache,
/// Indexed by node: the list of costs associated with that node.
/// NOTE(review): appears to hold the costs of paths from the node to the
/// end node, as produced by `initialize_distances_cheapest` — confirm.
all_distances: Vec<Vec<u64>>,
/// Index, into the root node's entry of `all_distances`, of the next cost
/// to process; incremented each time a cost is consumed.
cur_distance_idx: usize,
}
/// Prunes from `graph` every edge whose document ids do not intersect `universe`.
///
/// Each edge slot that is still populated is resolved through
/// `edge_docids_cache`. When the result is a bitmap disjoint from `universe`,
/// the edge is removed from the graph, recorded as forbidden in
/// `empty_paths_cache`, and its entry is evicted from the docids cache.
/// Edges that resolve to `BitmapOrAllRef::All` are left untouched.
///
/// # Errors
///
/// Propagates any error returned by `EdgeDocidsCache::get_edge_docids`.
fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>(
    graph: &mut RankingRuleGraph<G>,
    edge_docids_cache: &mut EdgeDocidsCache<G>,
    index: &Index,
    txn: &'transaction RoTxn,
    db_cache: &mut DatabaseCache<'transaction>,
    universe: &RoaringBitmap,
    empty_paths_cache: &mut EmptyPathsCache,
) -> Result<()> {
    let edge_count = graph.all_edges.len() as u32;
    for edge_index in 0..edge_count {
        // Slots of edges that were already removed hold `None`.
        if graph.all_edges[edge_index as usize].is_none() {
            continue;
        }

        // Reduce the lookup to a plain boolean first, so the borrow of the
        // cache has ended before the graph and the caches are mutated below.
        let is_empty = match edge_docids_cache
            .get_edge_docids(index, txn, db_cache, edge_index, &*graph, universe)?
        {
            BitmapOrAllRef::Bitmap(bitmap) => bitmap.is_disjoint(universe),
            BitmapOrAllRef::All => false,
        };

        if is_empty {
            graph.remove_edge(edge_index);
            empty_paths_cache.forbid_edge(edge_index);
            edge_docids_cache.cache.remove(&edge_index);
        }
    }
    Ok(())
}
impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph>
@ -41,18 +71,31 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
_logger: &mut dyn SearchLogger<QueryGraph>,
_universe: &RoaringBitmap,
universe: &RoaringBitmap,
query_graph: &QueryGraph,
) -> Result<()> {
// TODO: update old state instead of starting from scratch
let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?;
let mut graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?;
let mut edge_docids_cache = EdgeDocidsCache::default();
let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len());
remove_empty_edges(
&mut graph,
&mut edge_docids_cache,
index,
txn,
db_cache,
universe,
&mut empty_paths_cache,
)?;
let all_distances = graph.initialize_distances_cheapest();
let cheapest_paths_state = KCheapestPathsState::new(&graph);
let state = GraphBasedRankingRuleState {
graph,
cheapest_paths_state,
edge_docids_cache: <_>::default(),
empty_paths_cache: <_>::default(),
edge_docids_cache,
empty_paths_cache,
all_distances,
cur_distance_idx: 0,
};
self.state = Some(state);
@ -70,34 +113,42 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
assert!(universe.len() > 1);
let mut state = self.state.take().unwrap();
if state.cheapest_paths_state.is_none() {
remove_empty_edges(
&mut state.graph,
&mut state.edge_docids_cache,
index,
txn,
db_cache,
universe,
&mut state.empty_paths_cache,
)?;
if state.cur_distance_idx
>= state.all_distances[state.graph.query_graph.root_node as usize].len()
{
self.state = None;
return Ok(None);
}
let cost =
state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];
state.cur_distance_idx += 1;
let mut paths = PathsMap::default();
let paths = state.graph.paths_of_cost(
state.graph.query_graph.root_node as usize,
cost,
&state.all_distances,
&state.empty_paths_cache,
);
while paths.is_empty() {
let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else {
break;
};
if let Some(next_cheapest_paths_state) = cheapest_paths_state
.compute_paths_of_next_lowest_cost(
&mut state.graph,
&state.empty_paths_cache,
&mut paths,
)
{
state.cheapest_paths_state = Some(next_cheapest_paths_state);
} else {
break;
}
}
if paths.is_empty() && state.cheapest_paths_state.is_none() {
return Ok(None);
}
G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger);
G::log_state(
&state.graph,
&paths,
&state.empty_paths_cache,
universe,
&state.all_distances,
cost,
logger,
);
let bucket = state.graph.resolve_paths(
index,

View File

@ -1,6 +1,8 @@
use rand::random;
use roaring::RoaringBitmap;
use std::fs::File;
use std::time::Instant;
use std::{io::Write, path::PathBuf};
use crate::new::ranking_rule_graph::typo::TypoGraph;
@ -9,7 +11,7 @@ use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait};
use crate::new::ranking_rule_graph::{
paths_map::PathsMap, proximity::ProximityGraph, RankingRuleGraph,
proximity::ProximityGraph, RankingRuleGraph,
};
use super::{RankingRule, SearchLogger};
@ -19,14 +21,18 @@ pub enum SearchEvents {
ranking_rule_idx: usize,
query: QueryGraph,
universe: RoaringBitmap,
time: Instant,
},
RankingRuleNextBucket {
ranking_rule_idx: usize,
universe: RoaringBitmap,
candidates: RoaringBitmap,
time: Instant,
},
RankingRuleEndIteration {
ranking_rule_idx: usize,
universe: RoaringBitmap,
time: Instant,
},
ExtendResults {
new: Vec<u32>,
@ -36,20 +42,27 @@ pub enum SearchEvents {
},
ProximityState {
graph: RankingRuleGraph<ProximityGraph>,
paths: PathsMap<u64>,
paths: Vec<Vec<u32>>,
empty_paths_cache: EmptyPathsCache,
universe: RoaringBitmap,
distances: Vec<Vec<u64>>,
cost: u64,
},
TypoState {
graph: RankingRuleGraph<TypoGraph>,
paths: PathsMap<u64>,
paths: Vec<Vec<u32>>,
empty_paths_cache: EmptyPathsCache,
universe: RoaringBitmap,
distances: Vec<Vec<u64>>,
cost: u64,
},
RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap },
RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant, },
}
pub struct DetailedSearchLogger {
folder_path: PathBuf,
initial_query: Option<QueryGraph>,
initial_query_time: Option<Instant>,
initial_universe: Option<RoaringBitmap>,
ranking_rules_ids: Option<Vec<String>>,
events: Vec<SearchEvents>,
@ -58,17 +71,19 @@ impl DetailedSearchLogger {
pub fn new(folder_path: &str) -> Self {
Self {
folder_path: PathBuf::new().join(folder_path),
initial_query: <_>::default(),
initial_universe: <_>::default(),
ranking_rules_ids: <_>::default(),
events: <_>::default(),
initial_query: None,
initial_query_time: None,
initial_universe: None,
ranking_rules_ids: None,
events: vec![],
}
}
}
impl SearchLogger<QueryGraph> for DetailedSearchLogger {
fn initial_query(&mut self, query: &QueryGraph) {
fn initial_query(&mut self, query: &QueryGraph, time: Instant) {
self.initial_query = Some(query.clone());
self.initial_query_time = Some(time);
}
fn initial_universe(&mut self, universe: &RoaringBitmap) {
@ -84,11 +99,13 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
_ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
query: &QueryGraph,
universe: &RoaringBitmap,
time: Instant,
) {
self.events.push(SearchEvents::RankingRuleStartIteration {
ranking_rule_idx,
query: query.clone(),
universe: universe.clone(),
time,
})
}
@ -97,10 +114,14 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
universe: &RoaringBitmap,
candidates: &RoaringBitmap,
time: Instant,
) {
self.events.push(SearchEvents::RankingRuleNextBucket {
ranking_rule_idx,
universe: universe.clone(),
candidates: candidates.clone(),
time,
})
}
fn skip_bucket_ranking_rule<'transaction>(
@ -108,10 +129,12 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
candidates: &RoaringBitmap,
time: Instant,
) {
self.events.push(SearchEvents::RankingRuleSkipBucket {
ranking_rule_idx,
candidates: candidates.clone(),
time
})
}
@ -120,10 +143,12 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
universe: &RoaringBitmap,
time: Instant,
) {
self.events.push(SearchEvents::RankingRuleEndIteration {
ranking_rule_idx,
universe: universe.clone(),
time
})
}
fn add_to_results(&mut self, docids: &[u32]) {
@ -134,18 +159,19 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() });
}
fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph<ProximityGraph>, paths_map: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache) {
self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() })
fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph<ProximityGraph>, paths_map: &[Vec<u32>], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec<Vec<u64>>, cost: u64,) {
self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost })
}
fn log_typo_state(&mut self, query_graph: &RankingRuleGraph<TypoGraph>, paths_map: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache) {
self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() })
fn log_typo_state(&mut self, query_graph: &RankingRuleGraph<TypoGraph>, paths_map: &[Vec<u32>], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec<Vec<u64>>, cost: u64,) {
self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost })
}
}
impl DetailedSearchLogger {
pub fn write_d2_description(&self) {
let mut prev_time = self.initial_query_time.unwrap();
let mut timestamp = vec![];
fn activated_id(timestamp: &[usize]) -> String {
let mut s = String::new();
@ -164,13 +190,16 @@ impl DetailedSearchLogger {
writeln!(&mut file, "{idx}: {rr_id}").unwrap();
}
writeln!(&mut file, "results").unwrap();
// writeln!(&mut file, "time").unwrap();
for event in self.events.iter() {
match event {
SearchEvents::RankingRuleStartIteration { ranking_rule_idx, .. } => {
SearchEvents::RankingRuleStartIteration { ranking_rule_idx, time, .. } => {
let elapsed = time.duration_since(prev_time);
prev_time = *time;
let parent_activated_id = activated_id(&timestamp);
timestamp.push(0);
let self_activated_id = activated_id(&timestamp);
// writeln!(&mut file, "time.{self_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
if *ranking_rule_idx != 0 {
let parent_ranking_rule_idx = ranking_rule_idx - 1;
writeln!(
@ -186,16 +215,22 @@ impl DetailedSearchLogger {
}}
}}").unwrap();
}
SearchEvents::RankingRuleNextBucket { ranking_rule_idx, .. } => {
SearchEvents::RankingRuleNextBucket { ranking_rule_idx, time, universe, candidates } => {
let elapsed = time.duration_since(prev_time);
prev_time = *time;
let old_activated_id = activated_id(&timestamp);
// writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
*timestamp.last_mut().unwrap() += 1;
let next_activated_id = activated_id(&timestamp);
writeln!(&mut file,
"{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket",)
"{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : next bucket {}/{}", candidates.len(), universe.len())
.unwrap();
}
SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates } => {
SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates, time } => {
let elapsed = time.duration_since(prev_time);
prev_time = *time;
let old_activated_id = activated_id(&timestamp);
// writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
*timestamp.last_mut().unwrap() += 1;
let next_activated_id = activated_id(&timestamp);
let len = candidates.len();
@ -203,8 +238,12 @@ impl DetailedSearchLogger {
"{ranking_rule_idx}.{old_activated_id} -> {ranking_rule_idx}.{next_activated_id} : skip bucket ({len})",)
.unwrap();
}
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, .. } => {
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, time, .. } => {
let elapsed = time.duration_since(prev_time);
prev_time = *time;
let cur_activated_id = activated_id(&timestamp);
// writeln!(&mut file, "time.{cur_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
timestamp.pop();
let parent_activated_id = activated_id(&timestamp);
let parent_ranking_rule = if *ranking_rule_idx == 0 {
@ -254,43 +293,48 @@ results.{random} {{
link: \"{id}.d2.svg\"
}}").unwrap();
},
SearchEvents::ProximityState { graph, paths, empty_paths_cache } => {
SearchEvents::ProximityState { graph, paths, empty_paths_cache, universe, distances, cost } => {
let cur_ranking_rule = timestamp.len() - 1;
let cur_activated_id = activated_id(&timestamp);
let id = format!("{cur_ranking_rule}.{cur_activated_id}");
let new_file_path = self.folder_path.join(format!("{id}.d2"));
let mut new_file = std::fs::File::create(new_file_path).unwrap();
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
writeln!(
&mut file,
"{id} {{
link: \"{id}.d2.svg\"
}}").unwrap();
tooltip: \"cost {cost}, universe len: {}\"
}}", universe.len()).unwrap();
},
SearchEvents::TypoState { graph, paths, empty_paths_cache } => {
SearchEvents::TypoState { graph, paths, empty_paths_cache, universe, distances, cost } => {
let cur_ranking_rule = timestamp.len() - 1;
let cur_activated_id = activated_id(&timestamp);
let id = format!("{cur_ranking_rule}.{cur_activated_id}");
let new_file_path = self.folder_path.join(format!("{id}.d2"));
let mut new_file = std::fs::File::create(new_file_path).unwrap();
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
writeln!(
&mut file,
"{id} {{
link: \"{id}.d2.svg\"
}}").unwrap();
tooltip: \"cost {cost}, universe len: {}\"
}}", universe.len()).unwrap();
},
}
}
writeln!(&mut file, "}}").unwrap();
}
fn query_node_d2_desc(node_idx: usize, node: &QueryNode, file: &mut File) {
fn query_node_d2_desc(node_idx: usize, node: &QueryNode, distances: &[u64], file: &mut File) {
match &node {
QueryNode::Term(LocatedQueryTerm { value, .. }) => {
match value {
QueryTerm::Phrase(_) => todo!(),
QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db } } => {
QueryTerm::Phrase { phrase } => {
let phrase_str = phrase.description();
writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap();
},
QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => {
writeln!(file,"{node_idx} : \"{original}\" {{
shape: class").unwrap();
for w in zero_typo {
@ -302,9 +346,19 @@ shape: class").unwrap();
for w in two_typos {
writeln!(file, "\"{w}\" : 2").unwrap();
}
if let Some((left, right)) = split_words {
writeln!(file, "\"{left} {right}\" : split_words").unwrap();
}
for synonym in synonyms {
writeln!(file, "\"{}\" : synonym", synonym.description()).unwrap();
}
if *use_prefix_db {
writeln!(file, "use prefix DB : true").unwrap();
}
// for (i, d) in distances.iter().enumerate() {
// writeln!(file, "\"distances\" : {d}").unwrap();
// }
writeln!(file, "}}").unwrap();
},
}
@ -324,14 +378,14 @@ shape: class").unwrap();
if matches!(query_graph.nodes[node], QueryNode::Deleted) {
continue;
}
Self::query_node_d2_desc(node, &query_graph.nodes[node], file);
Self::query_node_d2_desc(node, &query_graph.nodes[node], &[], file);
for edge in query_graph.edges[node].successors.iter() {
writeln!(file, "{node} -> {edge};\n").unwrap();
}
}
}
fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache, file: &mut File) {
fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], _empty_paths_cache: &EmptyPathsCache, distances: Vec<Vec<u64>>, file: &mut File) {
writeln!(file,"direction: right").unwrap();
writeln!(file, "Proximity Graph {{").unwrap();
@ -339,7 +393,8 @@ shape: class").unwrap();
if matches!(node, QueryNode::Deleted) {
continue;
}
Self::query_node_d2_desc(node_idx, node, file);
let distances = &distances[node_idx];
Self::query_node_d2_desc(node_idx, node, distances.as_slice(), file);
}
for edge in graph.all_edges.iter().flatten() {
let Edge { from_node, to_node, details, .. } = edge;
@ -362,26 +417,39 @@ shape: class").unwrap();
}
writeln!(file, "}}").unwrap();
// writeln!(file, "Distances {{").unwrap();
// Self::paths_d2_description(graph, paths, file);
// writeln!(file, "}}").unwrap();
writeln!(file, "Shortest Paths {{").unwrap();
Self::paths_d2_description(graph, "", paths, file);
Self::paths_d2_description(graph, paths, file);
writeln!(file, "}}").unwrap();
writeln!(file, "Empty Path Prefixes {{").unwrap();
Self::paths_d2_description(graph, "", &empty_paths_cache.empty_prefixes, file);
writeln!(file, "}}").unwrap();
// writeln!(file, "Empty Edge Couples {{").unwrap();
// for (i, (e1, e2)) in empty_paths_cache.empty_couple_edges.iter().enumerate() {
// writeln!(file, "{i} : \"\" {{").unwrap();
// Self::edge_d2_description(graph, *e1, file);
// Self::edge_d2_description(graph, *e2, file);
// writeln!(file, "{e1} -- {e2}").unwrap();
// writeln!(file, "}}").unwrap();
// }
// writeln!(file, "}}").unwrap();
writeln!(file, "Removed Edges {{").unwrap();
for edge_idx in empty_paths_cache.empty_edges.iter() {
writeln!(file, "{edge_idx}").unwrap();
}
writeln!(file, "}}").unwrap();
// writeln!(file, "Removed Edges {{").unwrap();
// for edge_idx in empty_paths_cache.empty_edges.iter() {
// writeln!(file, "{edge_idx}").unwrap();
// }
// writeln!(file, "}}").unwrap();
}
fn edge_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths_idx: &str, edge_idx: u32, file: &mut File) -> String {
fn edge_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, edge_idx: u32, file: &mut File) {
let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ;
let from_node = &graph.query_graph.nodes[*from_node as usize];
let from_node_desc = match from_node {
QueryNode::Term(term) => match &term.value {
QueryTerm::Phrase(_) => todo!(),
QueryTerm::Phrase { phrase } => {
phrase.description()
},
QueryTerm::Word { derivations } => derivations.original.clone(),
},
QueryNode::Deleted => panic!(),
@ -391,27 +459,28 @@ shape: class").unwrap();
let to_node = &graph.query_graph.nodes[*to_node as usize];
let to_node_desc = match to_node {
QueryNode::Term(term) => match &term.value {
QueryTerm::Phrase(_) => todo!(),
QueryTerm::Phrase { phrase } => phrase.description(),
QueryTerm::Word { derivations } => derivations.original.clone(),
},
QueryNode::Deleted => panic!(),
QueryNode::Start => "START".to_owned(),
QueryNode::End => "END".to_owned(),
};
let edge_id = format!("{paths_idx}{edge_idx}");
writeln!(file, "{edge_id}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{
writeln!(file, "{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{
shape: class
}}").unwrap();
edge_id
}
fn paths_d2_description<R: RankingRuleGraphTrait, T>(graph: &RankingRuleGraph<R>, paths_idx: &str, paths: &PathsMap<T>, file: &mut File) {
for (edge_idx, rest) in paths.nodes.iter() {
let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file);
for (dest_edge_idx, _) in rest.nodes.iter() {
let dest_edge_id = format!("{paths_idx}{edge_idx}{dest_edge_idx}");
writeln!(file, "{edge_id} -> {dest_edge_id}").unwrap();
fn paths_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], file: &mut File) {
for (path_idx, edge_indexes) in paths.iter().enumerate() {
writeln!(file, "{path_idx} {{").unwrap();
for edge_idx in edge_indexes.iter() {
Self::edge_d2_description(graph, *edge_idx, file);
}
Self::paths_d2_description(graph, &format!("{paths_idx}{edge_idx}"), rest, file);
for couple_edges in edge_indexes.windows(2) {
let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() };
writeln!(file, "{src_edge_idx} -> {dest_edge_idx}").unwrap();
}
writeln!(file, "}}").unwrap();
}
}
}

View File

@ -2,28 +2,31 @@
pub mod detailed;
use roaring::RoaringBitmap;
use std::time::Instant;
use super::{
ranking_rule_graph::{
empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph,
typo::TypoGraph, RankingRuleGraph,
empty_paths_cache::EmptyPathsCache, proximity::ProximityGraph, typo::TypoGraph,
RankingRuleGraph,
},
RankingRule, RankingRuleQueryTrait,
};
pub struct DefaultSearchLogger;
impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
fn initial_query(&mut self, _query: &Q) {}
fn initial_query(&mut self, _query: &Q, _time: Instant) {}
fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
fn ranking_rules(&mut self, _rr: &[&mut dyn RankingRule<Q>]) {}
fn start_iteration_ranking_rule<'transaction>(
&mut self,
_ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'transaction, Q>,
_query: &Q,
_universe: &RoaringBitmap,
_time: Instant,
) {
}
@ -32,6 +35,8 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'transaction, Q>,
_universe: &RoaringBitmap,
_candidates: &RoaringBitmap,
_time: Instant,
) {
}
fn skip_bucket_ranking_rule<'transaction>(
@ -39,6 +44,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'transaction, Q>,
_candidates: &RoaringBitmap,
_time: Instant,
) {
}
@ -47,6 +53,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<'transaction, Q>,
_universe: &RoaringBitmap,
_time: Instant,
) {
}
@ -57,22 +64,28 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
fn log_proximity_state(
&mut self,
_query_graph: &RankingRuleGraph<ProximityGraph>,
_paths_map: &PathsMap<u64>,
_paths_map: &[Vec<u32>],
_empty_paths_cache: &EmptyPathsCache,
_universe: &RoaringBitmap,
_distances: Vec<Vec<u64>>,
_cost: u64,
) {
}
fn log_typo_state(
&mut self,
query_graph: &RankingRuleGraph<TypoGraph>,
paths: &PathsMap<u64>,
empty_paths_cache: &EmptyPathsCache,
_query_graph: &RankingRuleGraph<TypoGraph>,
_paths: &[Vec<u32>],
_empty_paths_cache: &EmptyPathsCache,
_universe: &RoaringBitmap,
_distances: Vec<Vec<u64>>,
_cost: u64,
) {
}
}
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
fn initial_query(&mut self, query: &Q);
fn initial_query(&mut self, query: &Q, time: Instant);
fn initial_universe(&mut self, universe: &RoaringBitmap);
fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
@ -83,24 +96,29 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
ranking_rule: &dyn RankingRule<'transaction, Q>,
query: &Q,
universe: &RoaringBitmap,
time: Instant,
);
fn next_bucket_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
universe: &RoaringBitmap,
candidates: &RoaringBitmap,
time: Instant,
);
fn skip_bucket_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
candidates: &RoaringBitmap,
time: Instant,
);
fn end_iteration_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
universe: &RoaringBitmap,
time: Instant,
);
fn add_to_results(&mut self, docids: &[u32]);
@ -109,14 +127,20 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
fn log_proximity_state(
&mut self,
query_graph: &RankingRuleGraph<ProximityGraph>,
paths: &PathsMap<u64>,
paths: &[Vec<u32>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
_distances: Vec<Vec<u64>>,
cost: u64,
);
fn log_typo_state(
&mut self,
query_graph: &RankingRuleGraph<TypoGraph>,
paths: &PathsMap<u64>,
paths: &[Vec<u32>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
_distances: Vec<Vec<u64>>,
cost: u64,
);
}

View File

@ -1,10 +1,8 @@
use std::collections::{BTreeMap, HashSet};
use roaring::RoaringBitmap;
#![allow(clippy::too_many_arguments)]
use super::empty_paths_cache::EmptyPathsCache;
use super::paths_map::PathsMap;
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
use super::{RankingRuleGraph, RankingRuleGraphTrait};
use std::collections::VecDeque;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Path {
@ -12,226 +10,119 @@ pub struct Path {
pub cost: u64,
}
struct DijkstraState {
unvisited: RoaringBitmap, // should be a small bitset?
distances: Vec<u64>, // or binary heap, or btreemap? (f64, usize)
edges: Vec<u32>,
edge_costs: Vec<u8>,
paths: Vec<Option<u32>>,
}
pub struct KCheapestPathsState {
cheapest_paths: PathsMap<u64>,
potential_cheapest_paths: BTreeMap<u64, PathsMap<u64>>,
pub kth_cheapest_path: Path,
}
impl KCheapestPathsState {
pub fn next_cost(&self) -> u64 {
self.kth_cheapest_path.cost
}
pub fn new<G: RankingRuleGraphTrait>(
graph: &RankingRuleGraph<G>,
) -> Option<KCheapestPathsState> {
let Some(cheapest_path) = graph.cheapest_path_to_end(graph.query_graph.root_node) else {
return None
};
let cheapest_paths = PathsMap::from_paths(&[cheapest_path.clone()]);
let potential_cheapest_paths = BTreeMap::new();
Some(KCheapestPathsState {
cheapest_paths,
potential_cheapest_paths,
kth_cheapest_path: cheapest_path,
})
}
pub fn remove_empty_paths(mut self, empty_paths_cache: &EmptyPathsCache) -> Option<Self> {
self.cheapest_paths.remove_edges(&empty_paths_cache.empty_edges);
self.cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
let mut costs_to_delete = HashSet::new();
for (cost, potential_cheapest_paths) in self.potential_cheapest_paths.iter_mut() {
potential_cheapest_paths.remove_edges(&empty_paths_cache.empty_edges);
potential_cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
if potential_cheapest_paths.is_empty() {
costs_to_delete.insert(*cost);
}
}
for cost in costs_to_delete {
self.potential_cheapest_paths.remove(&cost);
}
if self.cheapest_paths.is_empty() {}
todo!()
}
pub fn compute_paths_of_next_lowest_cost<G: RankingRuleGraphTrait>(
mut self,
graph: &mut RankingRuleGraph<G>,
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
pub fn paths_of_cost(
&self,
from: usize,
cost: u64,
all_distances: &[Vec<u64>],
empty_paths_cache: &EmptyPathsCache,
into_map: &mut PathsMap<u64>,
) -> Option<Self> {
if !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) {
into_map.add_path(&self.kth_cheapest_path);
) -> Vec<Vec<u32>> {
let mut paths = vec![];
self.paths_of_cost_rec(
from,
all_distances,
cost,
&mut vec![],
&mut paths,
&vec![false; self.all_edges.len()],
empty_paths_cache,
);
paths
}
pub fn paths_of_cost_rec(
&self,
from: usize,
all_distances: &[Vec<u64>],
cost: u64,
prev_edges: &mut Vec<u32>,
paths: &mut Vec<Vec<u32>>,
forbidden_edges: &[bool],
empty_paths_cache: &EmptyPathsCache,
) {
let distances = &all_distances[from];
if !distances.contains(&cost) {
panic!();
}
let cur_cost = self.kth_cheapest_path.cost;
while self.kth_cheapest_path.cost <= cur_cost {
if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) {
self = next_self;
if self.kth_cheapest_path.cost == cur_cost
&& !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges)
let tos = &self.query_graph.edges[from].successors;
let mut valid_edges = vec![];
for to in tos {
self.visit_edges::<()>(from as u32, to, |edge_idx, edge| {
if cost >= edge.cost as u64
&& all_distances[to as usize].contains(&(cost - edge.cost as u64))
&& !forbidden_edges[edge_idx as usize]
{
into_map.add_path(&self.kth_cheapest_path);
} else {
break;
valid_edges.push((edge_idx, edge.cost, to));
}
} else {
return None;
}
std::ops::ControlFlow::Continue(())
});
}
Some(self)
}
fn compute_next_cheapest_paths<G: RankingRuleGraphTrait>(
mut self,
graph: &mut RankingRuleGraph<G>,
empty_paths_cache: &EmptyPathsCache,
) -> Option<KCheapestPathsState> {
// for all nodes in the last cheapest path (called spur_node), except last one...
for (i, edge_idx) in self.kth_cheapest_path.edges[..self.kth_cheapest_path.edges.len() - 1]
.iter()
.enumerate()
{
let Some(edge) = graph.all_edges[*edge_idx as usize].as_ref() else { continue; };
let Edge { from_node: spur_node, .. } = edge;
let root_path = &self.kth_cheapest_path.edges[..i];
if empty_paths_cache.path_is_empty(root_path) {
for (edge_idx, edge_cost, to) in valid_edges {
prev_edges.push(edge_idx);
if empty_paths_cache.empty_prefixes.contains_prefix_of_path(prev_edges) {
continue;
}
let root_cost = root_path.iter().fold(0, |sum, next| {
sum + graph.all_edges[*next as usize].as_ref().unwrap().cost as u64
});
let mut tmp_removed_edges = vec![];
// for all the paths already found that share a common prefix with the root path
// we delete the edge from the spur node to the next one
for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) {
let was_removed =
graph.node_edges[*spur_node as usize].remove(edge_index_to_remove);
if was_removed {
tmp_removed_edges.push(edge_index_to_remove);
}
let mut new_forbidden_edges = forbidden_edges.to_vec();
for edge_idx in empty_paths_cache.empty_couple_edges[edge_idx as usize].iter() {
new_forbidden_edges[*edge_idx as usize] = true;
}
for edge_idx in empty_paths_cache.empty_prefixes.final_edges_ater_prefix(prev_edges) {
new_forbidden_edges[edge_idx as usize] = true;
}
// Compute the cheapest path from the spur node to the destination
// we will combine it with the root path to get a potential kth cheapest path
let spur_path = graph.cheapest_path_to_end(*spur_node);
// restore the temporarily removed edges
graph.node_edges[*spur_node as usize].extend(tmp_removed_edges);
let Some(spur_path) = spur_path else { continue; };
let total_cost = root_cost + spur_path.cost;
let total_path = Path {
edges: root_path.iter().chain(spur_path.edges.iter()).cloned().collect(),
cost: total_cost,
};
let entry = self.potential_cheapest_paths.entry(total_cost).or_default();
entry.add_path(&total_path);
if to == self.query_graph.end_node {
paths.push(prev_edges.clone());
} else {
self.paths_of_cost_rec(
to as usize,
all_distances,
cost - edge_cost as u64,
prev_edges,
paths,
&new_forbidden_edges,
empty_paths_cache,
)
}
prev_edges.pop();
}
while let Some(mut next_cheapest_paths_entry) = self.potential_cheapest_paths.first_entry()
}
pub fn initialize_distances_cheapest(&self) -> Vec<Vec<u64>> {
let mut distances_to_end: Vec<Vec<u64>> = vec![vec![]; self.query_graph.nodes.len()];
let mut enqueued = vec![false; self.query_graph.nodes.len()];
let mut node_stack = VecDeque::new();
distances_to_end[self.query_graph.end_node as usize] = vec![0];
for prev_node in
self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter()
{
let cost = *next_cheapest_paths_entry.key();
let next_cheapest_paths = next_cheapest_paths_entry.get_mut();
node_stack.push_back(prev_node as usize);
enqueued[prev_node as usize] = true;
}
while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() {
assert_eq!(cost, cost2);
// NOTE: it is important not to discard the paths that are forbidden due to a
// forbidden prefix, because the cheapest path algorithm (Dijkstra) cannot take
// this property into account.
if next_cheapest_path
.iter()
.any(|edge_index| graph.all_edges[*edge_index as usize].is_none())
{
continue;
} else {
self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost);
if next_cheapest_paths.is_empty() {
next_cheapest_paths_entry.remove();
while let Some(cur_node) = node_stack.pop_front() {
let mut self_distances = vec![];
for succ_node in self.query_graph.edges[cur_node].successors.iter() {
let succ_distances = &distances_to_end[succ_node as usize];
let _ = self.visit_edges::<()>(cur_node as u32, succ_node, |_, edge| {
for succ_distance in succ_distances {
self_distances.push(edge.cost as u64 + succ_distance);
}
self.kth_cheapest_path = Path { edges: next_cheapest_path, cost };
return Some(self);
std::ops::ControlFlow::Continue(())
});
}
self_distances.sort_unstable();
self_distances.dedup();
distances_to_end[cur_node] = self_distances;
for prev_node in self.query_graph.edges[cur_node].predecessors.iter() {
if !enqueued[prev_node as usize] {
node_stack.push_back(prev_node as usize);
enqueued[prev_node as usize] = true;
}
}
let _ = next_cheapest_paths_entry.remove_entry();
}
None
}
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
/// Searches for the cheapest path going from `from` to the end node of the query graph.
///
/// This is a Dijkstra-style search: at each step the unvisited node with the smallest
/// known distance is selected (by a linear scan over the unvisited set), and the
/// distances of its unvisited successors are relaxed using `cheapest_edge`.
///
/// Returns `None` when the closest remaining node is still at distance `u64::MAX`,
/// i.e. when the end node cannot be reached from `from`. Otherwise returns the edge
/// indices of the path, ordered from `from` to the end node, along with its total cost.
fn cheapest_path_to_end(&self, from: u32) -> Option<Path> {
    let end_node = self.query_graph.end_node;
    let nodes_len = self.query_graph.nodes.len();

    let mut state = DijkstraState {
        unvisited: (0..nodes_len as u32).collect(),
        distances: vec![u64::MAX; nodes_len],
        edges: vec![u32::MAX; nodes_len],
        edge_costs: vec![u8::MAX; nodes_len],
        paths: vec![None; nodes_len],
    };
    state.distances[from as usize] = 0;

    // TODO: could use a binary heap here to store the distances, or a btreemap
    loop {
        let closest = state.unvisited.iter().min_by_key(|&node| state.distances[node as usize]);
        let Some(cur_node) = closest else { break };

        let cur_dist = state.distances[cur_node as usize];
        if cur_dist == u64::MAX {
            // Every remaining node, the end node included, is unreachable.
            return None;
        }
        if cur_node == end_node {
            break;
        }

        // Only the successors that have not been settled yet can be improved.
        for succ in &self.successors[cur_node as usize] & &state.unvisited {
            let Some((edge, edge_cost)) = self.cheapest_edge(cur_node, succ) else {
                continue
            };
            let candidate = cur_dist + edge_cost as u64;
            if candidate < state.distances[succ as usize] {
                state.distances[succ as usize] = candidate;
                state.edges[succ as usize] = edge;
                state.edge_costs[succ as usize] = edge_cost;
                state.paths[succ as usize] = Some(cur_node);
            }
        }
        state.unvisited.remove(cur_node);
    }

    // Follow the predecessor links back from the end node; the edges are therefore
    // collected in reverse order and flipped afterwards.
    let mut reversed_edges = vec![];
    let mut cur = end_node;
    while let Some(prev) = state.paths[cur as usize] {
        reversed_edges.push(state.edges[cur as usize]);
        cur = prev;
    }

    Some(Path {
        edges: reversed_edges.into_iter().rev().collect(),
        cost: state.distances[end_node as usize],
    })
}
/// Looks up an edge going from `cur_node` to `succ` and returns its index and cost.
///
/// The closure breaks out of `visit_edges` on the very first edge it is given, so the
/// result is whichever edge `visit_edges` yields first.
// NOTE(review): presumably `visit_edges` yields edges by increasing cost, which would
// make that first edge the cheapest one — confirm against `visit_edges`.
pub fn cheapest_edge(&self, cur_node: u32, succ: u32) -> Option<(u32, u8)> {
self.visit_edges(cur_node, succ, |edge_idx, edge| {
std::ops::ControlFlow::Break((edge_idx, edge.cost))
})
// NOTE(review): the bare `distances_to_end` expression below does not refer to anything
// defined in this function; it looks like a stray line — confirm and remove.
distances_to_end
}
}

View File

@ -32,16 +32,19 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
db_cache: &mut DatabaseCache<'transaction>,
edge_index: u32,
graph: &RankingRuleGraph<G>,
// TODO: maybe universe doesn't belong here
universe: &RoaringBitmap,
) -> Result<BitmapOrAllRef<'s>> {
if self.cache.contains_key(&edge_index) {
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
}
let edge = graph.all_edges[edge_index as usize].as_ref().unwrap();
match &edge.details {
EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All),
EdgeDetails::Data(details) => {
let docids = G::compute_docids(index, txn, db_cache, details)?;
if self.cache.contains_key(&edge_index) {
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
}
// TODO: maybe universe doesn't belong here
let docids = universe & G::compute_docids(index, txn, db_cache, details)?;
let _ = self.cache.insert(edge_index, docids);
let docids = &self.cache[&edge_index];

View File

@ -1,26 +1,60 @@
use roaring::RoaringBitmap;
use super::paths_map::PathsMap;
#[derive(Default, Clone)]
#[derive(Clone)]
pub struct EmptyPathsCache {
pub empty_edges: RoaringBitmap,
pub empty_edges: Vec<bool>,
pub empty_prefixes: PathsMap<()>,
pub empty_couple_edges: Vec<Vec<u32>>,
}
impl EmptyPathsCache {
/// Creates a cache sized for `all_edges_len` edges in which no edge, prefix,
/// or couple of edges is marked as empty yet.
pub fn new(all_edges_len: usize) -> Self {
    let empty_edges = vec![false; all_edges_len];
    let empty_couple_edges = vec![Vec::new(); all_edges_len];
    Self { empty_edges, empty_prefixes: PathsMap::default(), empty_couple_edges }
}
/// Records that `edge_idx` is empty and drops the other cached entries that mention it:
/// its own list in `empty_couple_edges`, its occurrences in `empty_prefixes`, and its
/// occurrences in the `empty_couple_edges` list of every other edge.
pub fn forbid_edge(&mut self, edge_idx: u32) {
// NOTE(review): this call treats `empty_edges` as a set while the next line indexes it
// as a list of flags — both cannot be right at once; confirm which one is intended.
self.empty_edges.insert(edge_idx);
self.empty_edges[edge_idx as usize] = true;
// The edge is now forbidden on its own, so its list of forbidden follow-up edges is reset.
self.empty_couple_edges[edge_idx as usize] = vec![];
self.empty_prefixes.remove_edge(&edge_idx);
// Remove the edge from the follow-up list of every edge. Only the first occurrence is
// removed; `forbid_couple_edges` asserts that a list never holds the same edge twice.
for edges2 in self.empty_couple_edges.iter_mut() {
if let Some(edge2_pos) = edges2.iter().position(|e| *e == edge_idx) {
edges2.swap_remove(edge2_pos);
}
}
}
/// Registers `prefix` in `empty_prefixes`, so that any path starting with these
/// edges is reported as empty by `path_is_empty`.
pub fn forbid_prefix(&mut self, prefix: &[u32]) {
    let edges = prefix.iter().copied();
    self.empty_prefixes.insert(edges, ());
}
/// Records that `edge2` must not appear after `edge1` in a path.
///
/// # Panics
///
/// Panics if the couple `(edge1, edge2)` was already recorded, or if `edge1` is out of
/// bounds for the cache.
pub fn forbid_couple_edges(&mut self, edge1: u32, edge2: u32) {
    assert!(!self.empty_couple_edges[edge1 as usize].contains(&edge2));
    let forbidden_after_edge1 = &mut self.empty_couple_edges[edge1 as usize];
    forbidden_after_edge1.push(edge2);
}
/// Returns `true` when the cache already knows that `path` is empty.
///
/// Three checks are made in order: the path contains an edge flagged in `empty_edges`,
/// the path starts with a prefix stored in `empty_prefixes`, or the path contains an
/// edge together with one of the edges recorded for it in `empty_couple_edges`.
pub fn path_is_empty(&self, path: &[u32]) -> bool {
for edge in path {
// NOTE(review): the two `if` lines below test the same thing through a set API and
// through indexing, and only one of them is closed — confirm which one is intended.
if self.empty_edges.contains(*edge) {
if self.empty_edges[*edge as usize] {
return true;
}
}
if self.empty_prefixes.contains_prefix_of_path(path) {
return true;
}
// `edge1` is the index into `empty_couple_edges`; `edges2` lists the edges forbidden with it.
for (edge1, edges2) in self.empty_couple_edges.iter().enumerate() {
if let Some(pos_edge1) = path.iter().position(|e| *e == edge1 as u32) {
// Only the part of the path starting at the first occurrence of `edge1` is inspected.
if path[pos_edge1..].iter().any(|e| edges2.contains(e)) {
return true;
}
}
}
// for (edge1, edge2) in self.empty_couple_edges.iter() {
// if path.contains(edge1) && path.contains(edge2) {
// return true;
// }
// }
// if self.empty_prefixes.contains_prefix_of_path(path) {
// return true;
// }
false
}
}

View File

@ -13,7 +13,6 @@ use heed::RoTxn;
use roaring::RoaringBitmap;
use self::empty_paths_cache::EmptyPathsCache;
use self::paths_map::PathsMap;
use super::db_cache::DatabaseCache;
use super::logger::SearchLogger;
@ -83,8 +82,11 @@ pub trait RankingRuleGraphTrait: Sized {
fn log_state(
graph: &RankingRuleGraph<Self>,
paths: &PathsMap<u64>,
paths: &[Vec<u32>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: &[Vec<u64>],
cost: u64,
logger: &mut dyn SearchLogger<QueryGraph>,
);
}
@ -135,7 +137,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
None
}
fn remove_edge(&mut self, edge_index: u32) {
pub fn remove_edge(&mut self, edge_index: u32) {
let edge_opt = &mut self.all_edges[edge_index as usize];
let Some(edge) = &edge_opt else { return };
let (from_node, _to_node) = (edge.from_node, edge.to_node);
@ -151,44 +153,4 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
}
self.successors[from_node as usize] = new_successors_from_node;
}
pub fn graphviz(&self) -> String {
let mut desc = String::new();
desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");
for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
if matches!(node, QueryNode::Deleted) {
continue;
}
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
if node_idx == self.query_graph.root_node as usize {
desc.push_str("[color = blue]");
} else if node_idx == self.query_graph.end_node as usize {
desc.push_str("[color = red]");
}
desc.push_str(";\n");
}
for edge in self.all_edges.iter().flatten() {
let Edge { from_node, to_node, details, .. } = edge;
match &details {
EdgeDetails::Unconditional => {
desc.push_str(&format!(
"{from_node} -> {to_node} [label = \"always cost {cost}\"];\n",
cost = edge.cost,
));
}
EdgeDetails::Data(details) => {
desc.push_str(&format!(
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n",
cost = edge.cost,
edge_label = G::graphviz_edge_details_label(details)
));
}
}
}
desc.push('}');
desc
}
}

View File

@ -1,12 +1,11 @@
use std::collections::hash_map::DefaultHasher;
use std::fmt::Write;
use std::hash::{Hash, Hasher};
use roaring::RoaringBitmap;
use super::cheapest_paths::Path;
use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::QueryNode;
#[derive(Debug, Clone)]
pub struct PathsMap<V> {
@ -157,6 +156,24 @@ impl<V> PathsMap<V> {
}
}
/// Returns the edges that complete a stored path when appended to `prefix`.
///
/// The map is walked down along `prefix`, one edge per level. Once the prefix is
/// exhausted, the result is every edge at that level whose child node holds a value.
/// An empty vector is returned when `prefix` does not lead anywhere in the map.
pub fn final_edges_ater_prefix(&self, prefix: &[u32]) -> Vec<u32> {
    match prefix {
        [] => self
            .nodes
            .iter()
            .filter(|(_, child)| child.value.is_some())
            .map(|(edge, _)| *edge)
            .collect(),
        [first_edge, remaining_prefix @ ..] => {
            // Descend into the first child registered under `first_edge`, if any.
            match self.nodes.iter().find(|(edge, _)| edge == first_edge) {
                Some((_, child)) => child.final_edges_ater_prefix(remaining_prefix),
                None => vec![],
            }
        }
    }
}
pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec<u32> {
let [first_edge, remaining_prefix @ ..] = prefix else {
return self.nodes.iter().map(|n| n.0).collect();
@ -185,88 +202,4 @@ impl<V> PathsMap<V> {
}
}
}
/// Renders the paths stored in this map as a Graphviz `digraph` description, using
/// `graph` to resolve each edge index into its endpoints and cost.
pub fn graphviz<G: RankingRuleGraphTrait>(&self, graph: &RankingRuleGraph<G>) -> String {
    let mut desc = String::from("digraph G {\n");
    // The traversal starts from the empty path.
    self.graphviz_rec(&mut desc, Vec::new(), graph);
    desc += "\n}\n";
    desc
}
/// Appends to `desc` the Graphviz nodes and arrows for every path of this map.
///
/// `path_from` is the list of edge hashes leading to the current level. A node of the
/// drawing is identified by the hash of the whole list that leads to it, so that the
/// same edge reached through two different prefixes is drawn twice.
/// Edges that no longer exist in `graph` are skipped along with everything below them.
fn graphviz_rec<G: RankingRuleGraphTrait>(
    &self,
    desc: &mut String,
    path_from: Vec<u64>,
    graph: &RankingRuleGraph<G>,
) {
    /// Hashes `value` with a freshly created `DefaultHasher`.
    fn digest<T: Hash + ?Sized>(value: &T) -> u64 {
        let mut hasher = DefaultHasher::new();
        value.hash(&mut hasher);
        hasher.finish()
    }

    let id_from = digest(&path_from);
    for (edge_idx, rest) in self.nodes.iter() {
        let Some(Edge { from_node, to_node, cost, .. }) = graph.all_edges[*edge_idx as usize].as_ref() else {
            continue;
        };

        let mut path_to = path_from.clone();
        path_to.push(digest(edge_idx));
        let id_to = digest(&path_to);

        writeln!(desc, "{id_to} [label = \"{from_node}→{to_node} [{cost}]\"];").unwrap();
        writeln!(desc, "{id_from} -> {id_to};").unwrap();

        rest.graphviz_rec(desc, path_to, graph);
    }
}
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
    /// Renders the ranking rule graph as a Graphviz `digraph` description in which the
    /// edges belonging to `path` are drawn in red and all the other edges in green.
    ///
    /// Deleted query nodes and removed edges are left out. The root node is tagged blue
    /// and the end node red.
    pub fn graphviz_with_path(&self, path: &Path) -> String {
        let mut desc = String::from("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");

        for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
            if let QueryNode::Deleted = node {
                continue;
            }
            desc.push_str(&format!("{node_idx} [label = {:?}]", node));
            let is_root = node_idx == self.query_graph.root_node as usize;
            let is_end = node_idx == self.query_graph.end_node as usize;
            if is_root {
                desc.push_str("[color = blue]");
            } else if is_end {
                desc.push_str("[color = red]");
            }
            desc.push_str(";\n");
        }

        for (edge_idx, slot) in self.all_edges.iter().enumerate() {
            let Some(edge) = slot else { continue };
            let (from_node, to_node) = (edge.from_node, edge.to_node);
            let on_path = path.edges.contains(&(edge_idx as u32));
            let color = if on_path { "red" } else { "green" };
            let line = match &edge.details {
                EdgeDetails::Unconditional => format!(
                    "{from_node} -> {to_node} [label = \"cost {cost}\", color = {color}];\n",
                    cost = edge.cost,
                ),
                EdgeDetails::Data(details) => format!(
                    "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n",
                    cost = edge.cost,
                    edge_label = G::graphviz_edge_details_label(details),
                ),
            };
            desc.push_str(&line);
        }

        desc.push('}');
        desc
    }
}

View File

@ -16,9 +16,9 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => {
match value1 {
QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),
QueryTerm::Phrase(phrase1) => {
QueryTerm::Phrase { phrase: phrase1 } => {
// TODO: remove second unwrap
let original = phrase1.last().unwrap().as_ref().unwrap().clone();
let original = phrase1.words.last().unwrap().as_ref().unwrap().clone();
(
WordDerivations {
original: original.clone(),
@ -26,6 +26,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
one_typo: vec![],
two_typos: vec![],
use_prefix_db: false,
synonyms: vec![],
split_words: None,
},
*pos1.end(),
)
@ -39,6 +41,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
one_typo: vec![],
two_typos: vec![],
use_prefix_db: false,
synonyms: vec![],
split_words: None,
},
-100,
),
@ -63,9 +67,9 @@ pub fn visit_to_node<'transaction, 'from_data>(
let (derivations2, pos2, ngram_len2) = match value2 {
QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
QueryTerm::Phrase(phrase2) => {
QueryTerm::Phrase { phrase: phrase2 } => {
// TODO: remove second unwrap
let original = phrase2.last().unwrap().as_ref().unwrap().clone();
let original = phrase2.words.last().unwrap().as_ref().unwrap().clone();
(
WordDerivations {
original: original.clone(),
@ -73,6 +77,8 @@ pub fn visit_to_node<'transaction, 'from_data>(
one_typo: vec![],
two_typos: vec![],
use_prefix_db: false,
synonyms: vec![],
split_words: None,
},
*pos2.start(),
1,

View File

@ -2,18 +2,21 @@ pub mod build;
pub mod compute_docids;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::empty_paths_cache::EmptyPathsCache;
use super::paths_map::PathsMap;
use super::{EdgeDetails, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::logger::SearchLogger;
use crate::new::query_term::WordDerivations;
use crate::new::QueryNode;
use crate::new::{QueryGraph, QueryNode};
use crate::{Index, Result};
// TODO: intern the strings, refer to them by their pointer?
#[derive(Debug, Clone)]
pub enum WordPair {
// TODO: add WordsSwapped and WordPrefixSwapped case
Words { left: String, right: String },
WordsSwapped { left: String, right: String },
WordPrefix { left: String, right_prefix: String },
@ -22,6 +25,7 @@ pub enum WordPair {
#[derive(Clone)]
pub struct ProximityEdge {
// TODO: use a list of pointers to the word pairs instead?
pairs: Vec<WordPair>,
proximity: u8,
}
@ -67,10 +71,20 @@ impl RankingRuleGraphTrait for ProximityGraph {
fn log_state(
graph: &super::RankingRuleGraph<Self>,
paths: &PathsMap<u64>,
paths: &[Vec<u32>],
empty_paths_cache: &EmptyPathsCache,
logger: &mut dyn crate::new::logger::SearchLogger<crate::new::QueryGraph>,
universe: &RoaringBitmap,
distances: &[Vec<u64>],
cost: u64,
logger: &mut dyn SearchLogger<QueryGraph>,
) {
logger.log_proximity_state(graph, paths, empty_paths_cache);
logger.log_proximity_state(
graph,
paths,
empty_paths_cache,
universe,
distances.to_vec(),
cost,
);
}
}

View File

@ -5,7 +5,7 @@ use roaring::{MultiOps, RoaringBitmap};
use super::edge_docids_cache::EdgeDocidsCache;
use super::empty_paths_cache::EmptyPathsCache;
use super::paths_map::PathsMap;
use super::{RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
@ -21,44 +21,65 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
edge_docids_cache: &mut EdgeDocidsCache<G>,
empty_paths_cache: &mut EmptyPathsCache,
universe: &RoaringBitmap,
mut paths: PathsMap<u64>,
mut paths: Vec<Vec<u32>>,
) -> Result<RoaringBitmap> {
paths.sort_unstable();
let mut needs_filtering = false;
let mut path_bitmaps = vec![];
'path_loop: loop {
if needs_filtering {
for path in paths.iter_mut() {
if empty_paths_cache.path_is_empty(path) {
path.clear();
}
}
needs_filtering = false;
}
let Some(edge_indexes) = paths.pop() else {
break;
};
paths.remove_edges(&empty_paths_cache.empty_edges);
paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
if edge_indexes.is_empty() {
continue;
}
'path_loop: while let Some((edge_indexes, _)) = paths.remove_first() {
// if path is excluded, continue...
let mut processed_edges = vec![];
let mut path_bitmap = universe.clone();
let mut visited_edges = vec![];
let mut cached_edge_docids = vec![];
'edge_loop: for edge_index in edge_indexes {
processed_edges.push(edge_index);
let edge_docids =
edge_docids_cache.get_edge_docids(index, txn, db_cache, edge_index, self)?;
visited_edges.push(edge_index);
let edge_docids = edge_docids_cache
.get_edge_docids(index, txn, db_cache, edge_index, self, universe)?;
match edge_docids {
BitmapOrAllRef::Bitmap(edge_docids) => {
cached_edge_docids.push((edge_index, edge_docids.clone()));
let (_, edge_docids) = cached_edge_docids.last().unwrap();
if edge_docids.is_disjoint(universe) {
// 1. Store in the cache that this edge is empty for this universe
empty_paths_cache.forbid_edge(edge_index);
// 2. remove all the paths that contain this edge for this universe
paths.remove_edge(&edge_index);
// 3. remove this edge from the proximity graph
// 2. remove this edge from the proximity graph
self.remove_edge(edge_index);
// 4. continue executing this function again on the remaining paths
edge_docids_cache.cache.remove(&edge_index);
needs_filtering = true;
// 3. continue executing this function again on the remaining paths
continue 'path_loop;
} else {
path_bitmap &= edge_docids;
if path_bitmap.is_disjoint(universe) {
// 1. Store in the cache that this prefix is empty for this universe
empty_paths_cache
.empty_prefixes
.insert(processed_edges.iter().copied(), ());
// 2. remove all the paths beginning with this prefix
paths.remove_prefix(&processed_edges);
// 3. continue executing this function again on the remaining paths?
needs_filtering = true;
empty_paths_cache.forbid_prefix(&visited_edges);
// if the intersection between this edge and any
// previous one is disjoint with the universe,
// then we add these two edges to the empty_path_cache
for (edge_index2, edge_docids2) in
cached_edge_docids[..cached_edge_docids.len() - 1].iter()
{
let intersection = edge_docids & edge_docids2;
if intersection.is_disjoint(universe) {
empty_paths_cache
.forbid_couple_edges(*edge_index2, edge_index);
}
}
continue 'path_loop;
}
}
@ -68,6 +89,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
}
path_bitmaps.push(path_bitmap);
}
Ok(MultiOps::union(path_bitmaps))
}
}

View File

@ -2,16 +2,18 @@ use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;
use super::empty_paths_cache::EmptyPathsCache;
use super::paths_map::PathsMap;
use super::{EdgeDetails, RankingRuleGraphTrait};
use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::QueryNode;
use crate::new::logger::SearchLogger;
use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
use crate::new::resolve_query_graph::resolve_phrase;
use crate::new::{QueryGraph, QueryNode};
use crate::{Index, Result, RoaringBitmapCodec};
#[derive(Clone)]
pub enum TypoEdge {
Phrase,
Phrase { phrase: Phrase },
Word { derivations: WordDerivations, nbr_typos: u8 },
}
@ -23,7 +25,7 @@ impl RankingRuleGraphTrait for TypoGraph {
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
match edge {
TypoEdge::Phrase => format!(", 0 typos"),
TypoEdge::Phrase { .. } => ", 0 typos".to_owned(),
TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"),
}
}
@ -33,9 +35,9 @@ impl RankingRuleGraphTrait for TypoGraph {
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
edge: &Self::EdgeDetails,
) -> Result<roaring::RoaringBitmap> {
) -> Result<RoaringBitmap> {
match edge {
TypoEdge::Phrase => todo!(),
TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase),
TypoEdge::Word { derivations, nbr_typos } => {
let words = match nbr_typos {
0 => &derivations.zero_typo,
@ -68,21 +70,23 @@ impl RankingRuleGraphTrait for TypoGraph {
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
from_node: &QueryNode,
_from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>> {
Ok(Some(()))
}
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
_from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
match to_node {
QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
QueryTerm::Phrase(_) => Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase))]),
QueryTerm::Phrase { phrase } => {
Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))])
}
QueryTerm::Word { derivations } => {
let mut edges = vec![];
if !derivations.zero_typo.is_empty() || derivations.use_prefix_db {
@ -121,11 +125,14 @@ impl RankingRuleGraphTrait for TypoGraph {
}
fn log_state(
graph: &super::RankingRuleGraph<Self>,
paths: &PathsMap<u64>,
graph: &RankingRuleGraph<Self>,
paths: &[Vec<u32>],
empty_paths_cache: &EmptyPathsCache,
logger: &mut dyn crate::new::logger::SearchLogger<crate::new::QueryGraph>,
universe: &RoaringBitmap,
distances: &[Vec<u64>],
cost: u64,
logger: &mut dyn SearchLogger<QueryGraph>,
) {
logger.log_typo_state(graph, paths, empty_paths_cache);
logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost);
}
}

View File

@ -1,3 +1,5 @@
use std::time::Instant;
use heed::RoTxn;
use roaring::RoaringBitmap;
@ -9,7 +11,7 @@ use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
use crate::new::ranking_rule_graph::proximity::ProximityGraph;
use crate::new::ranking_rule_graph::typo::TypoGraph;
use crate::new::words::Words;
use crate::search::new::sort::Sort;
// use crate::search::new::sort::Sort;
use crate::{Filter, Index, Result, TermsMatchingStrategy};
pub trait RankingRuleOutputIter<'transaction, Query> {
@ -123,13 +125,14 @@ pub fn execute_search<'transaction>(
length: usize,
logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<Vec<u32>> {
logger.initial_query(query_graph, Instant::now());
let words = &mut Words::new(TermsMatchingStrategy::Last);
let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
// let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());
// TODO: ranking rules given as argument
let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
vec![words, typo, proximity, sort];
vec![words, typo, proximity /*sort*/];
logger.ranking_rules(&ranking_rules);
@ -144,7 +147,13 @@ pub fn execute_search<'transaction>(
}
let ranking_rules_len = ranking_rules.len();
logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, &universe);
logger.start_iteration_ranking_rule(
0,
ranking_rules[0],
query_graph,
&universe,
Instant::now(),
);
ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?;
let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
@ -154,11 +163,12 @@ pub fn execute_search<'transaction>(
macro_rules! back {
() => {
// assert!(candidates[cur_ranking_rule_index].is_empty());
assert!(candidates[cur_ranking_rule_index].is_empty());
logger.end_iteration_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&candidates[cur_ranking_rule_index],
Instant::now(),
);
candidates[cur_ranking_rule_index].clear();
ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger);
@ -187,6 +197,7 @@ pub fn execute_search<'transaction>(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&candidates,
Instant::now(),
);
} else {
let all_candidates = candidates.iter().collect::<Vec<_>>();
@ -196,6 +207,7 @@ pub fn execute_search<'transaction>(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&skipped_candidates.into_iter().collect(),
Instant::now(),
);
let candidates = candidates
.iter()
@ -219,24 +231,26 @@ pub fn execute_search<'transaction>(
// The universe for this bucket is zero or one element, so we don't need to sort
// anything, just extend the results and go back to the parent ranking rule.
if candidates[cur_ranking_rule_index].len() <= 1 {
candidates[cur_ranking_rule_index].clear();
maybe_add_to_results!(&candidates[cur_ranking_rule_index]);
candidates[cur_ranking_rule_index].clear();
back!();
continue;
}
logger.next_bucket_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&candidates[cur_ranking_rule_index],
);
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else {
// TODO: add remaining candidates automatically here?
back!();
continue;
};
logger.next_bucket_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&candidates[cur_ranking_rule_index],
&next_bucket.candidates,
Instant::now(),
);
assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates));
candidates[cur_ranking_rule_index] -= &next_bucket.candidates;
@ -255,6 +269,7 @@ pub fn execute_search<'transaction>(
ranking_rules[cur_ranking_rule_index],
&next_bucket.query,
&candidates[cur_ranking_rule_index],
Instant::now(),
);
ranking_rules[cur_ranking_rule_index].start_iteration(
index,
@ -271,17 +286,18 @@ pub fn execute_search<'transaction>(
#[cfg(test)]
mod tests {
use std::fs::File;
use std::io::{BufRead, BufReader, Cursor, Seek};
use std::time::Instant;
use heed::EnvOpenOptions;
use super::execute_search;
// use crate::allocator::ALLOC;
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use crate::new::db_cache::DatabaseCache;
use crate::new::logger::detailed::DetailedSearchLogger;
use big_s::S;
use heed::EnvOpenOptions;
use maplit::hashset;
use std::fs::File;
use std::io::{BufRead, BufReader, Cursor, Seek};
use std::time::Instant;
// use crate::new::logger::detailed::DetailedSearchLogger;
use crate::new::logger::{DefaultSearchLogger, SearchLogger};
use crate::new::make_query_graph;
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
@ -323,16 +339,119 @@ mod tests {
let mut db_cache = DatabaseCache::default();
let query_graph =
make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap();
println!("{}", query_graph.graphviz());
logger.initial_query(&query_graph);
make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
.unwrap();
logger.initial_query(&query_graph, Instant::now());
let results =
execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 20, &mut logger)
execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger)
.unwrap();
println!("{results:?}")
}
/// Runs the new search pipeline on the on-disk `data_wiki` index and prints the time
/// taken together with the internal and external ids of the first 20 results.
///
/// To inspect a run in detail, replace `DefaultSearchLogger` with a
/// `crate::new::logger::detailed::DetailedSearchLogger::new("log")` and call
/// `write_d2_description()` on it once the search has returned.
#[test]
fn search_wiki_new() {
    let index = {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
        Index::new(options, "data_wiki").unwrap()
    };
    let txn = index.read_txn().unwrap();

    println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());

    let primary_key_name = index.primary_key(&txn).unwrap().unwrap();
    let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key_name).unwrap();

    // The measured time covers the construction of the query graph and the search itself.
    let start = Instant::now();
    let mut db_cache = DatabaseCache::default();
    let query = "which a the releases from poison by the government";
    let query_graph = make_query_graph(&index, &txn, &mut db_cache, query).unwrap();
    let results = execute_search(
        &index,
        &txn,
        &mut db_cache,
        &query_graph,
        None,
        0,
        20,
        &mut DefaultSearchLogger,
    )
    .unwrap();
    let elapsed = start.elapsed();

    // Resolve each internal document id to the external id stored under the primary key.
    let documents = index.documents(&txn, results.iter().copied()).unwrap();
    let ids = documents
        .into_iter()
        .map(|entry| {
            let raw_id = entry.1.get(primary_key).unwrap();
            let json_id: serde_json::Value = serde_json::from_slice(raw_id).unwrap();
            json_id.as_str().unwrap().to_owned()
        })
        .collect::<Vec<_>>();

    println!("{}us: {results:?}", elapsed.as_micros());
    println!("external ids: {ids:?}");
}
/// Runs the legacy `Search` implementation on the on-disk `data_wiki` index and prints
/// the configured criteria, the time taken, and the internal and external ids of the
/// returned documents.
#[test]
fn search_wiki_old() {
    let index = {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
        Index::new(options, "data_wiki").unwrap()
    };
    let txn = index.read_txn().unwrap();

    let rr = index.criteria(&txn).unwrap();
    println!("{rr:?}");

    let primary_key_name = index.primary_key(&txn).unwrap().unwrap();
    let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key_name).unwrap();

    // The measured time covers the configuration of the search and its execution.
    let start = Instant::now();
    let mut search = Search::new(&txn, &index);
    search.query("releases from poison by the government");
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
    let found = search.execute().unwrap();
    let elapsed = start.elapsed();

    // Resolve each internal document id to the external id stored under the primary key.
    let documents = index.documents(&txn, found.documents_ids.iter().copied()).unwrap();
    let ids = documents
        .into_iter()
        .map(|entry| {
            let raw_id = entry.1.get(primary_key).unwrap();
            let json_id: serde_json::Value = serde_json::from_slice(raw_id).unwrap();
            json_id.as_str().unwrap().to_owned()
        })
        .collect::<Vec<_>>();

    println!("{}us: {:?}", elapsed.as_micros(), found.documents_ids);
    println!("external ids: {ids:?}");
}
#[test]
fn search_movies_new() {
let mut options = EnvOpenOptions::new();
@ -343,7 +462,7 @@ mod tests {
let primary_key = index.primary_key(&txn).unwrap().unwrap();
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
// loop {
let start = Instant::now();
let mut db_cache = DatabaseCache::default();
@ -352,7 +471,7 @@ mod tests {
make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
.unwrap();
let mut logger = DetailedSearchLogger::new("log");
let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
let results = execute_search(
&index,
@ -360,9 +479,10 @@ mod tests {
&mut db_cache,
&query_graph,
None,
5,
0,
20,
&mut logger, //&mut DefaultSearchLogger,
// &mut DefaultSearchLogger,
&mut logger,
)
.unwrap();
@ -384,6 +504,7 @@ mod tests {
println!("{}us: {results:?}", elapsed.as_micros());
println!("external ids: {ids:?}");
// }
}
#[test]
@ -392,19 +513,39 @@ mod tests {
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
let index = Index::new(options, "data_movies").unwrap();
let txn = index.read_txn().unwrap();
let rr = index.criteria(&txn).unwrap();
println!("{rr:?}");
let primary_key = index.primary_key(&txn).unwrap().unwrap();
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
let start = Instant::now();
let mut s = Search::new(&txn, &index);
s.query("b b b b b b b b b b");
s.query("releases from poison by the government");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
let docs = s.execute().unwrap();
let elapsed = start.elapsed();
let ids = index
.documents(&txn, docs.documents_ids.iter().copied())
.unwrap()
.into_iter()
.map(|x| {
let obkv = &x.1;
let id = obkv.get(primary_key).unwrap();
let id: serde_json::Value = serde_json::from_slice(id).unwrap();
id.as_str().unwrap().to_owned()
})
.collect::<Vec<_>>();
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
println!("external ids: {ids:?}");
}
#[test]
@ -420,10 +561,16 @@ mod tests {
builder.set_min_word_len_one_typo(5);
builder.set_min_word_len_two_typos(100);
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
builder.set_sortable_fields(hashset! { S("release_date") });
builder.set_criteria(vec![
Criterion::Words,
Criterion::Typo,
Criterion::Proximity,
Criterion::Asc("release_date".to_owned()),
]);
builder.execute(|_| (), || false).unwrap();
wtxn.commit().unwrap();
}
#[test]
@ -445,6 +592,7 @@ mod tests {
builder.set_searchable_fields(searchable_fields);
let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
builder.set_filterable_fields(filterable_fields);
builder.set_min_word_len_one_typo(5);
builder.set_min_word_len_two_typos(100);
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
@ -467,6 +615,48 @@ mod tests {
index.prepare_for_closing().wait();
}
#[test]
fn _index_wiki() {
    // Open the "data_wiki" index with a 100 GB map size and start a write transaction.
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(100 * 1024 * 1024 * 1024);
    let index = Index::new(env_options, "data_wiki").unwrap();
    let mut wtxn = index.write_txn().unwrap();

    // Settings pass: only the searchable fields and the ranking criteria are set.
    // No primary key, filterable fields or typo word-length thresholds are
    // configured by this test.
    let settings_config = IndexerConfig::default();
    let mut settings = Settings::new(&mut wtxn, &index, &settings_config);
    settings.set_searchable_fields(
        vec!["body", "title", "url"].into_iter().map(str::to_owned).collect(),
    );
    settings.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
    settings.execute(|_| (), || false).unwrap();

    // Indexing pass: document ids are autogenerated, every other option keeps its
    // default value.
    let documents_config = IndexerConfig::default();
    let indexer = IndexDocuments::new(
        &mut wtxn,
        &index,
        &documents_config,
        IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() },
        |_| (),
        || false,
    )
    .unwrap();

    // The dataset is read from a fixed path on the local machine.
    let wiki_articles = documents_from(
        "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
        "csv",
    );
    let (indexer, user_result) = indexer.add_documents(wiki_articles).unwrap();
    user_result.unwrap();
    indexer.execute().unwrap();

    // Commit the transaction, then wait for the index to finish closing.
    wtxn.commit().unwrap();
    index.prepare_for_closing().wait();
}
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
let reader = File::open(filename)