mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 11:47:28 +01:00
Add some documentation and use bitmaps instead of hashmaps when possible
This commit is contained in:
parent
132191360b
commit
66d0c63694
@ -1,7 +1,8 @@
|
|||||||
use std::collections::HashSet;
|
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
use std::{collections::HashSet, fmt};
|
||||||
|
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
db_cache::DatabaseCache,
|
db_cache::DatabaseCache,
|
||||||
@ -19,21 +20,31 @@ pub enum QueryNode {
|
|||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Edges {
|
pub struct Edges {
|
||||||
pub incoming: HashSet<usize>,
|
// TODO: use a tiny bitset instead
|
||||||
pub outgoing: HashSet<usize>,
|
// something like a simple Vec<u8> where most queries will see a vector of one element
|
||||||
|
pub predecessors: RoaringBitmap,
|
||||||
|
pub successors: RoaringBitmap,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub struct NodeIndex(pub u32);
|
||||||
|
impl fmt::Display for NodeIndex {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
fmt::Display::fmt(&self.0, f)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct QueryGraph {
|
pub struct QueryGraph {
|
||||||
pub root_node: usize,
|
pub root_node: NodeIndex,
|
||||||
pub end_node: usize,
|
pub end_node: NodeIndex,
|
||||||
pub nodes: Vec<QueryNode>,
|
pub nodes: Vec<QueryNode>,
|
||||||
pub edges: Vec<Edges>,
|
pub edges: Vec<Edges>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _assert_sizes() {
|
fn _assert_sizes() {
|
||||||
let _: [u8; 112] = [0; std::mem::size_of::<QueryNode>()];
|
let _: [u8; 112] = [0; std::mem::size_of::<QueryNode>()];
|
||||||
let _: [u8; 96] = [0; std::mem::size_of::<Edges>()];
|
let _: [u8; 48] = [0; std::mem::size_of::<Edges>()];
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for QueryGraph {
|
impl Default for QueryGraph {
|
||||||
@ -41,32 +52,32 @@ impl Default for QueryGraph {
|
|||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
let nodes = vec![QueryNode::Start, QueryNode::End];
|
let nodes = vec![QueryNode::Start, QueryNode::End];
|
||||||
let edges = vec![
|
let edges = vec![
|
||||||
Edges { incoming: HashSet::new(), outgoing: HashSet::new() },
|
Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() },
|
||||||
Edges { incoming: HashSet::new(), outgoing: HashSet::new() },
|
Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() },
|
||||||
];
|
];
|
||||||
|
|
||||||
Self { root_node: 0, end_node: 1, nodes, edges }
|
Self { root_node: NodeIndex(0), end_node: NodeIndex(1), nodes, edges }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl QueryGraph {
|
impl QueryGraph {
|
||||||
fn connect_to_node(&mut self, from_nodes: &[usize], end_node: usize) {
|
fn connect_to_node(&mut self, from_nodes: &[NodeIndex], to_node: NodeIndex) {
|
||||||
for &from_node in from_nodes {
|
for &from_node in from_nodes {
|
||||||
self.edges[from_node].outgoing.insert(end_node);
|
self.edges[from_node.0 as usize].successors.insert(to_node.0);
|
||||||
self.edges[end_node].incoming.insert(from_node);
|
self.edges[to_node.0 as usize].predecessors.insert(from_node.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn add_node(&mut self, from_nodes: &[usize], node: QueryNode) -> usize {
|
fn add_node(&mut self, from_nodes: &[NodeIndex], node: QueryNode) -> NodeIndex {
|
||||||
let new_node_idx = self.nodes.len();
|
let new_node_idx = self.nodes.len() as u32;
|
||||||
self.nodes.push(node);
|
self.nodes.push(node);
|
||||||
self.edges.push(Edges {
|
self.edges.push(Edges {
|
||||||
incoming: from_nodes.iter().copied().collect(),
|
predecessors: from_nodes.iter().map(|x| x.0).collect(),
|
||||||
outgoing: HashSet::new(),
|
successors: RoaringBitmap::new(),
|
||||||
});
|
});
|
||||||
for from_node in from_nodes {
|
for from_node in from_nodes {
|
||||||
self.edges[*from_node].outgoing.insert(new_node_idx);
|
self.edges[from_node.0 as usize].successors.insert(new_node_idx);
|
||||||
}
|
}
|
||||||
new_node_idx
|
NodeIndex(new_node_idx)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,7 +99,7 @@ impl QueryGraph {
|
|||||||
let word_set = index.words_fst(txn)?;
|
let word_set = index.words_fst(txn)?;
|
||||||
let mut graph = QueryGraph::default();
|
let mut graph = QueryGraph::default();
|
||||||
|
|
||||||
let (mut prev2, mut prev1, mut prev0): (Vec<usize>, Vec<usize>, Vec<usize>) =
|
let (mut prev2, mut prev1, mut prev0): (Vec<NodeIndex>, Vec<NodeIndex>, Vec<NodeIndex>) =
|
||||||
(vec![], vec![], vec![graph.root_node]);
|
(vec![], vec![], vec![graph.root_node]);
|
||||||
|
|
||||||
// TODO: add all the word derivations found in the fst
|
// TODO: add all the word derivations found in the fst
|
||||||
@ -162,38 +173,41 @@ impl QueryGraph {
|
|||||||
|
|
||||||
Ok(graph)
|
Ok(graph)
|
||||||
}
|
}
|
||||||
pub fn remove_nodes(&mut self, nodes: &[usize]) {
|
pub fn remove_nodes(&mut self, nodes: &[NodeIndex]) {
|
||||||
for &node in nodes {
|
for &node in nodes {
|
||||||
self.nodes[node] = QueryNode::Deleted;
|
self.nodes[node.0 as usize] = QueryNode::Deleted;
|
||||||
let edges = self.edges[node].clone();
|
let edges = self.edges[node.0 as usize].clone();
|
||||||
for &pred in edges.incoming.iter() {
|
for pred in edges.predecessors.iter() {
|
||||||
self.edges[pred].outgoing.remove(&node);
|
self.edges[pred as usize].successors.remove(node.0);
|
||||||
}
|
}
|
||||||
for succ in edges.outgoing {
|
for succ in edges.successors {
|
||||||
self.edges[succ].incoming.remove(&node);
|
self.edges[succ as usize].predecessors.remove(node.0);
|
||||||
}
|
}
|
||||||
self.edges[node] = Edges { incoming: HashSet::new(), outgoing: HashSet::new() };
|
self.edges[node.0 as usize] =
|
||||||
|
Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn remove_nodes_keep_edges(&mut self, nodes: &[usize]) {
|
pub fn remove_nodes_keep_edges(&mut self, nodes: &[NodeIndex]) {
|
||||||
for &node in nodes {
|
for &node in nodes {
|
||||||
self.nodes[node] = QueryNode::Deleted;
|
self.nodes[node.0 as usize] = QueryNode::Deleted;
|
||||||
let edges = self.edges[node].clone();
|
let edges = self.edges[node.0 as usize].clone();
|
||||||
for &pred in edges.incoming.iter() {
|
for pred in edges.predecessors.iter() {
|
||||||
self.edges[pred].outgoing.remove(&node);
|
self.edges[pred as usize].successors.remove(node.0);
|
||||||
self.edges[pred].outgoing.extend(edges.outgoing.iter());
|
self.edges[pred as usize].successors |= &edges.successors;
|
||||||
}
|
}
|
||||||
for succ in edges.outgoing {
|
for succ in edges.successors {
|
||||||
self.edges[succ].incoming.remove(&node);
|
self.edges[succ as usize].predecessors.remove(node.0);
|
||||||
self.edges[succ].incoming.extend(edges.incoming.iter());
|
self.edges[succ as usize].predecessors |= &edges.predecessors;
|
||||||
}
|
}
|
||||||
self.edges[node] = Edges { incoming: HashSet::new(), outgoing: HashSet::new() };
|
self.edges[node.0 as usize] =
|
||||||
|
Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn remove_words_at_position(&mut self, position: i8) {
|
pub fn remove_words_at_position(&mut self, position: i8) {
|
||||||
let mut nodes_to_remove_keeping_edges = vec![];
|
let mut nodes_to_remove_keeping_edges = vec![];
|
||||||
let mut nodes_to_remove = vec![];
|
let mut nodes_to_remove = vec![];
|
||||||
for (node_idx, node) in self.nodes.iter().enumerate() {
|
for (node_idx, node) in self.nodes.iter().enumerate() {
|
||||||
|
let node_idx = NodeIndex(node_idx as u32);
|
||||||
let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue };
|
let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue };
|
||||||
if positions.contains(&position) {
|
if positions.contains(&position) {
|
||||||
nodes_to_remove_keeping_edges.push(node_idx)
|
nodes_to_remove_keeping_edges.push(node_idx)
|
||||||
@ -213,11 +227,11 @@ impl QueryGraph {
|
|||||||
let mut nodes_to_remove = vec![];
|
let mut nodes_to_remove = vec![];
|
||||||
for (node_idx, node) in self.nodes.iter().enumerate() {
|
for (node_idx, node) in self.nodes.iter().enumerate() {
|
||||||
if (!matches!(node, QueryNode::End | QueryNode::Deleted)
|
if (!matches!(node, QueryNode::End | QueryNode::Deleted)
|
||||||
&& self.edges[node_idx].outgoing.is_empty())
|
&& self.edges[node_idx].successors.is_empty())
|
||||||
|| (!matches!(node, QueryNode::Start | QueryNode::Deleted)
|
|| (!matches!(node, QueryNode::Start | QueryNode::Deleted)
|
||||||
&& self.edges[node_idx].incoming.is_empty())
|
&& self.edges[node_idx].predecessors.is_empty())
|
||||||
{
|
{
|
||||||
nodes_to_remove.push(node_idx);
|
nodes_to_remove.push(NodeIndex(node_idx as u32));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if nodes_to_remove.is_empty() {
|
if nodes_to_remove.is_empty() {
|
||||||
@ -301,14 +315,14 @@ node [shape = "record"]
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],));
|
desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],));
|
||||||
if node == self.root_node {
|
if node == self.root_node.0 as usize {
|
||||||
desc.push_str("[color = blue]");
|
desc.push_str("[color = blue]");
|
||||||
} else if node == self.end_node {
|
} else if node == self.end_node.0 as usize {
|
||||||
desc.push_str("[color = red]");
|
desc.push_str("[color = red]");
|
||||||
}
|
}
|
||||||
desc.push_str(";\n");
|
desc.push_str(";\n");
|
||||||
|
|
||||||
for edge in self.edges[node].outgoing.iter() {
|
for edge in self.edges[node].successors.iter() {
|
||||||
desc.push_str(&format!("{node} -> {edge};\n"));
|
desc.push_str(&format!("{node} -> {edge};\n"));
|
||||||
}
|
}
|
||||||
// for edge in self.edges[node].incoming.iter() {
|
// for edge in self.edges[node].incoming.iter() {
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
use std::collections::{BTreeSet, HashMap, HashSet};
|
use std::collections::{BTreeSet, HashMap, HashSet};
|
||||||
|
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
|
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
use crate::new::db_cache::DatabaseCache;
|
use crate::new::db_cache::DatabaseCache;
|
||||||
use crate::new::QueryGraph;
|
use crate::new::{NodeIndex, QueryGraph};
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
|
|
||||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||||
@ -14,29 +15,38 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
db_cache: &mut DatabaseCache<'transaction>,
|
db_cache: &mut DatabaseCache<'transaction>,
|
||||||
query_graph: QueryGraph,
|
query_graph: QueryGraph,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
let mut ranking_rule_graph = Self { query_graph, all_edges: vec![], node_edges: vec![] };
|
let mut ranking_rule_graph =
|
||||||
|
Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] };
|
||||||
|
|
||||||
for (node_idx, node) in ranking_rule_graph.query_graph.nodes.iter().enumerate() {
|
for (node_idx, node) in ranking_rule_graph.query_graph.nodes.iter().enumerate() {
|
||||||
ranking_rule_graph.node_edges.push(BTreeSet::new());
|
ranking_rule_graph.node_edges.push(RoaringBitmap::new());
|
||||||
|
ranking_rule_graph.successors.push(RoaringBitmap::new());
|
||||||
let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap();
|
let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap();
|
||||||
|
let new_successors = ranking_rule_graph.successors.last_mut().unwrap();
|
||||||
|
|
||||||
let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue };
|
let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue };
|
||||||
|
|
||||||
for &successor_idx in ranking_rule_graph.query_graph.edges[node_idx].outgoing.iter() {
|
for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() {
|
||||||
let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx];
|
let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize];
|
||||||
let Some(edges) = G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)? else { continue };
|
let mut edges =
|
||||||
|
G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)?;
|
||||||
|
if edges.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
edges.sort_by_key(|e| e.0);
|
||||||
for (cost, details) in edges {
|
for (cost, details) in edges {
|
||||||
ranking_rule_graph.all_edges.push(Some(Edge {
|
ranking_rule_graph.all_edges.push(Some(Edge {
|
||||||
from_node: node_idx,
|
from_node: NodeIndex(node_idx as u32),
|
||||||
to_node: successor_idx,
|
to_node: NodeIndex(successor_idx),
|
||||||
cost,
|
cost,
|
||||||
details,
|
details,
|
||||||
}));
|
}));
|
||||||
new_edges.insert(ranking_rule_graph.all_edges.len() - 1);
|
new_edges.insert(ranking_rule_graph.all_edges.len() as u32 - 1);
|
||||||
|
new_successors.insert(successor_idx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ranking_rule_graph.simplify();
|
// ranking_rule_graph.simplify();
|
||||||
|
|
||||||
Ok(ranking_rule_graph)
|
Ok(ranking_rule_graph)
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
use std::collections::{BTreeMap, HashSet};
|
use std::collections::{BTreeMap, HashSet};
|
||||||
|
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::new::NodeIndex;
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, EdgeIndex, RankingRuleGraph,
|
empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, EdgeIndex, RankingRuleGraph,
|
||||||
@ -14,18 +17,11 @@ pub struct Path {
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct DijkstraState {
|
struct DijkstraState {
|
||||||
unvisited: HashSet<usize>, // should be a small bitset
|
unvisited: RoaringBitmap, // should be a small bitset?
|
||||||
distances: Vec<u64>, // or binary heap (f64, usize)
|
distances: Vec<u64>, // or binary heap, or btreemap? (f64, usize)
|
||||||
edges: Vec<EdgeIndex>,
|
edges: Vec<EdgeIndex>,
|
||||||
edge_costs: Vec<u8>,
|
edge_costs: Vec<u8>,
|
||||||
paths: Vec<Option<usize>>,
|
paths: Vec<Option<NodeIndex>>,
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
|
||||||
pub struct PathEdgeId<Id> {
|
|
||||||
pub from: usize,
|
|
||||||
pub to: usize,
|
|
||||||
pub id: Id,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct KCheapestPathsState {
|
pub struct KCheapestPathsState {
|
||||||
@ -127,9 +123,10 @@ impl KCheapestPathsState {
|
|||||||
// for all the paths already found that share a common prefix with the root path
|
// for all the paths already found that share a common prefix with the root path
|
||||||
// we delete the edge from the spur node to the next one
|
// we delete the edge from the spur node to the next one
|
||||||
for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) {
|
for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) {
|
||||||
let was_removed = graph.node_edges[*spur_node].remove(&edge_index_to_remove.0);
|
let was_removed =
|
||||||
|
graph.node_edges[spur_node.0 as usize].remove(edge_index_to_remove.0 as u32);
|
||||||
if was_removed {
|
if was_removed {
|
||||||
tmp_removed_edges.push(edge_index_to_remove.0);
|
tmp_removed_edges.push(edge_index_to_remove.0 as u32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,7 +134,7 @@ impl KCheapestPathsState {
|
|||||||
// we will combine it with the root path to get a potential kth cheapest path
|
// we will combine it with the root path to get a potential kth cheapest path
|
||||||
let spur_path = graph.cheapest_path_to_end(*spur_node);
|
let spur_path = graph.cheapest_path_to_end(*spur_node);
|
||||||
// restore the temporarily removed edges
|
// restore the temporarily removed edges
|
||||||
graph.node_edges[*spur_node].extend(tmp_removed_edges);
|
graph.node_edges[spur_node.0 as usize].extend(tmp_removed_edges);
|
||||||
|
|
||||||
let Some(spur_path) = spur_path else { continue; };
|
let Some(spur_path) = spur_path else { continue; };
|
||||||
let total_cost = root_cost + spur_path.cost;
|
let total_cost = root_cost + spur_path.cost;
|
||||||
@ -182,68 +179,73 @@ impl KCheapestPathsState {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||||
fn cheapest_path_to_end(&self, from: usize) -> Option<Path> {
|
fn cheapest_path_to_end(&self, from: NodeIndex) -> Option<Path> {
|
||||||
let mut dijkstra = DijkstraState {
|
let mut dijkstra = DijkstraState {
|
||||||
unvisited: (0..self.query_graph.nodes.len()).collect(),
|
unvisited: (0..self.query_graph.nodes.len() as u32).collect(),
|
||||||
distances: vec![u64::MAX; self.query_graph.nodes.len()],
|
distances: vec![u64::MAX; self.query_graph.nodes.len()],
|
||||||
edges: vec![EdgeIndex(usize::MAX); self.query_graph.nodes.len()],
|
edges: vec![EdgeIndex(usize::MAX); self.query_graph.nodes.len()],
|
||||||
edge_costs: vec![u8::MAX; self.query_graph.nodes.len()],
|
edge_costs: vec![u8::MAX; self.query_graph.nodes.len()],
|
||||||
paths: vec![None; self.query_graph.nodes.len()],
|
paths: vec![None; self.query_graph.nodes.len()],
|
||||||
};
|
};
|
||||||
dijkstra.distances[from] = 0;
|
dijkstra.distances[from.0 as usize] = 0;
|
||||||
|
|
||||||
// TODO: could use a binary heap here to store the distances
|
// TODO: could use a binary heap here to store the distances, or a btreemap
|
||||||
while let Some(&cur_node) =
|
while let Some(cur_node) =
|
||||||
dijkstra.unvisited.iter().min_by_key(|&&n| dijkstra.distances[n])
|
dijkstra.unvisited.iter().min_by_key(|&n| dijkstra.distances[n as usize])
|
||||||
{
|
{
|
||||||
let cur_node_dist = dijkstra.distances[cur_node];
|
let cur_node_dist = dijkstra.distances[cur_node as usize];
|
||||||
if cur_node_dist == u64::MAX {
|
if cur_node_dist == u64::MAX {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
if cur_node == self.query_graph.end_node {
|
if cur_node == self.query_graph.end_node.0 {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
let succ_cur_node: HashSet<_> = self.node_edges[cur_node]
|
// this is expensive, but shouldn't be
|
||||||
.iter()
|
// ideally I could quickly get a bitmap of all a node's successors
|
||||||
.map(|e| self.all_edges[*e].as_ref().unwrap().to_node)
|
// then take the intersection with unvisited
|
||||||
.collect();
|
let succ_cur_node: &RoaringBitmap = &self.successors[cur_node as usize];
|
||||||
|
// .iter()
|
||||||
|
// .map(|e| self.all_edges[e as usize].as_ref().unwrap().to_node.0)
|
||||||
|
// .collect();
|
||||||
// TODO: this intersection may be slow but shouldn't be,
|
// TODO: this intersection may be slow but shouldn't be,
|
||||||
// can use a bitmap intersection instead
|
// can use a bitmap intersection instead
|
||||||
let unvisited_succ_cur_node = succ_cur_node.intersection(&dijkstra.unvisited);
|
let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited;
|
||||||
for &succ in unvisited_succ_cur_node {
|
for succ in unvisited_succ_cur_node {
|
||||||
let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else {
|
// cheapest_edge() is also potentially too expensive
|
||||||
|
let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(NodeIndex(cur_node), NodeIndex(succ)) else {
|
||||||
continue
|
continue
|
||||||
};
|
};
|
||||||
|
|
||||||
// println!("cur node dist {cur_node_dist}");
|
// println!("cur node dist {cur_node_dist}");
|
||||||
let old_dist_succ = &mut dijkstra.distances[succ];
|
let old_dist_succ = &mut dijkstra.distances[succ as usize];
|
||||||
let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64;
|
let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64;
|
||||||
if new_potential_distance < *old_dist_succ {
|
if new_potential_distance < *old_dist_succ {
|
||||||
*old_dist_succ = new_potential_distance;
|
*old_dist_succ = new_potential_distance;
|
||||||
dijkstra.edges[succ] = cheapest_edge;
|
dijkstra.edges[succ as usize] = cheapest_edge;
|
||||||
dijkstra.edge_costs[succ] = cheapest_edge_cost;
|
dijkstra.edge_costs[succ as usize] = cheapest_edge_cost;
|
||||||
dijkstra.paths[succ] = Some(cur_node);
|
dijkstra.paths[succ as usize] = Some(NodeIndex(cur_node));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dijkstra.unvisited.remove(&cur_node);
|
dijkstra.unvisited.remove(cur_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut cur = self.query_graph.end_node;
|
let mut cur = self.query_graph.end_node;
|
||||||
// let mut edge_costs = vec![];
|
// let mut edge_costs = vec![];
|
||||||
// let mut distances = vec![];
|
// let mut distances = vec![];
|
||||||
let mut path_edges = vec![];
|
let mut path_edges = vec![];
|
||||||
while let Some(n) = dijkstra.paths[cur] {
|
while let Some(n) = dijkstra.paths[cur.0 as usize] {
|
||||||
path_edges.push(dijkstra.edges[cur]);
|
path_edges.push(dijkstra.edges[cur.0 as usize]);
|
||||||
cur = n;
|
cur = n;
|
||||||
}
|
}
|
||||||
path_edges.reverse();
|
path_edges.reverse();
|
||||||
Some(Path { edges: path_edges, cost: dijkstra.distances[self.query_graph.end_node] })
|
Some(Path {
|
||||||
|
edges: path_edges,
|
||||||
|
cost: dijkstra.distances[self.query_graph.end_node.0 as usize],
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: this implementation is VERY fragile, as we assume that the edges are ordered by cost
|
pub fn cheapest_edge(&self, cur_node: NodeIndex, succ: NodeIndex) -> Option<(EdgeIndex, u8)> {
|
||||||
// already. Change it.
|
|
||||||
pub fn cheapest_edge(&self, cur_node: usize, succ: usize) -> Option<(EdgeIndex, u8)> {
|
|
||||||
self.visit_edges(cur_node, succ, |edge_idx, edge| {
|
self.visit_edges(cur_node, succ, |edge_idx, edge| {
|
||||||
std::ops::ControlFlow::Break((edge_idx, edge.cost))
|
std::ops::ControlFlow::Break((edge_idx, edge.cost))
|
||||||
})
|
})
|
||||||
|
@ -9,6 +9,12 @@ use crate::new::db_cache::DatabaseCache;
|
|||||||
use crate::new::BitmapOrAllRef;
|
use crate::new::BitmapOrAllRef;
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
|
|
||||||
|
// TODO: the cache should have a G::EdgeDetails as key
|
||||||
|
// but then it means that we should have a quick way of
|
||||||
|
// computing their hash and comparing them
|
||||||
|
// which can be done...
|
||||||
|
// by using a pointer (real, Rc, bumpalo, or in a vector)???
|
||||||
|
|
||||||
pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
|
pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
|
||||||
pub cache: HashMap<EdgeIndex, RoaringBitmap>,
|
pub cache: HashMap<EdgeIndex, RoaringBitmap>,
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ use heed::RoTxn;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::db_cache::DatabaseCache;
|
use super::db_cache::DatabaseCache;
|
||||||
use super::{QueryGraph, QueryNode};
|
use super::{NodeIndex, QueryGraph, QueryNode};
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
@ -24,8 +24,8 @@ pub enum EdgeDetails<E> {
|
|||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Edge<E> {
|
pub struct Edge<E> {
|
||||||
from_node: usize,
|
from_node: NodeIndex,
|
||||||
to_node: usize,
|
to_node: NodeIndex,
|
||||||
cost: u8,
|
cost: u8,
|
||||||
details: EdgeDetails<E>,
|
details: EdgeDetails<E>,
|
||||||
}
|
}
|
||||||
@ -38,22 +38,20 @@ pub struct EdgePointer<'graph, E> {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
pub struct EdgeIndex(pub usize);
|
pub struct EdgeIndex(pub usize);
|
||||||
// {
|
|
||||||
// // TODO: they could all be u16 instead
|
|
||||||
// // There may be a way to store all the edge indices in a u32 as well,
|
|
||||||
// // if the edges are in a vector
|
|
||||||
// // then we can store sets of edges in a bitmap efficiently
|
|
||||||
// pub from: usize,
|
|
||||||
// pub to: usize,
|
|
||||||
// pub edge_idx: usize,
|
|
||||||
// }
|
|
||||||
|
|
||||||
pub trait RankingRuleGraphTrait {
|
pub trait RankingRuleGraphTrait {
|
||||||
|
/// The details of an edge connecting two query nodes. These details
|
||||||
|
/// should be sufficient to compute the edge's cost and associated document ids
|
||||||
|
/// in [`compute_docids`](RankingRuleGraphTrait::compute_docids).
|
||||||
type EdgeDetails: Sized;
|
type EdgeDetails: Sized;
|
||||||
|
|
||||||
type BuildVisitedFromNode;
|
type BuildVisitedFromNode;
|
||||||
|
|
||||||
fn edge_details_dot_label(edge: &Self::EdgeDetails) -> String;
|
/// Return the label of the given edge details, to be used when visualising
|
||||||
|
/// the ranking rule graph using GraphViz.
|
||||||
|
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String;
|
||||||
|
|
||||||
|
/// Compute the document ids associated with the given edge.
|
||||||
fn compute_docids<'transaction>(
|
fn compute_docids<'transaction>(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
txn: &'transaction RoTxn,
|
txn: &'transaction RoTxn,
|
||||||
@ -61,6 +59,10 @@ pub trait RankingRuleGraphTrait {
|
|||||||
edge_details: &Self::EdgeDetails,
|
edge_details: &Self::EdgeDetails,
|
||||||
) -> Result<RoaringBitmap>;
|
) -> Result<RoaringBitmap>;
|
||||||
|
|
||||||
|
/// Prepare to build the edges outgoing from `from_node`.
|
||||||
|
///
|
||||||
|
/// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node),
|
||||||
|
/// which builds the actual edges.
|
||||||
fn build_visit_from_node<'transaction>(
|
fn build_visit_from_node<'transaction>(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
txn: &'transaction RoTxn,
|
txn: &'transaction RoTxn,
|
||||||
@ -68,39 +70,59 @@ pub trait RankingRuleGraphTrait {
|
|||||||
from_node: &QueryNode,
|
from_node: &QueryNode,
|
||||||
) -> Result<Option<Self::BuildVisitedFromNode>>;
|
) -> Result<Option<Self::BuildVisitedFromNode>>;
|
||||||
|
|
||||||
|
/// Return the cost and details of the edges going from the previously visited node
|
||||||
|
/// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`.
|
||||||
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
|
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
txn: &'transaction RoTxn,
|
txn: &'transaction RoTxn,
|
||||||
db_cache: &mut DatabaseCache<'transaction>,
|
db_cache: &mut DatabaseCache<'transaction>,
|
||||||
to_node: &QueryNode,
|
to_node: &QueryNode,
|
||||||
from_node_data: &'from_data Self::BuildVisitedFromNode,
|
from_node_data: &'from_data Self::BuildVisitedFromNode,
|
||||||
) -> Result<Option<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>>;
|
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
|
pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
|
||||||
pub query_graph: QueryGraph,
|
pub query_graph: QueryGraph,
|
||||||
// pub edges: Vec<HashMap<usize, Vec<Edge<G::EdgeDetails>>>>,
|
// pub edges: Vec<HashMap<usize, Vec<Edge<G::EdgeDetails>>>>,
|
||||||
pub all_edges: Vec<Option<Edge<G::EdgeDetails>>>,
|
pub all_edges: Vec<Option<Edge<G::EdgeDetails>>>,
|
||||||
pub node_edges: Vec<BTreeSet<usize>>,
|
|
||||||
|
pub node_edges: Vec<RoaringBitmap>,
|
||||||
|
|
||||||
|
pub successors: Vec<RoaringBitmap>,
|
||||||
|
// to get the edges between two nodes:
|
||||||
|
// 1. get node_outgoing_edges[from]
|
||||||
|
// 2. get node_incoming_edges[to]
|
||||||
|
// 3. take intersection between the two
|
||||||
|
|
||||||
|
// TODO: node edges could be different I guess
|
||||||
|
// something like:
|
||||||
|
// pub node_edges: Vec<BitSet>
|
||||||
|
// where each index is the result of:
|
||||||
|
// the successor index in the top 16 bits, the edge index in the bottom 16 bits
|
||||||
|
|
||||||
|
// TODO:
|
||||||
|
// node_successors?
|
||||||
|
|
||||||
// pub removed_edges: HashSet<EdgeIndex>,
|
// pub removed_edges: HashSet<EdgeIndex>,
|
||||||
// pub tmp_removed_edges: HashSet<EdgeIndex>,
|
// pub tmp_removed_edges: HashSet<EdgeIndex>,
|
||||||
}
|
}
|
||||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||||
// NOTE: returns the edge even if it was removed
|
|
||||||
pub fn get_edge(&self, edge_index: EdgeIndex) -> &Option<Edge<G::EdgeDetails>> {
|
pub fn get_edge(&self, edge_index: EdgeIndex) -> &Option<Edge<G::EdgeDetails>> {
|
||||||
&self.all_edges[edge_index.0]
|
&self.all_edges[edge_index.0]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Visit all edges between the two given nodes in order of increasing cost.
|
||||||
pub fn visit_edges<'graph, O>(
|
pub fn visit_edges<'graph, O>(
|
||||||
&'graph self,
|
&'graph self,
|
||||||
from: usize,
|
from: NodeIndex,
|
||||||
to: usize,
|
to: NodeIndex,
|
||||||
mut visit: impl FnMut(EdgeIndex, &'graph Edge<G::EdgeDetails>) -> ControlFlow<O>,
|
mut visit: impl FnMut(EdgeIndex, &'graph Edge<G::EdgeDetails>) -> ControlFlow<O>,
|
||||||
) -> Option<O> {
|
) -> Option<O> {
|
||||||
let from_edges = &self.node_edges[from];
|
let from_edges = &self.node_edges[from.0 as usize];
|
||||||
for &edge_idx in from_edges {
|
for edge_idx in from_edges {
|
||||||
let edge = self.all_edges[edge_idx].as_ref().unwrap();
|
let edge = self.all_edges[edge_idx as usize].as_ref().unwrap();
|
||||||
if edge.to_node == to {
|
if edge.to_node == to {
|
||||||
let cf = visit(EdgeIndex(edge_idx), edge);
|
let cf = visit(EdgeIndex(edge_idx as usize), edge);
|
||||||
match cf {
|
match cf {
|
||||||
ControlFlow::Continue(_) => continue,
|
ControlFlow::Continue(_) => continue,
|
||||||
ControlFlow::Break(o) => return Some(o),
|
ControlFlow::Break(o) => return Some(o),
|
||||||
@ -113,54 +135,61 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
|
|
||||||
fn remove_edge(&mut self, edge_index: EdgeIndex) {
|
fn remove_edge(&mut self, edge_index: EdgeIndex) {
|
||||||
let edge_opt = &mut self.all_edges[edge_index.0];
|
let edge_opt = &mut self.all_edges[edge_index.0];
|
||||||
let Some(Edge { from_node, to_node, cost, details }) = &edge_opt else { return };
|
let Some(edge) = &edge_opt else { return };
|
||||||
|
let (from_node, to_node) = (edge.from_node, edge.to_node);
|
||||||
let node_edges = &mut self.node_edges[*from_node];
|
|
||||||
node_edges.remove(&edge_index.0);
|
|
||||||
|
|
||||||
*edge_opt = None;
|
*edge_opt = None;
|
||||||
}
|
|
||||||
pub fn remove_nodes(&mut self, nodes: &[usize]) {
|
|
||||||
for &node in nodes {
|
|
||||||
let edge_indices = &mut self.node_edges[node];
|
|
||||||
for edge_index in edge_indices.iter() {
|
|
||||||
self.all_edges[*edge_index] = None;
|
|
||||||
}
|
|
||||||
edge_indices.clear();
|
|
||||||
|
|
||||||
let preds = &self.query_graph.edges[node].incoming;
|
let from_node_edges = &mut self.node_edges[from_node.0 as usize];
|
||||||
for pred in preds {
|
from_node_edges.remove(edge_index.0 as u32);
|
||||||
let edge_indices = &mut self.node_edges[*pred];
|
|
||||||
for edge_index in edge_indices.iter() {
|
let mut new_successors_from_node = RoaringBitmap::new();
|
||||||
let edge_opt = &mut self.all_edges[*edge_index];
|
for edge in from_node_edges.iter() {
|
||||||
let Some(edge) = edge_opt else { continue; };
|
let Edge { to_node, .. } = &self.all_edges[edge as usize].as_ref().unwrap();
|
||||||
if edge.to_node == node {
|
new_successors_from_node.insert(to_node.0);
|
||||||
*edge_opt = None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
panic!("remove nodes is incorrect at the moment");
|
|
||||||
edge_indices.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.query_graph.remove_nodes(nodes);
|
|
||||||
}
|
|
||||||
pub fn simplify(&mut self) {
|
|
||||||
loop {
|
|
||||||
let mut nodes_to_remove = vec![];
|
|
||||||
for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
|
|
||||||
if !matches!(node, QueryNode::End | QueryNode::Deleted)
|
|
||||||
&& self.node_edges[node_idx].is_empty()
|
|
||||||
{
|
|
||||||
nodes_to_remove.push(node_idx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if nodes_to_remove.is_empty() {
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
self.remove_nodes(&nodes_to_remove);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
self.successors[from_node.0 as usize] = new_successors_from_node;
|
||||||
}
|
}
|
||||||
|
// pub fn remove_nodes(&mut self, nodes: &[usize]) {
|
||||||
|
// for &node in nodes {
|
||||||
|
// let edge_indices = &mut self.node_edges[node];
|
||||||
|
// for edge_index in edge_indices.iter() {
|
||||||
|
// self.all_edges[*edge_index] = None;
|
||||||
|
// }
|
||||||
|
// edge_indices.clear();
|
||||||
|
|
||||||
|
// let preds = &self.query_graph.edges[node].incoming;
|
||||||
|
// for pred in preds {
|
||||||
|
// let edge_indices = &mut self.node_edges[*pred];
|
||||||
|
// for edge_index in edge_indices.iter() {
|
||||||
|
// let edge_opt = &mut self.all_edges[*edge_index];
|
||||||
|
// let Some(edge) = edge_opt else { continue; };
|
||||||
|
// if edge.to_node == node {
|
||||||
|
// *edge_opt = None;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// panic!("remove nodes is incorrect at the moment");
|
||||||
|
// edge_indices.clear();
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// self.query_graph.remove_nodes(nodes);
|
||||||
|
// }
|
||||||
|
// pub fn simplify(&mut self) {
|
||||||
|
// loop {
|
||||||
|
// let mut nodes_to_remove = vec![];
|
||||||
|
// for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
|
||||||
|
// if !matches!(node, QueryNode::End | QueryNode::Deleted)
|
||||||
|
// && self.node_edges[node_idx].is_empty()
|
||||||
|
// {
|
||||||
|
// nodes_to_remove.push(node_idx);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// if nodes_to_remove.is_empty() {
|
||||||
|
// break;
|
||||||
|
// } else {
|
||||||
|
// self.remove_nodes(&nodes_to_remove);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
// fn is_removed_edge(&self, edge: EdgeIndex) -> bool {
|
// fn is_removed_edge(&self, edge: EdgeIndex) -> bool {
|
||||||
// self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge)
|
// self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge)
|
||||||
// }
|
// }
|
||||||
@ -174,9 +203,9 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
|
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
|
||||||
if node_idx == self.query_graph.root_node {
|
if node_idx == self.query_graph.root_node.0 as usize {
|
||||||
desc.push_str("[color = blue]");
|
desc.push_str("[color = blue]");
|
||||||
} else if node_idx == self.query_graph.end_node {
|
} else if node_idx == self.query_graph.end_node.0 as usize {
|
||||||
desc.push_str("[color = red]");
|
desc.push_str("[color = red]");
|
||||||
}
|
}
|
||||||
desc.push_str(";\n");
|
desc.push_str(";\n");
|
||||||
@ -195,7 +224,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
desc.push_str(&format!(
|
desc.push_str(&format!(
|
||||||
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n",
|
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n",
|
||||||
cost = edge.cost,
|
cost = edge.cost,
|
||||||
edge_label = G::edge_details_dot_label(details)
|
edge_label = G::graphviz_edge_details_label(details)
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -235,9 +235,9 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
|
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
|
||||||
if node_idx == self.query_graph.root_node {
|
if node_idx == self.query_graph.root_node.0 as usize {
|
||||||
desc.push_str("[color = blue]");
|
desc.push_str("[color = blue]");
|
||||||
} else if node_idx == self.query_graph.end_node {
|
} else if node_idx == self.query_graph.end_node.0 as usize {
|
||||||
desc.push_str("[color = red]");
|
desc.push_str("[color = red]");
|
||||||
}
|
}
|
||||||
desc.push_str(";\n");
|
desc.push_str(";\n");
|
||||||
@ -262,7 +262,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
desc.push_str(&format!(
|
desc.push_str(&format!(
|
||||||
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n",
|
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n",
|
||||||
cost = edge.cost,
|
cost = edge.cost,
|
||||||
edge_label = G::edge_details_dot_label(details),
|
edge_label = G::graphviz_edge_details_label(details),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -51,11 +51,11 @@ pub fn visit_to_node<'transaction, 'from_data>(
|
|||||||
db_cache: &mut DatabaseCache<'transaction>,
|
db_cache: &mut DatabaseCache<'transaction>,
|
||||||
to_node: &QueryNode,
|
to_node: &QueryNode,
|
||||||
from_node_data: &'from_data (WordDerivations, i8),
|
from_node_data: &'from_data (WordDerivations, i8),
|
||||||
) -> Result<Option<Vec<(u8, EdgeDetails<ProximityEdge>)>>> {
|
) -> Result<Vec<(u8, EdgeDetails<ProximityEdge>)>> {
|
||||||
let (derivations1, pos1) = from_node_data;
|
let (derivations1, pos1) = from_node_data;
|
||||||
let term2 = match &to_node {
|
let term2 = match &to_node {
|
||||||
QueryNode::End => return Ok(Some(vec![(0, EdgeDetails::Unconditional)])),
|
QueryNode::End => return Ok(vec![(0, EdgeDetails::Unconditional)]),
|
||||||
QueryNode::Deleted | QueryNode::Start => return Ok(None),
|
QueryNode::Deleted | QueryNode::Start => return Ok(vec![]),
|
||||||
QueryNode::Term(term) => term,
|
QueryNode::Term(term) => term,
|
||||||
};
|
};
|
||||||
let LocatedQueryTerm { value: value2, positions: pos2 } = term2;
|
let LocatedQueryTerm { value: value2, positions: pos2 } = term2;
|
||||||
@ -86,7 +86,7 @@ pub fn visit_to_node<'transaction, 'from_data>(
|
|||||||
// We want to effectively ignore this pair of terms
|
// We want to effectively ignore this pair of terms
|
||||||
// Unconditionally walk through the edge without computing the docids
|
// Unconditionally walk through the edge without computing the docids
|
||||||
// But also what should the cost be?
|
// But also what should the cost be?
|
||||||
return Ok(Some(vec![(0, EdgeDetails::Unconditional)]));
|
return Ok(vec![(0, EdgeDetails::Unconditional)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
let updb1 = derivations1.use_prefix_db;
|
let updb1 = derivations1.use_prefix_db;
|
||||||
@ -161,5 +161,5 @@ pub fn visit_to_node<'transaction, 'from_data>(
|
|||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional));
|
new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional));
|
||||||
Ok(Some(new_edges))
|
Ok(new_edges)
|
||||||
}
|
}
|
||||||
|
@ -26,7 +26,7 @@ impl RankingRuleGraphTrait for ProximityGraph {
|
|||||||
type EdgeDetails = ProximityEdge;
|
type EdgeDetails = ProximityEdge;
|
||||||
type BuildVisitedFromNode = (WordDerivations, i8);
|
type BuildVisitedFromNode = (WordDerivations, i8);
|
||||||
|
|
||||||
fn edge_details_dot_label(edge: &Self::EdgeDetails) -> String {
|
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
|
||||||
let ProximityEdge { pairs, proximity } = edge;
|
let ProximityEdge { pairs, proximity } = edge;
|
||||||
format!(", prox {proximity}, {} pairs", pairs.len())
|
format!(", prox {proximity}, {} pairs", pairs.len())
|
||||||
}
|
}
|
||||||
@ -55,7 +55,7 @@ impl RankingRuleGraphTrait for ProximityGraph {
|
|||||||
db_cache: &mut DatabaseCache<'transaction>,
|
db_cache: &mut DatabaseCache<'transaction>,
|
||||||
to_node: &QueryNode,
|
to_node: &QueryNode,
|
||||||
from_node_data: &'from_data Self::BuildVisitedFromNode,
|
from_node_data: &'from_data Self::BuildVisitedFromNode,
|
||||||
) -> Result<Option<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>> {
|
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
|
||||||
build::visit_to_node(index, txn, db_cache, to_node, from_node_data)
|
build::visit_to_node(index, txn, db_cache, to_node, from_node_data)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -36,15 +36,17 @@ impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query>
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub trait RankingRuleQueryTrait: Sized + Clone + 'static {}
|
pub trait RankingRuleQueryTrait: Sized + Clone + 'static {}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct PlaceholderQuery;
|
pub struct PlaceholderQuery;
|
||||||
impl RankingRuleQueryTrait for PlaceholderQuery {}
|
impl RankingRuleQueryTrait for PlaceholderQuery {}
|
||||||
impl RankingRuleQueryTrait for QueryGraph {}
|
impl RankingRuleQueryTrait for QueryGraph {}
|
||||||
|
|
||||||
pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
|
pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
|
||||||
// TODO: add an update_candidates function to deal with distinct
|
/// Prepare the ranking rule such that it can start iterating over its
|
||||||
// attributes?
|
/// buckets using [`next_bucket`](RankingRule::next_bucket).
|
||||||
|
///
|
||||||
|
/// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket).
|
||||||
fn start_iteration(
|
fn start_iteration(
|
||||||
&mut self,
|
&mut self,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
@ -54,6 +56,13 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
|
|||||||
query: &Query,
|
query: &Query,
|
||||||
) -> Result<()>;
|
) -> Result<()>;
|
||||||
|
|
||||||
|
/// Return the next bucket of this ranking rule.
|
||||||
|
///
|
||||||
|
/// The returned candidates MUST be a subset of the given universe.
|
||||||
|
///
|
||||||
|
/// The universe given as argument is either:
|
||||||
|
/// - a subset of the universe given to the previous call to [`next_bucket`](RankingRule::next_bucket); OR
|
||||||
|
/// - the universe given to [`start_iteration`](RankingRule::start_iteration)
|
||||||
fn next_bucket(
|
fn next_bucket(
|
||||||
&mut self,
|
&mut self,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
@ -62,6 +71,8 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
|
|||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
) -> Result<Option<RankingRuleOutput<Query>>>;
|
) -> Result<Option<RankingRuleOutput<Query>>>;
|
||||||
|
|
||||||
|
/// Finish iterating over the buckets, which yields control to the parent ranking rule
|
||||||
|
/// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration).
|
||||||
fn end_iteration(
|
fn end_iteration(
|
||||||
&mut self,
|
&mut self,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
@ -72,7 +83,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
|
|||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct RankingRuleOutput<Q> {
|
pub struct RankingRuleOutput<Q> {
|
||||||
/// The query tree that must be used by the child ranking rule to fetch candidates.
|
/// The query corresponding to the current bucket for the child ranking rule
|
||||||
pub query: Q,
|
pub query: Q,
|
||||||
/// The allowed candidates for the child ranking rule
|
/// The allowed candidates for the child ranking rule
|
||||||
pub candidates: RoaringBitmap,
|
pub candidates: RoaringBitmap,
|
||||||
@ -151,7 +162,6 @@ pub fn execute_search<'transaction>(
|
|||||||
let ranking_rules_len = ranking_rules.len();
|
let ranking_rules_len = ranking_rules.len();
|
||||||
ranking_rules[0].start_iteration(index, txn, db_cache, universe, query_graph)?;
|
ranking_rules[0].start_iteration(index, txn, db_cache, universe, query_graph)?;
|
||||||
|
|
||||||
// TODO: parent_candidates could be used only during debugging?
|
|
||||||
let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
|
let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
|
||||||
candidates[0] = universe.clone();
|
candidates[0] = universe.clone();
|
||||||
|
|
||||||
@ -296,43 +306,43 @@ mod tests {
|
|||||||
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
||||||
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
||||||
|
|
||||||
// loop {
|
loop {
|
||||||
// let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
// let mut db_cache = DatabaseCache::default();
|
let mut db_cache = DatabaseCache::default();
|
||||||
|
|
||||||
// let query_graph = make_query_graph(
|
let query_graph = make_query_graph(
|
||||||
// &index,
|
&index,
|
||||||
// &txn,
|
&txn,
|
||||||
// &mut db_cache,
|
&mut db_cache,
|
||||||
// "released from prison by the government",
|
"released from prison by the government",
|
||||||
// )
|
)
|
||||||
// .unwrap();
|
.unwrap();
|
||||||
// // println!("{}", query_graph.graphviz());
|
// println!("{}", query_graph.graphviz());
|
||||||
|
|
||||||
// // TODO: filters + maybe distinct attributes?
|
// TODO: filters + maybe distinct attributes?
|
||||||
// let universe = get_start_universe(
|
let universe = get_start_universe(
|
||||||
// &index,
|
&index,
|
||||||
// &txn,
|
&txn,
|
||||||
// &mut db_cache,
|
&mut db_cache,
|
||||||
// &query_graph,
|
&query_graph,
|
||||||
// TermsMatchingStrategy::Last,
|
TermsMatchingStrategy::Last,
|
||||||
// )
|
)
|
||||||
// .unwrap();
|
.unwrap();
|
||||||
// // println!("universe: {universe:?}");
|
// println!("universe: {universe:?}");
|
||||||
|
|
||||||
// let results = execute_search(
|
let results = execute_search(
|
||||||
// &index,
|
&index,
|
||||||
// &txn,
|
&txn,
|
||||||
// &mut db_cache,
|
&mut db_cache,
|
||||||
// &universe,
|
&universe,
|
||||||
// &query_graph, /* 0, 20 */
|
&query_graph, /* 0, 20 */
|
||||||
// )
|
)
|
||||||
// .unwrap();
|
.unwrap();
|
||||||
|
|
||||||
// let elapsed = start.elapsed();
|
let elapsed = start.elapsed();
|
||||||
// println!("{}us: {results:?}", elapsed.as_micros());
|
println!("{}us: {results:?}", elapsed.as_micros());
|
||||||
// }
|
}
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
let mut db_cache = DatabaseCache::default();
|
let mut db_cache = DatabaseCache::default();
|
||||||
@ -388,7 +398,7 @@ mod tests {
|
|||||||
let mut s = Search::new(&txn, &index);
|
let mut s = Search::new(&txn, &index);
|
||||||
s.query("released from prison by the government");
|
s.query("released from prison by the government");
|
||||||
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
// s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
|
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
|
||||||
let docs = s.execute().unwrap();
|
let docs = s.execute().unwrap();
|
||||||
|
|
||||||
let elapsed = start.elapsed();
|
let elapsed = start.elapsed();
|
||||||
@ -431,7 +441,7 @@ mod tests {
|
|||||||
builder.execute(|_| (), || false).unwrap();
|
builder.execute(|_| (), || false).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
// #[test]
|
#[test]
|
||||||
fn _index_movies() {
|
fn _index_movies() {
|
||||||
let mut options = EnvOpenOptions::new();
|
let mut options = EnvOpenOptions::new();
|
||||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||||
@ -446,20 +456,14 @@ mod tests {
|
|||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
builder.set_primary_key(primary_key.to_owned());
|
builder.set_primary_key(primary_key.to_owned());
|
||||||
|
|
||||||
let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
||||||
builder.set_searchable_fields(searchable_fields);
|
builder.set_searchable_fields(searchable_fields);
|
||||||
|
|
||||||
let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
||||||
builder.set_filterable_fields(filterable_fields);
|
builder.set_filterable_fields(filterable_fields);
|
||||||
|
builder.set_min_word_len_one_typo(5);
|
||||||
builder.set_criteria(vec![Criterion::Words]);
|
builder.set_min_word_len_two_typos(100);
|
||||||
|
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||||
// let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
|
|
||||||
// builder.set_sortable_fields(sortable_fields);
|
|
||||||
|
|
||||||
builder.execute(|_| (), || false).unwrap();
|
builder.execute(|_| (), || false).unwrap();
|
||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
|
@ -4,11 +4,12 @@ use std::collections::{HashMap, HashSet, VecDeque};
|
|||||||
|
|
||||||
use super::db_cache::DatabaseCache;
|
use super::db_cache::DatabaseCache;
|
||||||
use super::query_term::{QueryTerm, WordDerivations};
|
use super::query_term::{QueryTerm, WordDerivations};
|
||||||
use super::QueryGraph;
|
use super::{NodeIndex, QueryGraph};
|
||||||
use crate::{Index, Result, RoaringBitmapCodec};
|
use crate::{Index, Result, RoaringBitmapCodec};
|
||||||
|
|
||||||
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
|
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
|
||||||
|
|
||||||
|
// TODO: reuse NodeDocidsCache in between calls to resolve_query_graph
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct NodeDocIdsCache {
|
pub struct NodeDocIdsCache {
|
||||||
pub cache: HashMap<usize, RoaringBitmap>,
|
pub cache: HashMap<usize, RoaringBitmap>,
|
||||||
@ -26,7 +27,7 @@ pub fn resolve_query_graph<'transaction>(
|
|||||||
|
|
||||||
// resolve_query_graph_rec(index, txn, q, q.root_node, &mut docids, &mut cache)?;
|
// resolve_query_graph_rec(index, txn, q, q.root_node, &mut docids, &mut cache)?;
|
||||||
|
|
||||||
let mut nodes_resolved = HashSet::new();
|
let mut nodes_resolved = RoaringBitmap::new();
|
||||||
// TODO: should be given as an argument and kept between invocations of resolve query graph
|
// TODO: should be given as an argument and kept between invocations of resolve query graph
|
||||||
let mut nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];
|
let mut nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];
|
||||||
|
|
||||||
@ -34,16 +35,16 @@ pub fn resolve_query_graph<'transaction>(
|
|||||||
next_nodes_to_visit.push_front(q.root_node);
|
next_nodes_to_visit.push_front(q.root_node);
|
||||||
|
|
||||||
while let Some(node) = next_nodes_to_visit.pop_front() {
|
while let Some(node) = next_nodes_to_visit.pop_front() {
|
||||||
let predecessors = &q.edges[node].incoming;
|
let predecessors = &q.edges[node.0 as usize].predecessors;
|
||||||
if !predecessors.is_subset(&nodes_resolved) {
|
if !predecessors.is_subset(&nodes_resolved) {
|
||||||
next_nodes_to_visit.push_back(node);
|
next_nodes_to_visit.push_back(node);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Take union of all predecessors
|
// Take union of all predecessors
|
||||||
let predecessors_iter = predecessors.iter().map(|p| &nodes_docids[*p]);
|
let predecessors_iter = predecessors.iter().map(|p| &nodes_docids[p as usize]);
|
||||||
let predecessors_docids = MultiOps::union(predecessors_iter);
|
let predecessors_docids = MultiOps::union(predecessors_iter);
|
||||||
|
|
||||||
let n = &q.nodes[node];
|
let n = &q.nodes[node.0 as usize];
|
||||||
// println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}");
|
// println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}");
|
||||||
let node_docids = match n {
|
let node_docids = match n {
|
||||||
super::QueryNode::Term(located_term) => {
|
super::QueryNode::Term(located_term) => {
|
||||||
@ -95,18 +96,18 @@ pub fn resolve_query_graph<'transaction>(
|
|||||||
return Ok(predecessors_docids);
|
return Ok(predecessors_docids);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
nodes_resolved.insert(node);
|
nodes_resolved.insert(node.0);
|
||||||
nodes_docids[node] = node_docids;
|
nodes_docids[node.0 as usize] = node_docids;
|
||||||
|
|
||||||
for &succ in q.edges[node].outgoing.iter() {
|
for succ in q.edges[node.0 as usize].successors.iter() {
|
||||||
if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(&succ) {
|
if !next_nodes_to_visit.contains(&NodeIndex(succ)) && !nodes_resolved.contains(succ) {
|
||||||
next_nodes_to_visit.push_back(succ);
|
next_nodes_to_visit.push_back(NodeIndex(succ));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// This is currently slow but could easily be implemented very efficiently
|
// This is currently slow but could easily be implemented very efficiently
|
||||||
for &prec in q.edges[node].incoming.iter() {
|
for prec in q.edges[node.0 as usize].predecessors.iter() {
|
||||||
if q.edges[prec].outgoing.is_subset(&nodes_resolved) {
|
if q.edges[prec as usize].successors.is_subset(&nodes_resolved) {
|
||||||
nodes_docids[prec].clear();
|
nodes_docids[prec as usize].clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// println!("cached docids: {nodes_docids:?}");
|
// println!("cached docids: {nodes_docids:?}");
|
||||||
|
Loading…
x
Reference in New Issue
Block a user