MeiliSearch/milli/src/search/new/graph_based_ranking_rule.rs

/*! Implementation of a generic graph-based ranking rule.

A graph-based ranking rule is a ranking rule that works by representing
its possible operations and their relevancy cost as a directed acyclic multi-graph
built on top of the query graph. It then computes its buckets by finding the
cheapest paths from the start node to the end node and computing the document ids
that satisfy those paths.

For example, the proximity ranking rule builds a graph where the edges between two
nodes represent a condition that the term of the source node is in a certain proximity
to the term of the destination node. With the query "pretty house by" where the term
"pretty" has three possible proximities to the term "house" and "house" has two
proximities to "by", the graph will look like this:

```txt
┌───────┐     ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐    ┌───────┐
│ START │──0─▶│pretty │─────2────▶│ house │      │ by  │─0─▶│  END  │
└───────┘     └───────┘─────3────▶└───────┘──2──▶└─────┘    └───────┘
```
The proximity ranking rule's first bucket will be determined by the union of all
the shortest paths from START to END, which in this case is:
```txt
START --0-> pretty --1--> house --1--> by --0--> end
```
The path's corresponding document ids are found by taking the intersection of the
document ids of each edge. That is, we find the documents where both `pretty` is
1-close to `house` AND `house` is 1-close to `by`.

For the second bucket, we get the union of the second-cheapest paths, which are:
```txt
START --0-> pretty --1--> house --2--> by --0--> end
START --0-> pretty --2--> house --1--> by --0--> end
```
That is, we find the documents where either:
- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
*/

use roaring::RoaringBitmap;

use super::logger::SearchLogger;
use super::ranking_rule_graph::{
    EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait,
};
use super::small_bitmap::SmallBitmap;
use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::Result;

/// A generic graph-based ranking rule.
///
/// The type parameter `G` selects which ranking rule graph is built over the
/// query graph and traversed to compute the buckets.
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
    // Identifier supplied to `new`; a copy of it is returned by `id()`.
    id: String,
    // When the ranking rule is not iterating over its buckets,
    // its state is `None`. It is set by `start_iteration` and cleared by
    // `end_iteration` (or by `next_bucket` once every cost has been visited).
    state: Option<GraphBasedRankingRuleState<G>>,
}
impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
    /// Builds a ranking rule identified by `id`.
    ///
    /// The returned rule holds no iteration state yet: its `state` is `None`
    /// until an iteration is started.
    pub fn new(id: String) -> Self {
        let state = None;
        Self { id, state }
    }
}

/// The internal state of a graph-based ranking rule during iteration.
///
/// It is created by `start_iteration` and updated by every call to `next_bucket`.
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
    /// The current graph. Edges found to be empty for the universe are removed
    /// from it as the iteration progresses.
    graph: RankingRuleGraph<G>,
    /// Cache to retrieve the docids associated with each edge
    edge_docids_cache: EdgeDocidsCache<G>,
    /// Cache used to optimistically discard paths that resolve to no documents.
    empty_paths_cache: EmptyPathsCache,
    /// A structure giving the list of possible costs from each node to the end node,
    /// along with a set of unavoidable edges that must be traversed to achieve that distance.
    /// The outer vector is indexed by node; the rule reads the entry of the root node.
    all_distances: Vec<Vec<(u16, SmallBitmap)>>,
    /// An index into the root node's entry of `all_distances`, giving the cost of the
    /// next bucket. It starts at 0 and is incremented each time a bucket is computed.
    cur_distance_idx: usize,
}

/// Prunes from `graph` every edge whose document ids do not overlap `universe`.
///
/// Every edge slot of `graph.edges_store` that still holds an edge has its docids
/// resolved through `edge_docids_cache`. When the resulting bitmap is disjoint with
/// `universe`, the edge is removed from the graph, reported to `empty_paths_cache`
/// through `forbid_edge`, and its entry is dropped from `edge_docids_cache`.
/// Edges that resolve to `BitmapOrAllRef::All` are always kept.
///
/// # Errors
///
/// Returns the first error produced while resolving the docids of an edge.
fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
    ctx: &mut SearchContext<'search>,
    graph: &mut RankingRuleGraph<G>,
    edge_docids_cache: &mut EdgeDocidsCache<G>,
    universe: &RoaringBitmap,
    empty_paths_cache: &mut EmptyPathsCache,
) -> Result<()> {
    let edge_count = graph.edges_store.len() as u16;
    for edge_index in 0..edge_count {
        // Slots of edges that were already removed have nothing left to check.
        if graph.edges_store[edge_index as usize].is_some() {
            let edge_docids =
                edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?;
            // An edge matching "all" documents can never be empty for the universe.
            let is_empty_for_universe = matches!(
                edge_docids,
                BitmapOrAllRef::Bitmap(docids) if docids.is_disjoint(universe)
            );
            if is_empty_for_universe {
                graph.remove_ranking_rule_edge(edge_index);
                empty_paths_cache.forbid_edge(edge_index);
                edge_docids_cache.cache.remove(&edge_index);
            }
        }
    }
    Ok(())
}

impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
    for GraphBasedRankingRule<G>
{
    /// Returns a copy of the identifier given when the rule was created.
    fn id(&self) -> String {
        self.id.clone()
    }
    /// Prepares the rule for iterating over its buckets.
    ///
    /// Builds the ranking rule graph from a clone of `query_graph`, removes the edges
    /// that have no docids within `universe`, pre-computes the distances from each node
    /// to the end node, and stores the resulting state with `cur_distance_idx` set to 0.
    ///
    /// # Errors
    ///
    /// Propagates errors from building the graph or from resolving edge docids. In that
    /// case the rule's state is left as it was before the call.
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'search>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &RoaringBitmap,
        query_graph: &QueryGraph,
    ) -> Result<()> {
        let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?;
        let mut edge_docids_cache = EdgeDocidsCache::default();
        let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16);

        // First simplify the graph as much as possible, by computing the docids of the edges
        // within the rule's universe and removing the edges that have no associated docids.
        remove_empty_edges(
            ctx,
            &mut graph,
            &mut edge_docids_cache,
            universe,
            &mut empty_paths_cache,
        )?;

        // Then pre-compute the cost of all paths from each node to the end node
        let all_distances = graph.initialize_distances_with_necessary_edges();

        let state = GraphBasedRankingRuleState {
            graph,
            edge_docids_cache,
            empty_paths_cache,
            all_distances,
            cur_distance_idx: 0,
        };

        self.state = Some(state);

        Ok(())
    }

    /// Computes the next bucket, i.e. the docids of `universe` matched by the paths
    /// whose cost is the next one listed for the root node in `all_distances`.
    ///
    /// Returns `Ok(None)` and clears the state once every cost has been visited.
    /// Otherwise returns `Ok(Some(_))` with the (unmodified) query graph and the bucket;
    /// the bucket can be empty when no path of that cost resolves to a document.
    ///
    /// # Panics
    ///
    /// Panics if `universe` contains at most one document, or if the rule has no state
    /// (i.e. `start_iteration` was not called, or the iteration already ended).
    ///
    /// # Errors
    ///
    /// Propagates errors from resolving edge docids or from visiting the paths. The
    /// state is taken out of `self` at the start of the call and is not put back when
    /// an error is returned.
    fn next_bucket(
        &mut self,
        ctx: &mut SearchContext<'search>,
        logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
        // If universe.len() <= 1, the bucket sort algorithm
        // should not have called this function.
        assert!(universe.len() > 1);
        // Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
        // should never happen
        let mut state = self.state.take().unwrap();

        // The universe may differ from the one seen on the previous call, so prune again
        // the edges that are empty within it.
        // TODO: does this have a real positive performance impact?
        remove_empty_edges(
            ctx,
            &mut state.graph,
            &mut state.edge_docids_cache,
            universe,
            &mut state.empty_paths_cache,
        )?;

        // If the cur_distance_idx does not point to a valid cost in the `all_distances`
        // structure, then we have computed all the buckets and can return.
        if state.cur_distance_idx
            >= state.all_distances[state.graph.query_graph.root_node as usize].len()
        {
            self.state = None;
            return Ok(None);
        }

        // Retrieve the cost of the paths to compute
        let (cost, _) =
            state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];
        state.cur_distance_idx += 1;

        // Union of the docids of every non-empty path of the current cost.
        let mut bucket = RoaringBitmap::new();

        // Split the state into disjoint mutable borrows so that the graph, the caches and
        // the distances can be used independently below.
        let GraphBasedRankingRuleState {
            graph,
            edge_docids_cache,
            empty_paths_cache,
            all_distances,
            cur_distance_idx: _,
        } = &mut state;

        // Keep the caller's universe for logging; the local copy shrinks as paths are resolved.
        let original_universe = universe;
        let mut universe = universe.clone();

        // TODO: remove this unnecessary clone
        let original_graph = graph.clone();
        // and this vector as well
        let mut paths = vec![];

        // For each path of the given cost, we will compute its associated
        // document ids.
        // In case the path does not resolve to any document id, we try to figure out why
        // and update the `empty_paths_cache` accordingly.
        // For example, it may be that the path is empty because one of its edges is disjoint
        // with the universe, or because a prefix of the path is disjoint with the universe, or because
        // the path contains two edges that are disjoint from each other within the universe.
        // Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces
        // the number of future candidate paths given by that same function.
        graph.visit_paths_of_cost(
            graph.query_graph.root_node as usize,
            cost,
            all_distances,
            empty_paths_cache,
            |path, graph, empty_paths_cache| {
                // Accumulate the path for logging purposes only
                paths.push(path.to_vec());
                // Start from the remaining universe and narrow it down edge by edge.
                let mut path_docids = universe.clone();

                // We store the edges and their docids in vectors in case the path turns out to be
                // empty and we need to figure out why it was empty.
                let mut visited_edges = vec![];
                let mut cached_edge_docids = vec![];

                for &edge_index in path {
                    visited_edges.push(edge_index);
                    let edge_docids =
                        edge_docids_cache.get_edge_docids(ctx, edge_index, graph, &universe)?;
                    let edge_docids = match edge_docids {
                        BitmapOrAllRef::Bitmap(b) => b,
                        // An edge matching all documents does not restrict the path.
                        BitmapOrAllRef::All => continue,
                    };
                    cached_edge_docids.push((edge_index, edge_docids.clone()));

                    // If the edge is empty, then the path will be empty as well, we update the graph
                    // and caches accordingly and skip to the next candidate path.
                    if edge_docids.is_disjoint(&universe) {
                        // 1. Store in the cache that this edge is empty for this universe
                        empty_paths_cache.forbid_edge(edge_index);
                        // 2. remove this edge from the ranking rule graph
                        graph.remove_ranking_rule_edge(edge_index);
                        // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
                        edge_docids_cache.cache.remove(&edge_index);
                        return Ok(());
                    }
                    path_docids &= edge_docids;

                    // If the (sub)path is empty, we try to figure out why and update the caches accordingly.
                    if path_docids.is_disjoint(&universe) {
                        // First, we know that this path is empty, and thus any path
                        // that is a superset of it will also be empty.
                        empty_paths_cache.forbid_prefix(&visited_edges);
                        // Second, if the intersection between this edge and any
                        // previous one is disjoint with the universe,
                        // then we also know that any path containing the same couple of
                        // edges will also be empty.
                        // The last entry of `cached_edge_docids` is the current edge itself,
                        // so it is excluded from the comparison.
                        for (edge_index2, edge_docids2) in
                            cached_edge_docids[..cached_edge_docids.len() - 1].iter()
                        {
                            let intersection = edge_docids & edge_docids2;
                            if intersection.is_disjoint(&universe) {
                                empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index);
                            }
                        }
                        return Ok(());
                    }
                }
                bucket |= &path_docids;
                // Reduce the size of the universe so that we can more optimistically discard candidate paths
                universe -= path_docids;
                Ok(())
            },
        )?;

        // Log the graph as it was before this bucket's traversal, together with the
        // visited paths and the universe received from the caller.
        G::log_state(
            &original_graph,
            &paths,
            &state.empty_paths_cache,
            original_universe,
            &state.all_distances,
            cost,
            logger,
        );

        // TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however,
        // remove nodes and/or terms within nodes that weren't present in any of the paths.
        let next_query_graph = state.graph.query_graph.clone();

        // Put the state back so that the next call can continue from the following cost.
        self.state = Some(state);

        Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket }))
    }

    /// Ends the current iteration by dropping the rule's state.
    fn end_iteration(
        &mut self,
        _ctx: &mut SearchContext<'search>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
        self.state = None;
    }
}
Add documentation 2023-03-08 13:26:29 +01:00			`/*! Implementation of a generic graph-based ranking rule.`

			`A graph-based ranking rule is a ranking rule that works by representing`
			`its possible operations and their relevancy cost as a directed acyclic multi-graph`
			`built on top of the query graph. It then computes its buckets by finding the`
			`cheapest paths from the start node to the end node and computing the document ids`
			`that satisfy those paths.`

			`For example, the proximity ranking rule builds a graph where the edges between two`
			`nodes represent a condition that the term of the source node is in a certain proximity`
			`to the term of the destination node. With the query "pretty house by" where the term`
			`"pretty" has three possible proximities to the term "house" and "house" has two`
			`proximities to "by", the graph will look like this:`

			```txt
			`┌───────┐ ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐ ┌───────┐`
			`│ START │──0─▶│pretty │─────2────▶│ house │ │ by │─0─▶│ END │`
			`└───────┘ └───────┘─────3────▶└───────┘──2-─▶└─────┘ └───────┘`
			```
			`The proximity ranking rule's first bucket will be determined by the union of all`
			`the shortest paths from START to END, which in this case is:`
			```txt
			`START --0-> pretty --1--> house --1--> by --0--> end`
			```
			`The path's corresponding document ids are found by taking the intersection of the`
			document ids of each edge. That is, we find the documents where both `pretty` is
			1-close to `house` AND `house` is 1-close to `by`.

			`For the second bucket, we get the union of the second-cheapest paths, which are:`
			```txt
			`START --0-> pretty --1--> house --2--> by --0--> end`
			`START --0-> pretty --2--> house --1--> by --0--> end`
			```
			`That is we find the documents where either:`
			- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
			- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
			`*/`

Cargo fmt 2023-03-08 09:55:53 +01:00			`use roaring::RoaringBitmap;`

Add a search logger 2023-02-22 15:34:37 +01:00			`use super::logger::SearchLogger;`
Cargo fmt 2023-03-08 09:55:53 +01:00			`use super::ranking_rule_graph::{`
			`EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait,`
			`};`
Add a few more optimisations to new search algorithms 2023-03-08 09:53:05 +01:00			`use super::small_bitmap::SmallBitmap;`
Cargo fmt 2023-03-08 09:55:53 +01:00			`use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext};`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`use crate::Result;`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00
Add documentation 2023-03-08 13:26:29 +01:00			`/// A generic graph-based ranking rule`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {`
Add a search logger 2023-02-22 15:34:37 +01:00			`id: String,`
Add documentation 2023-03-08 13:26:29 +01:00			`// When the ranking rule is not iterating over its buckets,`
			// its state is `None`.
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`state: Option<GraphBasedRankingRuleState<G>>,`
			`}`
Add a search logger 2023-02-22 15:34:37 +01:00			`impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {`
Add documentation 2023-03-08 13:26:29 +01:00			`/// Creates the ranking rule with the given identifier`
Add a search logger 2023-02-22 15:34:37 +01:00			`pub fn new(id: String) -> Self {`
			`Self { id, state: None }`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`}`
			`}`

Add documentation 2023-03-08 13:26:29 +01:00			`/// The internal state of a graph-based ranking rule during iteration`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {`
Add documentation 2023-03-08 13:26:29 +01:00			`/// The current graph`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`graph: RankingRuleGraph<G>,`
Add documentation 2023-03-08 13:26:29 +01:00			`/// Cache to retrieve the docids associated with each edge`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`edge_docids_cache: EdgeDocidsCache<G>,`
Add documentation 2023-03-08 13:26:29 +01:00			`/// Cache used to optimistically discard paths that resolve to no documents.`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`empty_paths_cache: EmptyPathsCache,`
Add documentation 2023-03-08 13:26:29 +01:00			`/// A structure giving the list of possible costs from each node to the end node,`
			`/// along with a set of unavoidable edges that must be traversed to achieve that distance.`
Add a few more optimisations to new search algorithms 2023-03-08 09:53:05 +01:00			`all_distances: Vec<Vec<(u16, SmallBitmap)>>,`
Add documentation 2023-03-08 13:26:29 +01:00			/// An index in the first element of `all_distances`, giving the cost of the next bucket
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`cur_distance_idx: usize,`
			`}`

Add documentation 2023-03-08 13:26:29 +01:00			`/// Traverse each edge of the graph, computes its associated document ids,`
			`/// and remove this edge from the graph if its docids are disjoint with the`
			`/// given universe.`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(`
			`ctx: &mut SearchContext<'search>,`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`graph: &mut RankingRuleGraph<G>,`
			`edge_docids_cache: &mut EdgeDocidsCache<G>,`
			`universe: &RoaringBitmap,`
			`empty_paths_cache: &mut EmptyPathsCache,`
			`) -> Result<()> {`
Continue documenting and cleaning up the code 2023-03-08 15:04:25 +01:00			`for edge_index in 0..graph.edges_store.len() as u16 {`
			`if graph.edges_store[edge_index as usize].is_none() {`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`continue;`
			`}`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?;`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`match docids {`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`BitmapOrAllRef::Bitmap(docids) => {`
			`if docids.is_disjoint(universe) {`
Continue documenting and cleaning up the code 2023-03-08 15:04:25 +01:00			`graph.remove_ranking_rule_edge(edge_index);`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`empty_paths_cache.forbid_edge(edge_index);`
			`edge_docids_cache.cache.remove(&edge_index);`
			`continue;`
			`}`
			`}`
			`BitmapOrAllRef::All => continue,`
			`}`
			`}`
			`Ok(())`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`}`

Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`for GraphBasedRankingRule<G>`
			`{`
Add a search logger 2023-02-22 15:34:37 +01:00			`fn id(&self) -> String {`
			`self.id.clone()`
			`}`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`fn start_iteration(`
			`&mut self,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx: &mut SearchContext<'search>,`
Remove warnings 2023-02-28 11:49:24 +01:00			`_logger: &mut dyn SearchLogger<QueryGraph>,`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`universe: &RoaringBitmap,`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`query_graph: &QueryGraph,`
			`) -> Result<()> {`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?;`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`let mut edge_docids_cache = EdgeDocidsCache::default();`
Continue documenting and cleaning up the code 2023-03-08 15:04:25 +01:00			`let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16);`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00
Add documentation 2023-03-08 13:26:29 +01:00			`// First simplify the graph as much as possible, by computing the docids of the edges`
			`// within the rule's universe and removing the edges that have no associated docids.`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`remove_empty_edges(`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx,`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`&mut graph,`
			`&mut edge_docids_cache,`
			`universe,`
			`&mut empty_paths_cache,`
			`)?;`
Add documentation 2023-03-08 13:26:29 +01:00
			`// Then pre-compute the cost of all paths from each node to the end node`
Add a few more optimisations to new search algorithms 2023-03-08 09:53:05 +01:00			`let all_distances = graph.initialize_distances_with_necessary_edges();`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00
			`let state = GraphBasedRankingRuleState {`
			`graph,`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`edge_docids_cache,`
			`empty_paths_cache,`
			`all_distances,`
			`cur_distance_idx: 0,`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`};`

			`self.state = Some(state);`

			`Ok(())`
			`}`

			`fn next_bucket(`
			`&mut self,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx: &mut SearchContext<'search>,`
Add a search logger 2023-02-22 15:34:37 +01:00			`logger: &mut dyn SearchLogger<QueryGraph>,`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`universe: &RoaringBitmap,`
			`) -> Result<Option<RankingRuleOutput<QueryGraph>>> {`
Add documentation 2023-03-08 13:26:29 +01:00			`// If universe.len() <= 1, the bucket sort algorithm`
			`// should not have called this function.`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`assert!(universe.len() > 1);`
Add documentation 2023-03-08 13:26:29 +01:00			// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
			`// should never happen`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`let mut state = self.state.take().unwrap();`
Add a few more optimisations to new search algorithms 2023-03-08 09:53:05 +01:00
Add documentation 2023-03-08 13:26:29 +01:00			`// TODO: does this have a real positive performance cost?`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`remove_empty_edges(`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx,`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`&mut state.graph,`
			`&mut state.edge_docids_cache,`
			`universe,`
			`&mut state.empty_paths_cache,`
			`)?;`
Add typo ranking rule to new search impl 2023-02-28 14:19:57 +01:00
Add documentation 2023-03-08 13:26:29 +01:00			// If the cur_distance_idx does not point to a valid cost in the `all_distances`
			`// structure, then we have computed all the buckets and can return.`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`if state.cur_distance_idx`
			`>= state.all_distances[state.graph.query_graph.root_node as usize].len()`
			`{`
			`self.state = None;`
Add typo ranking rule to new search impl 2023-02-28 14:19:57 +01:00			`return Ok(None);`
			`}`
Add documentation 2023-03-08 13:26:29 +01:00
			`// Retrieve the cost of the paths to compute`
Add a few more optimisations to new search algorithms 2023-03-08 09:53:05 +01:00			`let (cost, _) =`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];`
			`state.cur_distance_idx += 1;`

Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`let mut bucket = RoaringBitmap::new();`

			`let GraphBasedRankingRuleState {`
			`graph,`
			`edge_docids_cache,`
			`empty_paths_cache,`
			`all_distances,`
			`cur_distance_idx: _,`
			`} = &mut state;`

			`let original_universe = universe;`
			`let mut universe = universe.clone();`

Add a few more optimisations to new search algorithms 2023-03-08 09:53:05 +01:00			`// TODO: remove this unnecessary clone`
			`let original_graph = graph.clone();`
Add documentation 2023-03-08 13:26:29 +01:00			`// and this vector as well`
			`let mut paths = vec![];`

			`// For each path of the given cost, we will compute its associated`
			`// document ids.`
			`// In case the path does not resolve to any document id, we try to figure out why`
			// and update the `empty_paths_cache` accordingly.
			`// For example, it may be that the path is empty because one of its edges is disjoint`
			`// with the universe, or because a prefix of the path is disjoint with the universe, or because`
			`// the path contains two edges that are disjoint from each other within the universe.`
			// Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces
			`// the number of future candidate paths given by that same function.`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`graph.visit_paths_of_cost(`
			`graph.query_graph.root_node as usize,`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`cost,`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`all_distances,`
			`empty_paths_cache,`
			`\|path, graph, empty_paths_cache\| {`
Add documentation 2023-03-08 13:26:29 +01:00			`// Accumulate the path for logging purposes only`
Add a few more optimisations to new search algorithms 2023-03-08 09:53:05 +01:00			`paths.push(path.to_vec());`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`let mut path_docids = universe.clone();`
Add documentation 2023-03-08 13:26:29 +01:00
			`// We store the edges and their docids in vectors in case the path turns out to be`
			`// empty and we need to figure out why it was empty.`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`let mut visited_edges = vec![];`
			`let mut cached_edge_docids = vec![];`
Add documentation 2023-03-08 13:26:29 +01:00
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`for &edge_index in path {`
			`visited_edges.push(edge_index);`
			`let edge_docids =`
			`edge_docids_cache.get_edge_docids(ctx, edge_index, graph, &universe)?;`
			`let edge_docids = match edge_docids {`
			`BitmapOrAllRef::Bitmap(b) => b,`
			`BitmapOrAllRef::All => continue,`
			`};`
			`cached_edge_docids.push((edge_index, edge_docids.clone()));`
Add documentation 2023-03-08 13:26:29 +01:00
			`// If the edge is empty, then the path will be empty as well, we update the graph`
			`// and caches accordingly and skip to the next candidate path.`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`if edge_docids.is_disjoint(&universe) {`
			`// 1. Store in the cache that this edge is empty for this universe`
			`empty_paths_cache.forbid_edge(edge_index);`
			`// 2. remove this edge from the ranking rule graph`
Continue documenting and cleaning up the code 2023-03-08 15:04:25 +01:00			`graph.remove_ranking_rule_edge(edge_index);`
Add documentation 2023-03-08 13:26:29 +01:00			`// 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`edge_docids_cache.cache.remove(&edge_index);`
			`return Ok(());`
			`}`
			`path_docids &= edge_docids;`

Add documentation 2023-03-08 13:26:29 +01:00			`// If the (sub)path is empty, we try to figure out why and update the caches accordingly.`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`if path_docids.is_disjoint(&universe) {`
Add documentation 2023-03-08 13:26:29 +01:00			`// First, we know that this path is empty, and thus any path`
			`// that is a superset of it will also be empty.`
			`empty_paths_cache.forbid_prefix(&visited_edges);`
			`// Second, if the intersection between this edge and any`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`// previous one is disjoint with the universe,`
Add documentation 2023-03-08 13:26:29 +01:00			`// then we also know that any path containing the same couple of`
			`// edges will also be empty.`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`for (edge_index2, edge_docids2) in`
			`cached_edge_docids[..cached_edge_docids.len() - 1].iter()`
			`{`
			`let intersection = edge_docids & edge_docids2;`
			`if intersection.is_disjoint(&universe) {`
			`empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index);`
			`}`
			`}`
			`return Ok(());`
			`}`
			`}`
			`bucket \|= &path_docids;`
Add documentation 2023-03-08 13:26:29 +01:00			`// Reduce the size of the universe so that we can more optimistically discard candidate paths`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`universe -= path_docids;`
			`Ok(())`
			`},`
			`)?;`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00
			`G::log_state(`
Add a few more optimisations to new search algorithms 2023-03-08 09:53:05 +01:00			`&original_graph,`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`&paths,`
			`&state.empty_paths_cache,`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`original_universe,`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`&state.all_distances,`
			`cost,`
			`logger,`
			`);`
Improve the visual/detailed search logger 2023-02-23 13:13:19 +01:00
Add documentation 2023-03-08 13:26:29 +01:00			`// TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however,`
			`// remove nodes and/or terms within nodes that weren't present in any of the paths.`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`let next_query_graph = state.graph.query_graph.clone();`

			`self.state = Some(state);`

			`Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket }))`
			`}`

			`fn end_iteration(`
			`&mut self,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`_ctx: &mut SearchContext<'search>,`
Remove warnings 2023-02-28 11:49:24 +01:00			`_logger: &mut dyn SearchLogger<QueryGraph>,`
Introduce a generic graph-based ranking rule 2023-02-21 09:48:49 +01:00			`) {`
			`self.state = None;`
			`}`
			`}`