Continue documenting and cleaning up the code

This commit is contained in:
Loïc Lecrenier 2023-03-08 15:04:25 +01:00
parent c232cdabf5
commit 2099991dd1
12 changed files with 245 additions and 325 deletions

View File

@ -85,15 +85,15 @@ fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
universe: &RoaringBitmap,
empty_paths_cache: &mut EmptyPathsCache,
) -> Result<()> {
for edge_index in 0..graph.all_edges.len() as u16 {
if graph.all_edges[edge_index as usize].is_none() {
for edge_index in 0..graph.edges_store.len() as u16 {
if graph.edges_store[edge_index as usize].is_none() {
continue;
}
let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?;
match docids {
BitmapOrAllRef::Bitmap(docids) => {
if docids.is_disjoint(universe) {
graph.remove_edge(edge_index);
graph.remove_ranking_rule_edge(edge_index);
empty_paths_cache.forbid_edge(edge_index);
edge_docids_cache.cache.remove(&edge_index);
continue;
@ -120,7 +120,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
) -> Result<()> {
let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?;
let mut edge_docids_cache = EdgeDocidsCache::default();
let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16);
let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16);
// First simplify the graph as much as possible, by computing the docids of the edges
// within the rule's universe and removing the edges that have no associated docids.
@ -242,7 +242,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
// 1. Store in the cache that this edge is empty for this universe
empty_paths_cache.forbid_edge(edge_index);
// 2. remove this edge from the ranking rule graph
graph.remove_edge(edge_index);
graph.remove_ranking_rule_edge(edge_index);
// 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
edge_docids_cache.cache.remove(&edge_index);
return Ok(());

View File

@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::search::new::ranking_rule_graph::{
Edge, EdgeDetails, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait,
Edge, EdgeCondition, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait,
TypoGraph,
};
use crate::search::new::small_bitmap::SmallBitmap;
@ -534,24 +534,24 @@ shape: class"
let distances = &distances[node_idx];
Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file);
}
for edge in graph.all_edges.iter().flatten() {
let Edge { from_node, to_node, details, .. } = edge;
for edge in graph.edges_store.iter().flatten() {
let Edge { source_node, dest_node, condition: details, .. } = edge;
match &details {
EdgeDetails::Unconditional => {
EdgeCondition::Unconditional => {
writeln!(
file,
"{from_node} -> {to_node} : \"always cost {cost}\"",
"{source_node} -> {dest_node} : \"always cost {cost}\"",
cost = edge.cost,
)
.unwrap();
}
EdgeDetails::Data(details) => {
EdgeCondition::Conditional(details) => {
writeln!(
file,
"{from_node} -> {to_node} : \"cost {cost} {edge_label}\"",
"{source_node} -> {dest_node} : \"cost {cost} {edge_label}\"",
cost = edge.cost,
edge_label = R::graphviz_edge_details_label(details)
edge_label = R::label_for_edge_condition(details)
)
.unwrap();
}
@ -589,10 +589,10 @@ shape: class"
edge_idx: u16,
file: &mut File,
) {
let Edge { from_node, to_node, cost, .. } =
graph.all_edges[edge_idx as usize].as_ref().unwrap();
let from_node = &graph.query_graph.nodes[*from_node as usize];
let from_node_desc = match from_node {
let Edge { source_node, dest_node, cost, .. } =
graph.edges_store[edge_idx as usize].as_ref().unwrap();
let source_node = &graph.query_graph.nodes[*source_node as usize];
let source_node_desc = match source_node {
QueryNode::Term(term) => match &term.value {
QueryTerm::Phrase { phrase } => {
let phrase = ctx.phrase_interner.get(*phrase);
@ -606,8 +606,8 @@ shape: class"
QueryNode::Start => "START".to_owned(),
QueryNode::End => "END".to_owned(),
};
let to_node = &graph.query_graph.nodes[*to_node as usize];
let to_node_desc = match to_node {
let dest_node = &graph.query_graph.nodes[*dest_node as usize];
let dest_node_desc = match dest_node {
QueryNode::Term(term) => match &term.value {
QueryTerm::Phrase { phrase } => {
let phrase = ctx.phrase_interner.get(*phrase);
@ -623,7 +623,7 @@ shape: class"
};
writeln!(
file,
"{edge_idx}: \"{from_node_desc}->{to_node_desc} [{cost}]\" {{
"{edge_idx}: \"{source_node_desc}->{dest_node_desc} [{cost}]\" {{
shape: class
}}"
)

View File

@ -1,6 +1,3 @@
// TODO: put primitive query part in here
use std::borrow::Cow;
use std::mem;
use std::ops::RangeInclusive;
@ -18,6 +15,8 @@ use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
use crate::search::{build_dfa, get_first};
use crate::{CboRoaringBitmapLenCodec, Index, Result};
/// A phrase in the user's search query, consisting of several words
/// that must appear side-by-side in the search results.
#[derive(Default, Clone, PartialEq, Eq, Hash)]
pub struct Phrase {
pub words: Vec<Option<Interned<String>>>,
@ -28,18 +27,38 @@ impl Phrase {
}
}
/// A structure storing all the different ways to match
/// a term in the user's search query.
#[derive(Clone)]
pub struct WordDerivations {
/// The original word
pub original: Interned<String>,
// TODO: pub prefix_of: Vec<String>,
// TODO: original should only be used for debugging purposes?
// TODO: pub zero_typo: Option<Interned<String>>,
// TODO: pub prefix_of: Box<[Interned<String>]>,
/// All the synonyms of the original word
pub synonyms: Box<[Interned<Phrase>]>,
/// The original word split into multiple consecutive words
pub split_words: Option<Interned<Phrase>>,
/// The original word and words which are prefixed by it
pub zero_typo: Box<[Interned<String>]>,
/// Words that are 1 typo away from the original word
pub one_typo: Box<[Interned<String>]>,
/// Words that are 2 typos away from the original word
pub two_typos: Box<[Interned<String>]>,
/// True if the prefix databases must be used to retrieve
/// the words which are prefixed by the original word.
pub use_prefix_db: bool,
}
impl WordDerivations {
/// Return an iterator over all the single words derived from the original word.
///
/// This excludes synonyms, split words, and words stored in the prefix databases.
pub fn all_derivations_except_prefix_db(
&'_ self,
) -> impl Iterator<Item = Interned<String>> + Clone + '_ {
@ -49,17 +68,20 @@ impl WordDerivations {
self.zero_typo.is_empty()
&& self.one_typo.is_empty()
&& self.two_typos.is_empty()
&& self.synonyms.is_empty()
&& self.split_words.is_none()
&& !self.use_prefix_db
}
}
/// Compute the word derivations for the given word
pub fn word_derivations(
ctx: &mut SearchContext,
word: &str,
max_typo: u8,
is_prefix: bool,
fst: &fst::Set<Cow<[u8]>>,
) -> Result<WordDerivations> {
let fst = ctx.index.words_fst(ctx.txn)?;
let word_interned = ctx.word_interner.insert(word.to_owned());
let use_prefix_db = is_prefix
@ -171,6 +193,10 @@ pub fn word_derivations(
})
}
/// Split the original word into the two words that appear the
/// most next to each other in the index.
///
/// Return `None` if the original word cannot be split.
fn split_best_frequency(
index: &Index,
txn: &RoTxn,
@ -199,16 +225,12 @@ fn split_best_frequency(
#[derive(Clone)]
pub enum QueryTerm {
// TODO: should there be SplitWord, NGram2, and NGram3 variants?
// NGram2 can have 1 typo and synonyms
// NGram3 cannot have typos but can have synonyms
// SplitWords are a phrase
// Can NGrams be prefixes?
Phrase { phrase: Interned<Phrase> },
Word { derivations: WordDerivations },
}
impl QueryTerm {
/// Return the original word from the given query term
pub fn original_single_word<'interner>(
&self,
word_interner: &'interner Interner<String>,
@ -226,6 +248,7 @@ impl QueryTerm {
}
}
/// A query term coupled with its position in the user's search query.
#[derive(Clone)]
pub struct LocatedQueryTerm {
pub value: QueryTerm,
@ -233,14 +256,18 @@ pub struct LocatedQueryTerm {
}
impl LocatedQueryTerm {
/// Return `true` iff the word derivations within the query term are empty
pub fn is_empty(&self) -> bool {
match &self.value {
// TODO: phrases should be greedily computed, so that they can be excluded from
// the query graph right from the start?
QueryTerm::Phrase { phrase: _ } => false,
QueryTerm::Word { derivations, .. } => derivations.is_empty(),
}
}
}
/// Convert the tokenised search query into a list of located query terms.
pub fn located_query_terms_from_string<'search>(
ctx: &mut SearchContext<'search>,
query: NormalizedTokenIter<Vec<u8>>,
@ -250,8 +277,8 @@ pub fn located_query_terms_from_string<'search>(
let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
// TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
let exact_words = ctx.index.exact_words(ctx.txn)?;
let fst = ctx.index.words_fst(ctx.txn)?;
let nbr_typos = |word: &str| {
if !authorize_typos
@ -266,9 +293,9 @@ pub fn located_query_terms_from_string<'search>(
}
};
let mut primitive_query = Vec::new();
let mut phrase = Vec::new();
let mut located_terms = Vec::new();
let mut phrase = Vec::new();
let mut quoted = false;
let parts_limit = words_limit.unwrap_or(usize::MAX);
@ -280,8 +307,8 @@ pub fn located_query_terms_from_string<'search>(
let mut peekable = query.peekable();
while let Some(token) = peekable.next() {
// early return if word limit is exceeded
if primitive_query.len() >= parts_limit {
return Ok(primitive_query);
if located_terms.len() >= parts_limit {
return Ok(located_terms);
}
match token.kind {
@ -307,24 +334,23 @@ pub fn located_query_terms_from_string<'search>(
match token.kind {
TokenKind::Word => {
let word = token.lemma();
let derivations =
word_derivations(ctx, word, nbr_typos(word), false, &fst)?;
let derivations = word_derivations(ctx, word, nbr_typos(word), false)?;
let located_term = LocatedQueryTerm {
value: QueryTerm::Word { derivations },
positions: position..=position,
};
primitive_query.push(located_term);
located_terms.push(located_term);
}
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
}
} else {
let word = token.lemma();
let derivations = word_derivations(ctx, word, nbr_typos(word), true, &fst)?;
let derivations = word_derivations(ctx, word, nbr_typos(word), true)?;
let located_term = LocatedQueryTerm {
value: QueryTerm::Word { derivations },
positions: position..=position,
};
primitive_query.push(located_term);
located_terms.push(located_term);
}
}
TokenKind::Separator(separator_kind) => {
@ -352,7 +378,7 @@ pub fn located_query_terms_from_string<'search>(
},
positions: phrase_start..=phrase_end,
};
primitive_query.push(located_query_term);
located_terms.push(located_query_term);
}
}
_ => (),
@ -367,10 +393,10 @@ pub fn located_query_terms_from_string<'search>(
},
positions: phrase_start..=phrase_end,
};
primitive_query.push(located_query_term);
located_terms.push(located_query_term);
}
Ok(primitive_query)
Ok(located_terms)
}
// TODO: return a word derivations instead?
@ -396,6 +422,8 @@ pub fn ngram2(
_ => None,
}
}
// TODO: return a word derivations instead?
pub fn ngram3(
ctx: &mut SearchContext,
x: &LocatedQueryTerm,

View File

@ -6,49 +6,43 @@ use crate::search::new::{QueryGraph, SearchContext};
use crate::Result;
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
/// Build the ranking rule graph from the given query graph
pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result<Self> {
let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. } = &query_graph;
let mut all_edges = vec![];
let mut node_edges = vec![];
let mut successors = vec![];
let mut edges_store = vec![];
let mut edges_of_node = vec![];
for (node_idx, node) in graph_nodes.iter().enumerate() {
node_edges.push(HashSet::new());
successors.push(HashSet::new());
let new_edges = node_edges.last_mut().unwrap();
let new_successors = successors.last_mut().unwrap();
edges_of_node.push(HashSet::new());
let new_edges = edges_of_node.last_mut().unwrap();
let Some(from_node_data) = G::build_visit_from_node(ctx, node)? else { continue };
let Some(source_node_data) = G::build_step_visit_source_node(ctx, node)? else { continue };
for successor_idx in graph_edges[node_idx].successors.iter() {
let to_node = &graph_nodes[successor_idx as usize];
let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?;
let dest_node = &graph_nodes[successor_idx as usize];
let edges =
G::build_step_visit_destination_node(ctx, dest_node, &source_node_data)?;
if edges.is_empty() {
continue;
}
edges.sort_by_key(|e| e.0);
for (cost, details) in edges {
all_edges.push(Some(Edge {
from_node: node_idx as u16,
to_node: successor_idx,
edges_store.push(Some(Edge {
source_node: node_idx as u16,
dest_node: successor_idx,
cost,
details,
condition: details,
}));
new_edges.insert(all_edges.len() as u16 - 1);
new_successors.insert(successor_idx);
new_edges.insert(edges_store.len() as u16 - 1);
}
}
}
let node_edges = node_edges
let edges_of_node = edges_of_node
.into_iter()
.map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16))
.collect();
let successors = successors
.into_iter()
.map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16))
.map(|edges| SmallBitmap::from_iter(edges.into_iter(), edges_store.len() as u16))
.collect();
Ok(RankingRuleGraph { query_graph, all_edges, node_edges, successors })
Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node })
}
}

View File

@ -30,7 +30,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
empty_paths_cache,
&mut visit,
&mut vec![],
&mut SmallBitmap::new(self.all_edges.len() as u16),
&mut SmallBitmap::new(self.edges_store.len() as u16),
empty_paths_cache.empty_edges.clone(),
)?;
Ok(())
@ -48,12 +48,12 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
) -> Result<bool> {
let mut any_valid = false;
let edges = self.node_edges[from].clone();
let edges = self.edges_of_node[from].clone();
for edge_idx in edges.iter() {
let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue };
let Some(edge) = self.edges_store[edge_idx as usize].as_ref() else { continue };
if cost < edge.cost as u16
|| forbidden_edges.contains(edge_idx)
|| !all_distances[edge.to_node as usize].iter().any(
|| !all_distances[edge.dest_node as usize].iter().any(
|(next_cost, necessary_edges)| {
(*next_cost == cost - edge.cost as u16)
&& !forbidden_edges.intersects(necessary_edges)
@ -71,13 +71,13 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
new_forbidden_edges.insert(x);
});
let next_any_valid = if edge.to_node == self.query_graph.end_node {
let next_any_valid = if edge.dest_node == self.query_graph.end_node {
any_valid = true;
visit(prev_edges, self, empty_paths_cache)?;
true
} else {
self.visit_paths_of_cost_rec(
edge.to_node as usize,
edge.dest_node as usize,
cost - edge.cost as u16,
all_distances,
empty_paths_cache,
@ -115,7 +115,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
let mut node_stack = VecDeque::new();
distances_to_end[self.query_graph.end_node as usize] =
vec![(0, SmallBitmap::new(self.all_edges.len() as u16))];
vec![(0, SmallBitmap::new(self.edges_store.len() as u16))];
for prev_node in
self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter()
@ -127,15 +127,15 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
while let Some(cur_node) = node_stack.pop_front() {
let mut self_distances = BTreeMap::<u16, SmallBitmap>::new();
let cur_node_edges = &self.node_edges[cur_node];
let cur_node_edges = &self.edges_of_node[cur_node];
for edge_idx in cur_node_edges.iter() {
let edge = self.all_edges[edge_idx as usize].as_ref().unwrap();
let succ_node = edge.to_node;
let edge = self.edges_store[edge_idx as usize].as_ref().unwrap();
let succ_node = edge.dest_node;
let succ_distances = &distances_to_end[succ_node as usize];
for (succ_distance, succ_necessary_edges) in succ_distances {
let potential_necessary_edges = SmallBitmap::from_iter(
std::iter::once(edge_idx).chain(succ_necessary_edges.iter()),
self.all_edges.len() as u16,
self.edges_store.len() as u16,
);
match self_distances.entry(edge.cost as u16 + succ_distance) {
Entry::Occupied(mut prev_necessary_edges) => {

View File

@ -3,28 +3,13 @@ use std::marker::PhantomData;
use fxhash::FxHashMap;
use roaring::RoaringBitmap;
use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::{BitmapOrAllRef, SearchContext};
use crate::Result;
// TODO: the cache should have a G::EdgeDetails as key
// but then it means that we should have a quick way of
// computing their hash and comparing them
// which can be done...
// by using a pointer (real, Rc, bumpalo, or in a vector)???
//
// But actually.... the edge details' docids are a subset of the universe at the
// moment they were computed.
// But the universes between two iterations of a ranking rule are completely different
// Thus, there is no point in doing this.
// UNLESS...
// we compute the whole docids corresponding to the edge details (potentially expensive in time and memory
// in the common case)
//
// But we could still benefit within a single iteration for requests like:
// `a a a a a a a a a` where we have many of the same edge details, repeated
/// A cache storing the document ids associated with each ranking rule edge
pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
// TODO: should be FxHashMap<Interned<EdgeCondition>, RoaringBitmap>
pub cache: FxHashMap<u16, RoaringBitmap>,
_phantom: PhantomData<G>,
}
@ -34,19 +19,24 @@ impl<G: RankingRuleGraphTrait> Default for EdgeDocidsCache<G> {
}
}
impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
/// Retrieve the document ids for the given edge condition.
///
/// If the cache does not yet contain these docids, they are computed
/// and inserted in the cache.
pub fn get_edge_docids<'s, 'search>(
&'s mut self,
ctx: &mut SearchContext<'search>,
// TODO: should be Interned<EdgeCondition>
edge_index: u16,
graph: &RankingRuleGraph<G>,
// TODO: maybe universe doesn't belong here
universe: &RoaringBitmap,
) -> Result<BitmapOrAllRef<'s>> {
let edge = graph.all_edges[edge_index as usize].as_ref().unwrap();
let edge = graph.edges_store[edge_index as usize].as_ref().unwrap();
match &edge.details {
EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All),
EdgeDetails::Data(details) => {
match &edge.condition {
EdgeCondition::Unconditional => Ok(BitmapOrAllRef::All),
EdgeCondition::Conditional(details) => {
if self.cache.contains_key(&edge_index) {
// TODO: should we update the bitmap in the cache if the new universe
// reduces it?
@ -56,7 +46,7 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
}
// TODO: maybe universe doesn't belong here
let docids = universe & G::compute_docids(ctx, details, universe)?;
let docids = universe & G::resolve_edge_condition(ctx, details, universe)?;
let _ = self.cache.insert(edge_index, docids);
let docids = &self.cache[&edge_index];
Ok(BitmapOrAllRef::Bitmap(docids))

View File

@ -1,20 +1,29 @@
use super::paths_map::PathsMap;
use super::paths_map::PathSet;
use crate::search::new::small_bitmap::SmallBitmap;
/// A cache which stores sufficient conditions for a path
/// to resolve to an empty set of candidates within the current
/// universe.
#[derive(Clone)]
pub struct EmptyPathsCache {
/// The set of edge indexes that resolve to no documents.
pub empty_edges: SmallBitmap,
pub empty_prefixes: PathsMap<()>,
/// A set of path prefixes that resolve to no documents.
pub empty_prefixes: PathSet,
/// A set of couples of edge indexes that resolve to no documents.
pub empty_couple_edges: Vec<SmallBitmap>,
}
impl EmptyPathsCache {
/// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges.
pub fn new(all_edges_len: u16) -> Self {
Self {
empty_edges: SmallBitmap::new(all_edges_len),
empty_prefixes: PathsMap::default(),
empty_prefixes: PathSet::default(),
empty_couple_edges: vec![SmallBitmap::new(all_edges_len); all_edges_len as usize],
}
}
/// Store in the cache that every path containing the given edge resolves to no documents.
pub fn forbid_edge(&mut self, edge_idx: u16) {
self.empty_edges.insert(edge_idx);
self.empty_couple_edges[edge_idx as usize].clear();
@ -23,12 +32,17 @@ impl EmptyPathsCache {
edges2.remove(edge_idx);
}
}
/// Store in the cache that every path containing the given prefix resolves to no documents.
pub fn forbid_prefix(&mut self, prefix: &[u16]) {
self.empty_prefixes.insert(prefix.iter().copied(), ());
self.empty_prefixes.insert(prefix.iter().copied());
}
/// Store in the cache that every path containing the two given edges resolves to no documents.
pub fn forbid_couple_edges(&mut self, edge1: u16, edge2: u16) {
self.empty_couple_edges[edge1 as usize].insert(edge2);
}
/// Returns true if the cache can determine that the given path resolves to no documents.
pub fn path_is_empty(&self, path: &[u16], path_bitmap: &SmallBitmap) -> bool {
if path_bitmap.intersects(&self.empty_edges) {
return true;

View File

@ -1,9 +1,19 @@
/*! Module implementing the graph used for the graph-based ranking rules
and its related algorithms.
A ranking rule graph is built on top of the [`QueryGraph`]: the nodes stay
the same but the edges are replaced.
*/
mod build;
mod cheapest_paths;
mod edge_docids_cache;
mod empty_paths_cache;
mod paths_map;
/// Implementation of the `proximity` ranking rule
mod proximity;
/// Implementation of the `typo` ranking rule
mod typo;
pub use edge_docids_cache::EdgeDocidsCache;
@ -17,30 +27,38 @@ use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
/// The condition that is associated with an edge in the ranking rule graph.
///
/// Some edges are unconditional, which means that traversing them does not reduce
/// the set of candidates.
///
/// Most edges, however, have a condition attached to them. For example, for the
/// proximity ranking rule, the condition could be that a word is N-close to another one.
/// When the edge is traversed, some database operations are executed to retrieve the set
/// of documents that satisfy the condition, which reduces the list of candidate document ids.
#[derive(Debug, Clone)]
pub enum EdgeDetails<E> {
pub enum EdgeCondition<E> {
Unconditional,
Data(E),
Conditional(E),
}
/// An edge in the ranking rule graph.
///
/// It contains:
/// 1. The source and destination nodes
/// 2. The cost of traversing this edge
/// 3. The condition associated with it
#[derive(Debug, Clone)]
pub struct Edge<E> {
pub from_node: u16,
pub to_node: u16,
pub source_node: u16,
pub dest_node: u16,
pub cost: u8,
pub details: EdgeDetails<E>,
}
#[derive(Debug, Clone)]
pub struct EdgePointer<'graph, E> {
pub index: u16,
pub edge: &'graph Edge<E>,
pub condition: EdgeCondition<E>,
}
// pub struct SubWordDerivations {
// words: FxHashSet<Interned<String>>,
// synonyms: FxHashSet<Interned<Phrase>>, // NO! they're phrases, not strings
// split_words: bool,
// phrases: FxHashSet<Interned<Phrase>>,
// use_prefix_db: bool,
// }
@ -74,46 +92,55 @@ pub struct EdgePointer<'graph, E> {
// }
// fn word_derivations_used_by_edge<G: RankingRuleGraphTrait>(
// edge: G::EdgeDetails,
// edge: G::EdgeCondition,
// ) -> SubWordDerivations {
// todo!()
// }
/// A trait to be implemented by a marker type to build a graph-based ranking rule.
///
/// It mostly describes how to:
/// 1. Retrieve the set of edges (their cost and condition) between two nodes.
/// 2. Compute the document ids satisfying a condition
pub trait RankingRuleGraphTrait: Sized {
/// The details of an edge connecting two query nodes. These details
/// The condition of an edge connecting two query nodes. The condition
/// should be sufficient to compute the edge's cost and associated document ids
/// in [`compute_docids`](RankingRuleGraphTrait).
type EdgeDetails: Sized + Clone;
/// in [`resolve_edge_condition`](RankingRuleGraphTrait::resolve_edge_condition).
type EdgeCondition: Sized + Clone;
/// A structure used in the construction of the graph, created when a
/// query graph source node is visited. It is used to determine the cost
/// and condition of a ranking rule edge when the destination node is visited.
type BuildVisitedFromNode;
/// Return the label of the given edge details, to be used when visualising
/// the ranking rule graph using GraphViz.
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String;
/// Return the label of the given edge condition, to be used when visualising
/// the ranking rule graph.
fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String;
/// Compute the document ids associated with the given edge.
fn compute_docids<'search>(
/// Compute the document ids associated with the given edge condition,
/// restricted to the given universe.
fn resolve_edge_condition<'search>(
ctx: &mut SearchContext<'search>,
edge_details: &Self::EdgeDetails,
edge_condition: &Self::EdgeCondition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap>;
/// Prepare to build the edges outgoing from `from_node`.
/// Prepare to build the edges outgoing from `source_node`.
///
/// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node),
/// This call is followed by zero, one or more calls to [`build_step_visit_destination_node`](RankingRuleGraphTrait::build_step_visit_destination_node),
/// which builds the actual edges.
fn build_visit_from_node<'search>(
fn build_step_visit_source_node<'search>(
ctx: &mut SearchContext<'search>,
from_node: &QueryNode,
source_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>>;
/// Return the cost and details of the edges going from the previously visited node
/// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`.
fn build_visit_to_node<'from_data, 'search: 'from_data>(
/// Return the cost and condition of the edges going from the previously visited node
/// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`.
fn build_step_visit_destination_node<'from_data, 'search: 'from_data>(
ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>;
dest_node: &QueryNode,
source_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>>;
fn log_state(
graph: &RankingRuleGraph<Self>,
@ -126,45 +153,32 @@ pub trait RankingRuleGraphTrait: Sized {
);
}
/// The graph used by graph-based ranking rules.
///
/// It is built on top of a [`QueryGraph`], keeping the same nodes
/// but replacing the edges.
pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
pub query_graph: QueryGraph,
// pub edges: Vec<HashMap<usize, Vec<Edge<G::EdgeDetails>>>>,
pub all_edges: Vec<Option<Edge<G::EdgeDetails>>>,
pub node_edges: Vec<SmallBitmap>,
pub successors: Vec<SmallBitmap>,
// TODO: to get the edges between two nodes:
// 1. get node_outgoing_edges[from]
// 2. get node_incoming_edges[to]
// 3. take the intersection between the two
pub edges_store: Vec<Option<Edge<G::EdgeCondition>>>,
pub edges_of_node: Vec<SmallBitmap>,
}
impl<G: RankingRuleGraphTrait> Clone for RankingRuleGraph<G> {
fn clone(&self) -> Self {
Self {
query_graph: self.query_graph.clone(),
all_edges: self.all_edges.clone(),
node_edges: self.node_edges.clone(),
successors: self.successors.clone(),
edges_store: self.edges_store.clone(),
edges_of_node: self.edges_of_node.clone(),
}
}
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
pub fn remove_edge(&mut self, edge_index: u16) {
let edge_opt = &mut self.all_edges[edge_index as usize];
/// Remove the given edge from the ranking rule graph
pub fn remove_ranking_rule_edge(&mut self, edge_index: u16) {
let edge_opt = &mut self.edges_store[edge_index as usize];
let Some(edge) = &edge_opt else { return };
let (from_node, _to_node) = (edge.from_node, edge.to_node);
let (source_node, _dest_node) = (edge.source_node, edge.dest_node);
*edge_opt = None;
let from_node_edges = &mut self.node_edges[from_node as usize];
from_node_edges.remove(edge_index);
let mut new_successors_from_node = SmallBitmap::new(self.all_edges.len() as u16);
let all_edges = &self.all_edges;
for from_node_edge in from_node_edges.iter() {
let Edge { to_node, .. } = &all_edges[from_node_edge as usize].as_ref().unwrap();
new_successors_from_node.insert(*to_node);
}
self.successors[from_node as usize] = new_successors_from_node;
self.edges_of_node[source_node as usize].remove(edge_index);
}
}

View File

@ -1,117 +1,32 @@
use super::cheapest_paths::Path;
use crate::search::new::small_bitmap::SmallBitmap;
// What is PathsMap used for?
// What is PathSet used for?
// For the empty_prefixes field in the EmptyPathsCache only :/
// but it could be used for more, like efficient computing of a set of paths
#[derive(Debug, Clone)]
pub struct PathsMap<V> {
pub nodes: Vec<(u16, PathsMap<V>)>,
pub value: Option<V>,
/// A set of [`Path`]
#[derive(Default, Debug, Clone)]
pub struct PathSet {
nodes: Vec<(u16, PathSet)>,
is_end: bool,
}
impl<V> Default for PathsMap<V> {
fn default() -> Self {
Self { nodes: vec![], value: None }
}
}
impl PathsMap<u64> {
pub fn from_paths(paths: &[Path]) -> Self {
let mut result = Self::default();
for p in paths {
result.add_path(p);
}
result
}
pub fn add_path(&mut self, path: &Path) {
self.insert(path.edges.iter().copied(), path.cost);
}
}
impl<V> PathsMap<V> {
pub fn is_empty(&self) -> bool {
self.nodes.is_empty() && self.value.is_none()
}
pub fn insert(&mut self, mut edges: impl Iterator<Item = u16>, value: V) {
impl PathSet {
pub fn insert(&mut self, mut edges: impl Iterator<Item = u16>) {
match edges.next() {
None => {
self.value = Some(value);
self.is_end = true;
}
Some(first_edge) => {
// Descend into the existing child node for this edge, if there is one
for (edge, next_node) in &mut self.nodes {
if edge == &first_edge {
return next_node.insert(edges, value);
return next_node.insert(edges);
}
}
let mut rest = PathsMap::default();
rest.insert(edges, value);
let mut rest = PathSet::default();
rest.insert(edges);
self.nodes.push((first_edge, rest));
}
}
}
fn remove_first_rec(&mut self, cur: &mut Vec<u16>) -> (bool, V) {
let Some((first_edge, rest)) = self.nodes.first_mut() else {
// The PathsMap has to be correct by construction here, otherwise
// the unwrap() will crash
return (true, self.value.take().unwrap())
};
cur.push(*first_edge);
let (rest_is_empty, value) = rest.remove_first_rec(cur);
if rest_is_empty {
self.nodes.remove(0);
(self.nodes.is_empty(), value)
} else {
(false, value)
}
}
pub fn remove_first(&mut self) -> Option<(Vec<u16>, V)> {
if self.is_empty() {
return None;
}
let mut result = vec![];
let (_, value) = self.remove_first_rec(&mut result);
Some((result, value))
}
pub fn iterate_rec(&self, cur: &mut Vec<u16>, visit: &mut impl FnMut(&Vec<u16>, &V)) {
if let Some(value) = &self.value {
visit(cur, value);
}
for (first_edge, rest) in self.nodes.iter() {
cur.push(*first_edge);
rest.iterate_rec(cur, visit);
cur.pop();
}
}
pub fn iterate(&self, mut visit: impl FnMut(&Vec<u16>, &V)) {
self.iterate_rec(&mut vec![], &mut visit)
}
pub fn remove_prefixes<U>(&mut self, prefixes: &PathsMap<U>) {
prefixes.iterate(|prefix, _v| {
self.remove_prefix(prefix);
});
}
pub fn remove_edges(&mut self, forbidden_edges: &SmallBitmap) {
let mut i = 0;
while i < self.nodes.len() {
let should_remove = if forbidden_edges.contains(self.nodes[i].0) {
true
} else if !self.nodes[i].1.nodes.is_empty() {
self.nodes[i].1.remove_edges(forbidden_edges);
self.nodes[i].1.nodes.is_empty()
} else {
false
};
if should_remove {
self.nodes.remove(i);
} else {
i += 1;
}
}
}
pub fn remove_edge(&mut self, forbidden_edge: &u16) {
let mut i = 0;
while i < self.nodes.len() {
@ -130,34 +45,11 @@ impl<V> PathsMap<V> {
}
}
}
/// Removes every stored path that starts with `forbidden_prefix`.
///
/// An empty prefix matches everything below this node, so the node is
/// cleared entirely (children and value). Otherwise only the children
/// registered under the first edge of the prefix are visited, and such a
/// child is dropped when it ends up with neither children nor a value.
pub fn remove_prefix(&mut self, forbidden_prefix: &[u16]) {
    let [first_edge, remaining_prefix @ ..] = forbidden_prefix else {
        self.nodes.clear();
        self.value = None;
        return;
    };
    // `retain_mut` filters in a single pass instead of shifting the tail of
    // the vector on every individual removal.
    self.nodes.retain_mut(|(edge, child)| {
        if *edge != *first_edge {
            return true;
        }
        child.remove_prefix(remaining_prefix);
        // A child that still holds a value is the end of a path shorter than
        // the forbidden prefix; that path does not start with the prefix and
        // must be kept.
        !child.nodes.is_empty() || child.value.is_some()
    });
}
pub fn final_edges_after_prefix(&self, prefix: &[u16], visit: &mut impl FnMut(u16)) {
let [first_edge, remaining_prefix @ ..] = prefix else {
for node in self.nodes.iter() {
if node.1.value.is_some() {
if node.1.is_end {
visit(node.0)
}
}
@ -170,20 +62,8 @@ impl<V> PathsMap<V> {
}
}
/// Returns the edges that directly follow `prefix` in the stored paths.
///
/// The prefix is followed one edge at a time from this node; once it is
/// exhausted, the edges of all children of the reached node are returned.
/// An empty vector is returned when the prefix cannot be followed.
pub fn edge_indices_after_prefix(&self, prefix: &[u16]) -> Vec<u16> {
    match prefix.split_first() {
        None => self.nodes.iter().map(|(edge, _)| *edge).collect(),
        Some((first_edge, remaining_prefix)) => self
            .nodes
            .iter()
            .find(|(edge, _)| edge == first_edge)
            .map(|(_, child)| child.edge_indices_after_prefix(remaining_prefix))
            .unwrap_or_default(),
    }
}
pub fn contains_prefix_of_path(&self, path: &[u16]) -> bool {
if self.value.is_some() {
if self.is_end {
return true;
}
match path {

View File

@ -5,7 +5,7 @@ use itertools::Itertools;
use super::ProximityEdge;
use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::search::new::ranking_rule_graph::proximity::WordPair;
use crate::search::new::ranking_rule_graph::EdgeDetails;
use crate::search::new::ranking_rule_graph::EdgeCondition;
use crate::search::new::{QueryNode, SearchContext};
use crate::Result;
@ -57,10 +57,10 @@ pub fn visit_to_node<'search, 'from_data>(
ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
from_node_data: &'from_data (WordDerivations, i8),
) -> Result<Vec<(u8, EdgeDetails<ProximityEdge>)>> {
) -> Result<Vec<(u8, EdgeCondition<ProximityEdge>)>> {
let (derivations1, pos1) = from_node_data;
let term2 = match &to_node {
QueryNode::End => return Ok(vec![(0, EdgeDetails::Unconditional)]),
QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]),
QueryNode::Deleted | QueryNode::Start => return Ok(vec![]),
QueryNode::Term(term) => term,
};
@ -96,7 +96,7 @@ pub fn visit_to_node<'search, 'from_data>(
// We want to effectively ignore this pair of terms
// Unconditionally walk through the edge without computing the docids
// But also what should the cost be?
return Ok(vec![(0, EdgeDetails::Unconditional)]);
return Ok(vec![(0, EdgeCondition::Unconditional)]);
}
let updb1 = derivations1.use_prefix_db;
@ -189,7 +189,7 @@ pub fn visit_to_node<'search, 'from_data>(
for (proximity, word_pairs) in proximity_word_pairs {
edges.push((
cost,
EdgeDetails::Data(ProximityEdge {
EdgeCondition::Conditional(ProximityEdge {
pairs: word_pairs.into_boxed_slice(),
proximity,
}),
@ -198,6 +198,6 @@ pub fn visit_to_node<'search, 'from_data>(
edges
})
.collect::<Vec<_>>();
new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional));
new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeCondition::Unconditional));
Ok(new_edges)
}

View File

@ -4,7 +4,7 @@ pub mod compute_docids;
use roaring::RoaringBitmap;
use super::empty_paths_cache::EmptyPathsCache;
use super::{EdgeDetails, RankingRuleGraphTrait};
use super::{EdgeCondition, RankingRuleGraphTrait};
use crate::search::new::interner::Interned;
use crate::search::new::logger::SearchLogger;
use crate::search::new::query_term::WordDerivations;
@ -30,34 +30,34 @@ pub struct ProximityEdge {
pub enum ProximityGraph {}
impl RankingRuleGraphTrait for ProximityGraph {
type EdgeDetails = ProximityEdge;
type EdgeCondition = ProximityEdge;
type BuildVisitedFromNode = (WordDerivations, i8);
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String {
let ProximityEdge { pairs, proximity } = edge;
format!(", prox {proximity}, {} pairs", pairs.len())
}
fn compute_docids<'search>(
fn resolve_edge_condition<'search>(
ctx: &mut SearchContext<'search>,
edge: &Self::EdgeDetails,
edge: &Self::EdgeCondition,
universe: &RoaringBitmap,
) -> Result<roaring::RoaringBitmap> {
compute_docids::compute_docids(ctx, edge, universe)
}
fn build_visit_from_node<'search>(
fn build_step_visit_source_node<'search>(
ctx: &mut SearchContext<'search>,
from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>> {
build::visit_from_node(ctx, from_node)
}
fn build_visit_to_node<'from_data, 'search: 'from_data>(
fn build_step_visit_destination_node<'from_data, 'search: 'from_data>(
ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>> {
build::visit_to_node(ctx, to_node, from_node_data)
}

View File

@ -2,7 +2,7 @@ use heed::BytesDecode;
use roaring::RoaringBitmap;
use super::empty_paths_cache::EmptyPathsCache;
use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::Interned;
use crate::search::new::logger::SearchLogger;
use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
@ -20,19 +20,19 @@ pub enum TypoEdge {
pub enum TypoGraph {}
impl RankingRuleGraphTrait for TypoGraph {
type EdgeDetails = TypoEdge;
type EdgeCondition = TypoEdge;
type BuildVisitedFromNode = ();
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String {
match edge {
TypoEdge::Phrase { .. } => ", 0 typos".to_owned(),
TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"),
}
}
fn compute_docids<'db_cache, 'search>(
fn resolve_edge_condition<'db_cache, 'search>(
ctx: &mut SearchContext<'search>,
edge: &Self::EdgeDetails,
edge: &Self::EdgeCondition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
match edge {
@ -66,29 +66,29 @@ impl RankingRuleGraphTrait for TypoGraph {
}
}
fn build_visit_from_node<'search>(
fn build_step_visit_source_node<'search>(
_ctx: &mut SearchContext<'search>,
_from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>> {
Ok(Some(()))
}
fn build_visit_to_node<'from_data, 'search: 'from_data>(
fn build_step_visit_destination_node<'from_data, 'search: 'from_data>(
_ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
_from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>> {
match to_node {
QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
&QueryTerm::Phrase { phrase } => {
Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase }))])
Ok(vec![(0, EdgeCondition::Conditional(TypoEdge::Phrase { phrase }))])
}
QueryTerm::Word { derivations } => {
let mut edges = vec![];
if !derivations.zero_typo.is_empty() || derivations.use_prefix_db {
edges.push((
0,
EdgeDetails::Data(TypoEdge::Word {
EdgeCondition::Conditional(TypoEdge::Word {
derivations: derivations.clone(),
nbr_typos: 0,
}),
@ -97,7 +97,7 @@ impl RankingRuleGraphTrait for TypoGraph {
if !derivations.one_typo.is_empty() {
edges.push((
1,
EdgeDetails::Data(TypoEdge::Word {
EdgeCondition::Conditional(TypoEdge::Word {
derivations: derivations.clone(),
nbr_typos: 1,
}),
@ -106,7 +106,7 @@ impl RankingRuleGraphTrait for TypoGraph {
if !derivations.two_typos.is_empty() {
edges.push((
2,
EdgeDetails::Data(TypoEdge::Word {
EdgeCondition::Conditional(TypoEdge::Word {
derivations: derivations.clone(),
nbr_typos: 2,
}),
@ -115,7 +115,7 @@ impl RankingRuleGraphTrait for TypoGraph {
Ok(edges)
}
},
QueryNode::End => Ok(vec![(0, EdgeDetails::Unconditional)]),
QueryNode::End => Ok(vec![(0, EdgeCondition::Unconditional)]),
QueryNode::Deleted | QueryNode::Start => panic!(),
}
}