3835: Add more documentation to graph-based ranking rule algorithms + comment cleanup r=Kerollmops a=loiclec

In addition to documenting the `cheapest_paths.rs` file, this PR cleans up a few outdated comments as well as some TODOs. These TODOs have been moved to https://github.com/meilisearch/meilisearch/issues/3776



Co-authored-by: Loïc Lecrenier <loic.lecrenier@icloud.com>
meili-bors[bot] 2023-06-15 15:30:24 +00:00 committed by GitHub
commit cb9d78fc7f
19 changed files with 117 additions and 93 deletions

View File

@@ -26,7 +26,6 @@ pub fn apply_distinct_rule(
     ctx: &mut SearchContext,
     field_id: u16,
     candidates: &RoaringBitmap,
-    // TODO: add a universe here, such that the `excluded` are a subset of the universe?
 ) -> Result<DistinctOutput> {
     let mut excluded = RoaringBitmap::new();
     let mut remaining = RoaringBitmap::new();
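For context, here is a minimal sketch of what a distinct rule like this does with its two bitmaps. It is not meilisearch's actual implementation: `facet_value_of` is a hypothetical stand-in for the index lookup done through `SearchContext` and `field_id`, and the exact `DistinctOutput` semantics may differ.

```rust
use std::collections::HashSet;

use roaring::RoaringBitmap;

/// Keep the first candidate seen for each distinct facet value; exclude the rest.
fn apply_distinct_rule_sketch(
    candidates: &RoaringBitmap,
    facet_value_of: impl Fn(u32) -> Option<String>, // hypothetical index lookup
) -> (RoaringBitmap, RoaringBitmap) {
    let mut excluded = RoaringBitmap::new();
    let mut remaining = RoaringBitmap::new();
    let mut seen = HashSet::new();
    for docid in candidates {
        match facet_value_of(docid) {
            Some(value) => {
                // The first document seen for a value is kept, the others excluded.
                if seen.insert(value) {
                    remaining.insert(docid);
                } else {
                    excluded.insert(docid);
                }
            }
            // Documents without a value for the distinct field are all kept.
            None => {
                remaining.insert(docid);
            }
        }
    }
    (excluded, remaining)
}
```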

View File

@@ -206,7 +206,7 @@ impl State {
             )?;
             intersection &= &candidates;
             if !intersection.is_empty() {
-                // TODO: although not really worth it in terms of performance,
-                // if would be good to put this in cache for the sake of consistency
+                // Although not really worth it in terms of performance,
+                // it would be good to put this in cache for the sake of consistency
                 let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
                     ctx.index

View File

@@ -32,7 +32,7 @@ impl<T> Interned<T> {
 #[derive(Clone)]
 pub struct DedupInterner<T> {
     stable_store: Vec<T>,
-    lookup: FxHashMap<T, Interned<T>>, // TODO: Arc
+    lookup: FxHashMap<T, Interned<T>>,
 }
 impl<T> Default for DedupInterner<T> {
     fn default() -> Self {
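The `DedupInterner` touched here is easier to read with the interning idea spelled out. A self-contained sketch, using `HashMap` in place of `FxHashMap` and a plain `u16` index as the `Interned<T>` handle (the real types carry more machinery):

```rust
use std::collections::HashMap;
use std::hash::Hash;
use std::marker::PhantomData;

/// An interned value is just a small index into the interner's stable store.
struct Interned<T> {
    idx: u16,
    _phantom: PhantomData<T>,
}

struct DedupInterner<T> {
    stable_store: Vec<T>,
    lookup: HashMap<T, u16>,
}

impl<T: Clone + Eq + Hash> DedupInterner<T> {
    fn new() -> Self {
        DedupInterner { stable_store: Vec::new(), lookup: HashMap::new() }
    }

    /// Inserting the same value twice returns the same index, so comparing
    /// two `Interned<T>` handles is a cheap `u16` comparison.
    fn insert(&mut self, value: T) -> Interned<T> {
        let idx = match self.lookup.get(&value) {
            Some(&idx) => idx,
            None => {
                let idx = self.stable_store.len() as u16;
                self.stable_store.push(value.clone());
                self.lookup.insert(value, idx);
                idx
            }
        };
        Interned { idx, _phantom: PhantomData }
    }

    fn get(&self, interned: Interned<T>) -> &T {
        &self.stable_store[interned.idx as usize]
    }
}
```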

View File

@@ -1,5 +1,4 @@
 /// Maximum number of tokens we consider in a single search.
-// TODO: Loic, find proper value here so we don't overflow the interner.
 pub const MAX_TOKEN_COUNT: usize = 1_000;
 
 /// Maximum number of prefixes that can be derived from a single word.

View File

@@ -92,7 +92,7 @@ impl QueryGraph {
     /// which contains ngrams.
     pub fn from_query(
         ctx: &mut SearchContext,
-        // NOTE: the terms here must be consecutive
+        // The terms here must be consecutive
         terms: &[LocatedQueryTerm],
     ) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
         let mut new_located_query_terms = terms.to_vec();
@@ -103,7 +103,7 @@ impl QueryGraph {
         let root_node = 0;
         let end_node = 1;
-        // TODO: we could consider generalizing to 4,5,6,7,etc. ngrams
+        // We could consider generalizing to 4,5,6,7,etc. ngrams
         let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
             (vec![], vec![], vec![root_node]);
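To illustrate why only `prev2`/`prev1`/`prev0` are tracked: the graph only ever forms 2- and 3-grams from consecutive terms, so it only needs the node sets at distances 0, 1, and 2. A toy enumeration (not the real graph construction) showing how the window would simply widen for larger ngrams:

```rust
/// Enumerate the 1-, 2- and 3-grams of consecutive query terms, joined
/// without a separator ("sun" + "flower" -> "sunflower"). Generalizing to
/// 4-, 5-, 6-grams means widening the window, at the cost of many more nodes.
fn ngrams_up_to_3(terms: &[&str]) -> Vec<String> {
    let mut out = Vec::new();
    for i in 0..terms.len() {
        for len in 1..=3usize.min(terms.len() - i) {
            out.push(terms[i..i + len].join(""));
        }
    }
    out
}

fn main() {
    // ["sun", "sunflower", "sunflowerwilting", "flower", "flowerwilting", "wilting"]
    println!("{:?}", ngrams_up_to_3(&["sun", "flower", "wilting"]));
}
```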

View File

@@ -132,7 +132,6 @@ impl QueryTermSubset {
         if full_query_term.ngram_words.is_some() {
             return None;
         }
-        // TODO: included in subset
         if let Some(phrase) = full_query_term.zero_typo.phrase {
             self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
         } else if let Some(word) = full_query_term.zero_typo.exact {
@@ -182,7 +181,6 @@ impl QueryTermSubset {
         let word = match &self.zero_typo_subset {
             NTypoTermSubset::All => Some(use_prefix_db),
             NTypoTermSubset::Subset { words, phrases: _ } => {
-                // TODO: use a subset of prefix words instead
                 if words.contains(&use_prefix_db) {
                     Some(use_prefix_db)
                 } else {
@@ -204,7 +202,6 @@ impl QueryTermSubset {
         ctx: &mut SearchContext,
     ) -> Result<BTreeSet<Word>> {
         let mut result = BTreeSet::default();
-        // TODO: a compute_partially funtion
         if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
             self.original.compute_fully_if_needed(ctx)?;
         }
@@ -300,7 +297,6 @@ impl QueryTermSubset {
         let mut result = BTreeSet::default();
         if !self.one_typo_subset.is_empty() {
-            // TODO: compute less than fully if possible
             self.original.compute_fully_if_needed(ctx)?;
         }
         let original = ctx.term_interner.get_mut(self.original);

View File

@@ -139,7 +139,6 @@ pub fn number_of_typos_allowed<'ctx>(
     let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
     let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
 
-    // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
     let exact_words = ctx.index.exact_words(ctx.txn)?;
 
     Ok(Box::new(move |word: &str| {
@@ -250,8 +249,6 @@ impl PhraseBuilder {
         } else {
             // token has kind Word
             let word = ctx.word_interner.insert(token.lemma().to_string());
-            // TODO: in a phrase, check that every word exists
-            // otherwise return an empty term
             self.words.push(Some(word));
         }
     }
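The closure returned by `number_of_typos_allowed` is built from the two length thresholds and the `exact_words` set read above. A sketch of the rule's shape — the comparison directions are an assumption based on the default settings, and a `HashSet` stands in for the real `fst::Set`:

```rust
use std::collections::HashSet;

/// Words shorter than `min_len_one_typo` tolerate no typo, words shorter than
/// `min_len_two_typos` tolerate one, longer words tolerate two. Words listed
/// in `exact_words` never tolerate typos.
fn typos_allowed_sketch(
    word: &str,
    min_len_one_typo: u8,
    min_len_two_typos: u8,
    exact_words: &HashSet<String>, // stand-in for the fst::Set in the index
) -> u8 {
    if exact_words.contains(word) {
        return 0;
    }
    let len = word.chars().count() as u8;
    if len < min_len_one_typo {
        0
    } else if len < min_len_two_typos {
        1
    } else {
        2
    }
}
```

With the default thresholds (5 and 9), "cat" allows no typo, "house" allows one, and "lackadaisical" allows two.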

View File

@@ -1,5 +1,48 @@
-#![allow(clippy::too_many_arguments)]
+/** Implements a "PathVisitor" which finds all paths of a certain cost
+from the START to END node of a ranking rule graph.
+
+A path is a list of conditions. A condition is the data associated with
+an edge, given by the ranking rule. Some edges don't have a condition associated
+with them, they are "unconditional". These kinds of edges are used to "skip" a node.
+
+The algorithm uses a depth-first search. It benefits from two main optimisations:
+- The list of all possible costs to go from any node to the END node is precomputed
+- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
+  untraversable depending on what other edges were selected.
+
+These two optimisations are meant to avoid traversing edges that wouldn't lead
+to a valid path. In practically all cases, we avoid the exponential complexity
+that is inherent to depth-first search in a large ranking rule graph.
+
+The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
+conditions to a list of traversed conditions.
+For example, the DeadEndsCache could say the following:
+- Immediately, from the start, the conditions `[a,b]` are forbidden
+    - if we take the condition `c`, then the conditions `[e]` are also forbidden
+        - and if after that, we take `f`, then `[h,i]` are also forbidden
+            - etc.
+    - if we take `g`, then `[f]` is also forbidden
+        - etc.
+- etc.
+As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
+conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
+
+When a path is found from START to END, we give it to the `visit` closure.
+This closure takes a mutable reference to the `DeadEndsCache`. This means that
+the caller can update this cache. Therefore, we must handle the case where the
+DeadEndsCache has been updated. This means potentially backtracking up to the point
+where the traversed conditions are all allowed by the new DeadEndsCache.
+
+The algorithm also implements the `TermsMatchingStrategy` logic.
+Some edges are augmented with a list of "nodes_to_skip". Skipping
+a node means "reaching this node through an unconditional edge". If we have
+already traversed (ie. not skipped) a node that is in this list, then we know that we
+can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
+future node that was present in the "nodes_to_skip" list.
+
+The caller can decide to stop the path finding algorithm
+by returning a `ControlFlow::Break` from the `visit` closure.
+*/
 use std::collections::{BTreeSet, VecDeque};
 use std::iter::FromIterator;
 use std::ops::ControlFlow;
@@ -12,30 +55,41 @@ use crate::search::new::query_graph::QueryNode;
 use crate::search::new::small_bitmap::SmallBitmap;
 use crate::Result;
 
+/// Closure which processes a path found by the `PathVisitor`
 type VisitFn<'f, G> = &'f mut dyn FnMut(
+    // the path as a list of conditions
     &[Interned<<G as RankingRuleGraphTrait>::Condition>],
     &mut RankingRuleGraph<G>,
+    // a mutable reference to the DeadEndsCache, to update it in case the given
+    // path doesn't resolve to any valid document ids
     &mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
 ) -> Result<ControlFlow<()>>;
 
+/// A structure which is kept but not updated during the traversal of the graph.
+/// It can however be updated by the `visit` closure once a valid path has been found.
 struct VisitorContext<'a, G: RankingRuleGraphTrait> {
     graph: &'a mut RankingRuleGraph<G>,
     all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
     dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
 }
 
+/// The internal state of the traversal algorithm
 struct VisitorState<G: RankingRuleGraphTrait> {
+    /// Budget from the current node to the end node
     remaining_cost: u64,
+    /// Previously visited conditions, in order.
    path: Vec<Interned<G::Condition>>,
+    /// Previously visited conditions, as an efficient and compact set.
     visited_conditions: SmallBitmap<G::Condition>,
+    /// Previously visited (ie not skipped) nodes, as an efficient and compact set.
     visited_nodes: SmallBitmap<QueryNode>,
+    /// The conditions that cannot be visited anymore
     forbidden_conditions: SmallBitmap<G::Condition>,
-    forbidden_conditions_to_nodes: SmallBitmap<QueryNode>,
+    /// The nodes that cannot be visited anymore (they must be skipped)
+    nodes_to_skip: SmallBitmap<QueryNode>,
 }
 
+/// See module documentation
 pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
     state: VisitorState<G>,
     ctx: VisitorContext<'a, G>,
@@ -56,14 +110,13 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
                 forbidden_conditions: SmallBitmap::for_interned_values_in(
                     &graph.conditions_interner,
                 ),
-                forbidden_conditions_to_nodes: SmallBitmap::for_interned_values_in(
-                    &graph.query_graph.nodes,
-                ),
+                nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
             },
             ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
         }
     }
 
+    /// See module documentation
     pub fn visit_paths(mut self, visit: VisitFn<G>) -> Result<()> {
         let _ =
             self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
@@ -72,22 +125,31 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
 }
 
 impl<G: RankingRuleGraphTrait> VisitorState<G> {
+    /// Visits a node: traverse all its valid conditional and unconditional edges.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
     fn visit_node(
         &mut self,
         from_node: Interned<QueryNode>,
         visit: VisitFn<G>,
         ctx: &mut VisitorContext<G>,
     ) -> Result<ControlFlow<(), bool>> {
+        // any valid path will be found from this point
+        // if a valid path was found, then we know that the DeadEndsCache may have been updated,
+        // and we will need to do more work to potentially backtrack
        let mut any_valid = false;
 
         let edges = ctx.graph.edges_of_node.get(from_node).clone();
         for edge_idx in edges.iter() {
+            // could be none if the edge was deleted
             let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };
 
             if self.remaining_cost < edge.cost as u64 {
                 continue;
             }
             self.remaining_cost -= edge.cost as u64;
 
             let cf = match edge.condition {
                 Some(condition) => self.visit_condition(
                     condition,
@@ -119,6 +181,10 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
         Ok(ControlFlow::Continue(any_valid))
     }
 
+    /// Visits an unconditional edge.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
     fn visit_no_condition(
         &mut self,
         dest_node: Interned<QueryNode>,
@@ -134,20 +200,29 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
         {
             return Ok(ControlFlow::Continue(false));
         }
+        // We've reached the END node!
         if dest_node == ctx.graph.query_graph.end_node {
             let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
+            // We could change the return type of the visit closure such that the caller
+            // tells us whether the dead ends cache was updated or not.
+            // Alternatively, maybe the DeadEndsCache should have a generation number
+            // to it, so that we don't need to play with these booleans at all.
             match control_flow {
                 ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
                 ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
             }
         } else {
-            let old_fbct = self.forbidden_conditions_to_nodes.clone();
-            self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip);
+            let old_fbct = self.nodes_to_skip.clone();
+            self.nodes_to_skip.union(edge_new_nodes_to_skip);
             let cf = self.visit_node(dest_node, visit, ctx)?;
-            self.forbidden_conditions_to_nodes = old_fbct;
+            self.nodes_to_skip = old_fbct;
             Ok(cf)
         }
     }
+
+    /// Visits a conditional edge.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
     fn visit_condition(
         &mut self,
         condition: Interned<G::Condition>,
@@ -159,7 +234,7 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
         assert!(dest_node != ctx.graph.query_graph.end_node);
 
         if self.forbidden_conditions.contains(condition)
-            || self.forbidden_conditions_to_nodes.contains(dest_node)
+            || self.nodes_to_skip.contains(dest_node)
             || edge_new_nodes_to_skip.intersects(&self.visited_nodes)
         {
             return Ok(ControlFlow::Continue(false));
@@ -180,19 +255,19 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
         self.visited_nodes.insert(dest_node);
         self.visited_conditions.insert(condition);
 
-        let old_fc = self.forbidden_conditions.clone();
+        let old_forb_cond = self.forbidden_conditions.clone();
         if let Some(next_forbidden) =
             ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
         {
             self.forbidden_conditions.union(&next_forbidden);
         }
-        let old_fctn = self.forbidden_conditions_to_nodes.clone();
-        self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip);
+        let old_nodes_to_skip = self.nodes_to_skip.clone();
+        self.nodes_to_skip.union(edge_new_nodes_to_skip);
         let cf = self.visit_node(dest_node, visit, ctx)?;
-        self.forbidden_conditions_to_nodes = old_fctn;
-        self.forbidden_conditions = old_fc;
+        self.nodes_to_skip = old_nodes_to_skip;
+        self.forbidden_conditions = old_forb_cond;
 
         self.visited_conditions.remove(condition);
         self.visited_nodes.remove(dest_node);
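Stripped of the caches and the skip logic, the core of `visit_node` is a depth-first search with a cost budget. A self-contained sketch over a plain adjacency list, assuming an acyclic graph (as ranking rule graphs are); everything here beyond the budget check and the backtracking is omitted from the real algorithm's pruning:

```rust
/// Enumerate every path from `from` to `end` whose edge costs sum exactly
/// to `remaining_cost`, handing each one to `visit`.
fn visit_paths_sketch(
    edges: &[Vec<(usize, u64)>], // edges[node] = (dest_node, cost) pairs
    from: usize,
    end: usize,
    remaining_cost: u64,
    path: &mut Vec<usize>,
    visit: &mut impl FnMut(&[usize]),
) {
    if from == end {
        if remaining_cost == 0 {
            visit(path); // found a path of exactly the requested cost
        }
        return;
    }
    for &(dest, cost) in &edges[from] {
        if cost > remaining_cost {
            continue; // this edge would blow the budget
        }
        path.push(dest);
        visit_paths_sketch(edges, dest, end, remaining_cost - cost, path, visit);
        path.pop(); // backtrack, as `visit_node` does by restoring its state
    }
}
```

In the real code the budget comes from `all_costs_from_node`, so a branch is only entered when some completion with the exact remaining cost is known to exist.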

View File

@@ -9,12 +9,8 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::SearchContext;
 use crate::Result;
 
-// TODO: give a generation to each universe, then be able to get the exact
-// delta of docids between two universes of different generations!
-
 /// A cache storing the document ids associated with each ranking rule edge
 pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
-    // TOOD: should be a mapped interner?
     pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
     _phantom: PhantomData<G>,
 }
@@ -54,7 +50,7 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
         }
         let condition = graph.conditions_interner.get_mut(interned_condition);
         let computed = G::resolve_condition(ctx, condition, universe)?;
-        // TODO: if computed.universe_len != universe.len() ?
+        // Can we put an assert here for computed.universe_len == universe.len() ?
         let _ = self.cache.insert(interned_condition, computed);
         let computed = &self.cache[&interned_condition];
         Ok(computed)

View File

@@ -2,6 +2,7 @@ use crate::search::new::interner::{FixedSizeInterner, Interned};
 use crate::search::new::small_bitmap::SmallBitmap;
 
 pub struct DeadEndsCache<T> {
+    // conditions and next could/should be part of the same vector
     conditions: Vec<Interned<T>>,
     next: Vec<Self>,
     pub forbidden: SmallBitmap<T>,
@@ -27,7 +28,7 @@ impl<T> DeadEndsCache<T> {
         self.forbidden.insert(condition);
     }
 
-    pub fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
+    fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
        if let Some(idx) = self.conditions.iter().position(|c| *c == condition) {
             Some(&mut self.next[idx])
         } else {
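A simplified model of this prefix tree, with plain `u16` condition ids and a `HashSet` in place of `SmallBitmap`. `forbidden_after_prefix` is a hypothetical analogue of the `forbidden_conditions_after_prefix` method used by the path visitor above, shown here as accumulating every forbidden set along the walk:

```rust
use std::collections::HashSet;

#[derive(Default)]
struct DeadEndsCacheSketch {
    conditions: Vec<u16>,              // condition labelling each child edge
    next: Vec<DeadEndsCacheSketch>,    // child subtree for each condition
    forbidden: HashSet<u16>,           // conditions forbidden at this prefix
}

impl DeadEndsCacheSketch {
    /// Follow the child edge labelled with `condition`, if it exists.
    fn advance(&self, condition: u16) -> Option<&Self> {
        let idx = self.conditions.iter().position(|&c| c == condition)?;
        Some(&self.next[idx])
    }

    /// Union of all the forbidden conditions seen while walking `prefix`,
    /// or `None` if the cache has nothing recorded for this prefix.
    fn forbidden_after_prefix(
        &self,
        prefix: impl IntoIterator<Item = u16>,
    ) -> Option<HashSet<u16>> {
        let mut forbidden = self.forbidden.clone();
        let mut node = self;
        for condition in prefix {
            node = node.advance(condition)?;
            forbidden.extend(node.forbidden.iter().copied());
        }
        Some(forbidden)
    }
}
```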

View File

@@ -69,14 +69,9 @@ impl RankingRuleGraphTrait for FidGraph {
         let mut edges = vec![];
 
         for fid in all_fields {
-            // TODO: We can improve performances and relevancy by storing
-            // the term subsets associated to each field ids fetched.
             edges.push((
-                fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
-                conditions_interner.insert(FidCondition {
-                    term: term.clone(), // TODO remove this ugly clone
-                    fid,
-                }),
+                fid as u32 * term.term_ids.len() as u32,
+                conditions_interner.insert(FidCondition { term: term.clone(), fid }),
             ));
         }

View File

@@ -94,14 +94,9 @@ impl RankingRuleGraphTrait for PositionGraph {
         let mut edges = vec![];
 
         for (cost, positions) in positions_for_costs {
-            // TODO: We can improve performances and relevancy by storing
-            // the term subsets associated to each position fetched
             edges.push((
                 cost,
-                conditions_interner.insert(PositionCondition {
-                    term: term.clone(), // TODO remove this ugly clone
-                    positions,
-                }),
+                conditions_interner.insert(PositionCondition { term: term.clone(), positions }),
             ));
         }

View File

@@ -65,13 +65,6 @@ pub fn compute_docids(
         }
     }
 
-    // TODO: add safeguard in case the cartesian product is too large!
-    // even if we restrict the word derivations to a maximum of 100, the size of the
-    // caterisan product could reach a maximum of 10_000 derivations, which is way too much.
-    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
-    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
-    // reached
-
     for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
         // Before computing the edges, check that the left word and left phrase
         // aren't disjoint with the universe, but only do it if there is more than
@@ -111,8 +104,6 @@ pub fn compute_docids(
     Ok(ComputedCondition {
         docids,
         universe_len: universe.len(),
-        // TODO: think about whether we want to reduce the subset,
-        // we probably should!
         start_term_subset: Some(left_term.clone()),
         end_term_subset: right_term.clone(),
     })
@@ -203,12 +194,7 @@ fn compute_non_prefix_edges(
                 *docids |= new_docids;
             }
         }
-        if backward_proximity >= 1
-            // TODO: for now, we don't do any swapping when either term is a phrase
-            // but maybe we should. We'd need to look at the first/last word of the phrase
-            // depending on the context.
-            && left_phrase.is_none() && right_phrase.is_none()
-        {
+        if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
             if let Some(new_docids) =
                 ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
             {

View File

@@ -33,8 +33,6 @@ pub fn compute_query_term_subset_docids(
     ctx: &mut SearchContext,
     term: &QueryTermSubset,
 ) -> Result<RoaringBitmap> {
-    // TODO Use the roaring::MultiOps trait
-
     let mut docids = RoaringBitmap::new();
     for word in term.all_single_words_except_prefix_db(ctx)? {
         if let Some(word_docids) = ctx.word_docids(word)? {
@@ -59,8 +57,6 @@ pub fn compute_query_term_subset_docids_within_field_id(
     term: &QueryTermSubset,
     fid: u16,
 ) -> Result<RoaringBitmap> {
-    // TODO Use the roaring::MultiOps trait
-
     let mut docids = RoaringBitmap::new();
     for word in term.all_single_words_except_prefix_db(ctx)? {
         if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? {
@@ -71,7 +67,6 @@ pub fn compute_query_term_subset_docids_within_field_id(
     for phrase in term.all_phrases(ctx)? {
         // There may be false positives when resolving a phrase, so we're not
         // guaranteed that all of its words are within a single fid.
-        // TODO: fix this?
         if let Some(word) = phrase.words(ctx).iter().flatten().next() {
             if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
                 docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
@@ -95,7 +90,6 @@ pub fn compute_query_term_subset_docids_within_position(
     term: &QueryTermSubset,
     position: u16,
 ) -> Result<RoaringBitmap> {
-    // TODO Use the roaring::MultiOps trait
     let mut docids = RoaringBitmap::new();
     for word in term.all_single_words_except_prefix_db(ctx)? {
         if let Some(word_position_docids) =
@@ -108,7 +102,6 @@ pub fn compute_query_term_subset_docids_within_position(
     for phrase in term.all_phrases(ctx)? {
         // It's difficult to know the expected position of the words in the phrase,
         // so instead we just check the first one.
-        // TODO: fix this?
         if let Some(word) = phrase.words(ctx).iter().flatten().next() {
             if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
                 docids |= ctx.get_phrase_docids(phrase)? & word_position_docids
@@ -132,9 +125,6 @@ pub fn compute_query_graph_docids(
     q: &QueryGraph,
     universe: &RoaringBitmap,
 ) -> Result<RoaringBitmap> {
-    // TODO: there must be a faster way to compute this big
-    // roaring bitmap expression
-
     let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes);
     let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new());
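The removed `TODO Use the roaring::MultiOps trait` comments refer to roaring-rs's `MultiOps` trait, which combines many bitmaps in one call instead of folding `|=` in a loop. A sketch of what the suggestion would look like in isolation (the real functions interleave the unions with fallible index lookups, which is why the loop form was kept):

```rust
use roaring::{MultiOps, RoaringBitmap};

fn main() {
    let bitmaps = vec![
        RoaringBitmap::from_iter([1u32, 2, 3]),
        RoaringBitmap::from_iter([3u32, 4]),
        RoaringBitmap::from_iter([6u32]),
    ];
    // One call instead of `for b in bitmaps { docids |= b }`.
    let docids = bitmaps.into_iter().union();
    assert_eq!(docids, RoaringBitmap::from_iter([1u32, 2, 3, 4, 6]));
}
```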

View File

@@ -141,10 +141,6 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<Query>>> {
         let iter = self.iter.as_mut().unwrap();
-        // TODO: we should make use of the universe in the function below
-        // good for correctness, but ideally iter.next_bucket would take the current universe into account,
-        // as right now it could return buckets that don't intersect with the universe, meaning we will make many
-        // unneeded calls.
         if let Some(mut bucket) = iter.next_bucket()? {
             bucket.candidates &= universe;
             Ok(Some(bucket))

View File

@@ -527,7 +527,7 @@ fn test_distinct_all_candidates() {
     let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap();
     let candidates = candidates.iter().collect::<Vec<_>>();
     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
-    // TODO: this is incorrect!
+    // This is incorrect, but unfortunately impossible to do better efficiently.
     insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]");
 }

View File

@@ -122,11 +122,11 @@ fn create_edge_cases_index() -> TempIndex {
             sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
             "
         },
         // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
         // If the search query is "sunflower", the split word "Sun Flower" will match some documents.
         // If the query is `sunflower wilting`, then we should make sure that
-        // the sprximity condition `flower wilting: sprx N` also comes with the condition
-        // `sun wilting: sprx N+1`. TODO: this is not the exact condition we use for now.
+        // the proximity condition `flower wilting: sprx N` also comes with the condition
+        // `sun wilting: sprx N+1`, but this is not the exact condition we use for now.
         // We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which
         // is better than nothing but not the best.
         {
@@ -139,7 +139,7 @@ fn create_edge_cases_index() -> TempIndex {
         },
         {
             "id": 3,
             // This document matches the query `sunflower wilting`, but the sprximity condition
             // between `sunflower` and `wilting` cannot be through the split-word `Sun Flower`
             // which would reduce to only `flower` and `wilting` being in sprximity.
             "text": "A flower wilting under the sun, unlike a sunflower"
@@ -299,7 +299,7 @@ fn test_proximity_split_word() {
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: "2" and "4" should be swapped ideally
+    // "2" and "4" should be swapped ideally
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
@@ -316,7 +316,7 @@ fn test_proximity_split_word() {
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: "2" and "4" should be swapped ideally
+    // "2" and "4" should be swapped ideally
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
@@ -341,7 +341,7 @@ fn test_proximity_split_word() {
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: "2" and "4" should be swapped ideally
+    // "2" and "4" should be swapped ideally
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",

View File

@@ -2,9 +2,8 @@
 This module tests the interactions between the proximity and typo ranking rules.
 
 The proximity ranking rule should transform the query graph such that it
-only contains the word pairs that it used to compute its bucket.
-
-TODO: This is not currently implemented.
+only contains the word pairs that it used to compute its bucket, but this is not currently
+implemented.
 */
 
 use crate::index::tests::TempIndex;
@@ -64,7 +63,7 @@ fn test_trap_basic() {
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: this is incorrect, 1 should come before 0
+    // This is incorrect, 1 should come before 0
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"summer. holiday. sommer holidty\"",

View File

@@ -571,8 +571,8 @@ fn test_typo_synonyms() {
     s.terms_matching_strategy(TermsMatchingStrategy::All);
     s.query("the fast brownish fox jumps over the lackadaisical dog");
 
-    // TODO: is this correct? interaction of ngrams + synonyms means that the
-    // multi-word synonyms end up having a typo cost. This is probably not what we want.
+    // The interaction of ngrams + synonyms means that the multi-word synonyms end up having a typo cost.
+    // This is probably not what we want.
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);