Rewrite proximity ranking rule

This commit is contained in:
Loïc Lecrenier 2023-03-30 11:59:06 +02:00
parent ae6bb1ce17
commit 01e24dd630
3 changed files with 181 additions and 221 deletions

View File

@ -2,44 +2,26 @@
use super::ProximityCondition;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::LocatedQueryTerm;
use crate::search::new::{QueryNode, SearchContext};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
use crate::Result;
pub fn build_edges(
_ctx: &mut SearchContext,
conditions_interner: &mut DedupInterner<ProximityCondition>,
from_node: &QueryNode,
to_node: &QueryNode,
) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
let right_term = match &to_node.data {
QueryNodeData::End => return Ok(vec![(0, None)]),
QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
QueryNodeData::Term(term) => term,
left_term: Option<&LocatedQueryTermSubset>,
right_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<ProximityCondition>)>> {
let right_ngram_length = right_term.term_ids.len();
let Some(left_term) = left_term else {
return Ok(vec![(
(right_ngram_length - 1) as u32,
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
)])
};
let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
let (right_start_position, right_ngram_length) =
(*right_positions.start(), right_positions.len());
let (left_term_interned, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => {
return Ok(vec![(
(right_ngram_length - 1) as u8,
Some(
conditions_interner
.insert(ProximityCondition::Term { term: *right_term_interned }),
),
)])
}
QueryNodeData::End => return Ok(vec![]),
};
if left_end_position + 1 != right_start_position {
if left_term.positions.end() + 1 != *right_term.positions.start() {
// We want to ignore this pair of terms
// Unconditionally walk through the edge without computing the docids
// This can happen when, in a query like `the sun flowers are beautiful`, the term
@ -47,30 +29,26 @@ pub fn build_edges(
// The remaining query graph represents `the sun .. are beautiful`
// but `sun` and `are` have no proximity condition between them
return Ok(vec![(
(right_ngram_length - 1) as u8,
Some(
conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
),
(right_ngram_length - 1) as u32,
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
)]);
}
let mut conditions = vec![];
for cost in right_ngram_length..(7 + right_ngram_length) {
let cost = cost as u8;
conditions.push((
cost,
Some(conditions_interner.insert(ProximityCondition::Uninit {
left_term: left_term_interned,
right_term: *right_term_interned,
right_term_ngram_len: right_ngram_length as u8,
cost,
})),
cost as u32,
conditions_interner.insert(ProximityCondition::Uninit {
left_term: left_term.clone(),
right_term: right_term.clone(),
cost: cost as u8,
}),
))
}
conditions.push((
(7 + right_ngram_length) as u8,
Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
(7 + right_ngram_length) as u32,
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
));
Ok(conditions)

View File

@ -1,49 +1,37 @@
#![allow(clippy::too_many_arguments)]
use std::iter::FromIterator;
use super::ProximityCondition;
use crate::search::new::db_cache::DatabaseCache;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::resolve_query_graph::QueryTermDocIdsCache;
use crate::search::new::interner::Interned;
use crate::search::new::query_term::{Phrase, QueryTermSubset};
use crate::search::new::ranking_rule_graph::ComputedCondition;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Index, Result};
use fxhash::FxHashSet;
use heed::RoTxn;
use crate::{CboRoaringBitmapCodec, Result};
use roaring::RoaringBitmap;
use std::collections::BTreeSet;
pub fn compute_docids(
ctx: &mut SearchContext,
condition: &ProximityCondition,
universe: &RoaringBitmap,
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
let (left_term, right_term, right_term_ngram_len, cost) = match condition {
ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
(*left_term, *right_term, *right_term_ngram_len, *cost)
) -> Result<ComputedCondition> {
let (left_term, right_term, cost) = match condition {
ProximityCondition::Uninit { left_term, right_term, cost } => {
(left_term, right_term, *cost)
}
ProximityCondition::Term { term } => {
let term_v = ctx.term_interner.get(*term);
return Ok((
ctx.term_docids
.get_query_term_docids(
ctx.index,
ctx.txn,
&mut ctx.db_cache,
&ctx.word_interner,
&ctx.term_interner,
&ctx.phrase_interner,
*term,
)?
.clone(),
FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
FxHashSet::from_iter(term_v.all_phrases()),
));
let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
docids &= universe;
return Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset: term.clone(),
});
}
};
let left_term = ctx.term_interner.get(left_term);
let right_term = ctx.term_interner.get(right_term);
let right_term_ngram_len = right_term.term_ids.len() as u8;
// e.g. for the simple words `sun .. flower`
// the cost is 5
@ -57,20 +45,13 @@ pub fn compute_docids(
let forward_proximity = 1 + cost - right_term_ngram_len;
let backward_proximity = cost - right_term_ngram_len;
let mut used_words = FxHashSet::default();
let mut used_phrases = FxHashSet::default();
let mut docids = RoaringBitmap::new();
if let Some(right_prefix) = right_term.use_prefix_db {
for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
if let Some(right_prefix) = right_term.term_subset.use_prefix_db(ctx) {
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)?
{
compute_prefix_edges(
ctx.index,
ctx.txn,
&mut ctx.db_cache,
&mut ctx.term_docids,
&ctx.word_interner,
&ctx.phrase_interner,
ctx,
left_word,
right_prefix,
left_phrase,
@ -78,8 +59,6 @@ pub fn compute_docids(
backward_proximity,
&mut docids,
universe,
&mut used_words,
&mut used_phrases,
)?;
}
}
@ -91,39 +70,60 @@ pub fn compute_docids(
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
// reached
for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
for (right_word, right_phrase) in first_word_of_term_iter(right_term, &ctx.phrase_interner)
{
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
// Before computing the edges, check that the left word and left phrase
// aren't disjoint with the universe, but only do it if there is more than
// one word derivation to the right.
//
// This is an optimisation to avoid checking for an excessive number of
// pairs.
// WAIT, NO.
// This should only be done once per node.
// Here, we'll potentially do it... 16 times?
// Maybe we should do it at edge-build time instead.
// Same for the future attribute ranking rule.
let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
if right_derivs.len() > 1 {
let universe = &universe;
if let Some(left_phrase) = left_phrase {
if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
continue;
}
} else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? {
let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?;
if universe.is_disjoint(&left_word_docids) {
continue;
}
}
}
for (right_word, right_phrase) in right_derivs {
compute_non_prefix_edges(
ctx.index,
ctx.txn,
&mut ctx.db_cache,
&mut ctx.term_docids,
&ctx.word_interner,
&ctx.phrase_interner,
ctx,
left_word,
right_word,
&[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
left_phrase,
right_phrase,
forward_proximity,
backward_proximity,
&mut docids,
universe,
&mut used_words,
&mut used_phrases,
)?;
}
}
Ok((docids, used_words, used_phrases))
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
// TODO: think about whether we want to reduce the subset,
// we probably should!
start_term_subset: Some(left_term.clone()),
end_term_subset: right_term.clone(),
})
}
fn compute_prefix_edges<'ctx>(
index: &Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
term_docids: &mut QueryTermDocIdsCache,
word_interner: &DedupInterner<String>,
phrase_interner: &DedupInterner<Phrase>,
fn compute_prefix_edges(
ctx: &mut SearchContext,
left_word: Interned<String>,
right_prefix: Interned<String>,
left_phrase: Option<Interned<Phrase>>,
@ -131,21 +131,16 @@ fn compute_prefix_edges<'ctx>(
backward_proximity: u8,
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
used_words: &mut FxHashSet<Interned<String>>,
used_phrases: &mut FxHashSet<Interned<Phrase>>,
) -> Result<()> {
let mut used_left_words = BTreeSet::new();
let mut used_left_phrases = BTreeSet::new();
let mut used_right_prefix = BTreeSet::new();
let mut universe = universe.clone();
if let Some(phrase) = left_phrase {
let phrase_docids = term_docids.get_phrase_docids(
index,
txn,
db_cache,
word_interner,
phrase_interner,
phrase,
)?;
let phrase_docids = ctx.get_phrase_docids(phrase)?;
if !phrase_docids.is_empty() {
used_phrases.insert(phrase);
used_left_phrases.insert(phrase);
}
universe &= phrase_docids;
if universe.is_empty() {
@ -153,36 +148,28 @@ fn compute_prefix_edges<'ctx>(
}
}
if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
index,
txn,
word_interner,
left_word,
right_prefix,
forward_proximity,
)? {
if let Some(new_docids) =
ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)?
{
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(left_word);
used_words.insert(right_prefix);
used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix);
*docids |= new_docids;
}
}
// No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none() {
if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
index,
txn,
word_interner,
if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
right_prefix,
left_word,
backward_proximity,
)? {
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(left_word);
used_words.insert(right_prefix);
used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix);
*docids |= new_docids;
}
}
@ -191,72 +178,59 @@ fn compute_prefix_edges<'ctx>(
Ok(())
}
fn compute_non_prefix_edges<'ctx>(
index: &Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
term_docids: &mut QueryTermDocIdsCache,
word_interner: &DedupInterner<String>,
phrase_interner: &DedupInterner<Phrase>,
fn compute_non_prefix_edges(
ctx: &mut SearchContext,
word1: Interned<String>,
word2: Interned<String>,
phrases: &[Interned<Phrase>],
left_phrase: Option<Interned<Phrase>>,
right_phrase: Option<Interned<Phrase>>,
forward_proximity: u8,
backward_proximity: u8,
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
used_words: &mut FxHashSet<Interned<String>>,
used_phrases: &mut FxHashSet<Interned<Phrase>>,
) -> Result<()> {
let mut used_left_phrases = BTreeSet::new();
let mut used_right_phrases = BTreeSet::new();
let mut used_left_words = BTreeSet::new();
let mut used_right_words = BTreeSet::new();
let mut universe = universe.clone();
for phrase in phrases {
let phrase_docids = term_docids.get_phrase_docids(
index,
txn,
db_cache,
word_interner,
phrase_interner,
*phrase,
)?;
if !phrase_docids.is_empty() {
used_phrases.insert(*phrase);
}
for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
let phrase_docids = ctx.get_phrase_docids(phrase)?;
universe &= phrase_docids;
if universe.is_empty() {
return Ok(());
}
}
if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word1,
word2,
forward_proximity,
)? {
if let Some(left_phrase) = left_phrase {
used_left_phrases.insert(left_phrase);
}
if let Some(right_phrase) = right_phrase {
used_right_phrases.insert(right_phrase);
}
if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
{
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(word1);
used_words.insert(word2);
used_left_words.insert(word1);
used_right_words.insert(word2);
*docids |= new_docids;
}
}
if backward_proximity >= 1
// no swapping when either term is a phrase
&& phrases.is_empty()
&& left_phrase.is_none() && right_phrase.is_none()
{
if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word2,
word1,
backward_proximity,
)? {
if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
{
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(word1);
used_words.insert(word2);
used_left_words.insert(word2);
used_right_words.insert(word1);
*docids |= new_docids;
}
}
@ -265,25 +239,41 @@ fn compute_non_prefix_edges<'ctx>(
Ok(())
}
fn last_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.last().unwrap().map(|last| (Some(p), last))
},
))
fn last_words_of_term_derivations(
ctx: &mut SearchContext,
t: &QueryTermSubset,
) -> Result<BTreeSet<(Option<Interned<Phrase>>, Interned<String>)>> {
let mut result = BTreeSet::new();
for w in t.all_single_words_except_prefix_db(ctx)? {
result.insert((None, w));
}
for p in t.all_phrases(ctx)? {
let phrase = ctx.phrase_interner.get(p);
let last_term_of_phrase = phrase.words.last().unwrap();
if let Some(last_word) = last_term_of_phrase {
result.insert((Some(p), *last_word));
}
}
Ok(result)
}
fn first_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.first().unwrap().map(|first| (first, Some(p)))
},
))
fn first_word_of_term_iter(
ctx: &mut SearchContext,
t: &QueryTermSubset,
) -> Result<BTreeSet<(Interned<String>, Option<Interned<Phrase>>)>> {
let mut result = BTreeSet::new();
let all_words = t.all_single_words_except_prefix_db(ctx)?;
for w in all_words {
result.insert((w, None));
}
for p in t.all_phrases(ctx)? {
let phrase = ctx.phrase_interner.get(p);
let first_term_of_phrase = phrase.words.first().unwrap();
if let Some(first_word) = first_term_of_phrase {
result.insert((*first_word, Some(p)));
}
}
Ok(result)
}

View File

@ -1,27 +1,19 @@
pub mod build;
pub mod compute_docids;
use fxhash::FxHashSet;
use roaring::RoaringBitmap;
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
use crate::search::new::logger::SearchLogger;
use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum ProximityCondition {
Uninit {
left_term: Interned<QueryTerm>,
right_term: Interned<QueryTerm>,
right_term_ngram_len: u8,
cost: u8,
},
Term {
term: Interned<QueryTerm>,
},
Uninit { left_term: LocatedQueryTermSubset, right_term: LocatedQueryTermSubset, cost: u8 },
Term { term: LocatedQueryTermSubset },
}
pub enum ProximityGraph {}
@ -33,18 +25,17 @@ impl RankingRuleGraphTrait for ProximityGraph {
ctx: &mut SearchContext,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
{
) -> Result<ComputedCondition> {
compute_docids::compute_docids(ctx, condition, universe)
}
fn build_edges(
ctx: &mut SearchContext,
conditions_interner: &mut DedupInterner<Self::Condition>,
source_node: &QueryNode,
dest_node: &QueryNode,
) -> Result<Vec<(u8, Option<Interned<Self::Condition>>)>> {
build::build_edges(ctx, conditions_interner, source_node, dest_node)
source_term: Option<&LocatedQueryTermSubset>,
dest_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
build::build_edges(ctx, conditions_interner, source_term, dest_term)
}
fn log_state(
@ -52,8 +43,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
paths: &[Vec<Interned<ProximityCondition>>],
dead_ends_cache: &DeadEndsCache<Self::Condition>,
universe: &RoaringBitmap,
distances: &MappedInterner<QueryNode, Vec<u16>>,
cost: u16,
distances: &MappedInterner<QueryNode, Vec<u64>>,
cost: u64,
logger: &mut dyn SearchLogger<QueryGraph>,
) {
logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost);
@ -66,8 +57,9 @@ impl RankingRuleGraphTrait for ProximityGraph {
Ok(format!("{cost}: cost"))
}
ProximityCondition::Term { term } => {
let term = ctx.term_interner.get(*term);
Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
let original_term = ctx.term_interner.get(term.term_subset.original);
let original_word = ctx.word_interner.get(original_term.original);
Ok(format!("{original_word} : exists"))
}
}
}