Fix two bugs in proximity ranking rule

Loïc Lecrenier 2023-03-21 11:43:25 +01:00
parent 83e5b4ed0d
commit 384fdc2df4
2 changed files with 76 additions and 53 deletions


@@ -213,8 +213,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
                     dead_ends_cache.forbid_condition(latest_condition);
                     // 2. remove all the edges with this condition from the ranking rule graph
                     graph.remove_edges_with_condition(latest_condition);
-                    // 3. Also remove the entry from the condition_docids_cache, since we don't need it anymore
-                    condition_docids_cache.cache.remove(&latest_condition);
                     return Ok(ControlFlow::Continue(()));
                 }
                 path_docids &= condition_docids;
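For reference, the `ControlFlow::Continue(())` returned above is `std::ops::ControlFlow`: returning `Continue` presumably tells the caller to keep visiting further paths after this condition has been forbidden, while `Break` would stop the traversal. A minimal, self-contained sketch of that pattern (toy visitor, not milli's actual API):

use std::ops::ControlFlow;

// Toy path visitor: the callback returns Continue to keep going or Break to stop early.
fn visit_paths(paths: &[u32], mut visit: impl FnMut(u32) -> ControlFlow<()>) {
    for &path in paths {
        if let ControlFlow::Break(()) = visit(path) {
            return;
        }
    }
}

fn main() {
    let mut seen = Vec::new();
    visit_paths(&[1, 2, 3, 4], |path| {
        seen.push(path);
        // Stop the traversal once two paths have been collected.
        if seen.len() == 2 {
            ControlFlow::Break(())
        } else {
            ControlFlow::Continue(())
        }
    });
    assert_eq!(seen, vec![1, 2]);
}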


@@ -2,47 +2,37 @@
 use std::iter::FromIterator;
 
+use fxhash::FxHashSet;
+use heed::RoTxn;
+use roaring::RoaringBitmap;
+
 use super::ProximityCondition;
 use crate::search::new::db_cache::DatabaseCache;
 use crate::search::new::interner::{DedupInterner, Interned};
 use crate::search::new::query_term::{Phrase, QueryTerm};
+use crate::search::new::resolve_query_graph::QueryTermDocIdsCache;
 use crate::search::new::SearchContext;
-use crate::{CboRoaringBitmapCodec, Result};
+use crate::{CboRoaringBitmapCodec, Index, Result};
-use fxhash::FxHashSet;
-use heed::RoTxn;
-use roaring::RoaringBitmap;
 
 pub fn compute_docids<'ctx>(
     ctx: &mut SearchContext<'ctx>,
     condition: &ProximityCondition,
     universe: &RoaringBitmap,
 ) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
-    let SearchContext {
-        index,
-        txn,
-        db_cache,
-        word_interner,
-        term_docids,
-        phrase_interner,
-        term_interner,
-    } = ctx;
-
     let (left_term, right_term, right_term_ngram_len, cost) = match condition {
         ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
            (*left_term, *right_term, *right_term_ngram_len, *cost)
         }
         ProximityCondition::Term { term } => {
-            let term_v = term_interner.get(*term);
+            let term_v = ctx.term_interner.get(*term);
             return Ok((
-                term_docids
+                ctx.term_docids
                     .get_query_term_docids(
-                        index,
-                        txn,
-                        db_cache,
-                        word_interner,
-                        term_interner,
-                        phrase_interner,
+                        ctx.index,
+                        ctx.txn,
+                        &mut ctx.db_cache,
+                        &ctx.word_interner,
+                        &ctx.term_interner,
+                        &ctx.phrase_interner,
                         *term,
                     )?
                     .clone(),
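With the `SearchContext` destructuring gone, the fields are now borrowed individually at each use site (`ctx.index`, `&mut ctx.db_cache`, `&ctx.word_interner`, ...). Rust accepts simultaneous borrows of distinct fields, which is what makes this style work; a small self-contained illustration with a hypothetical `Ctx` struct (not the real `SearchContext`):

// Hypothetical context holding one field that is mutated and one that is only read.
struct Ctx {
    cache: Vec<u32>,
    names: Vec<String>,
}

// Takes a mutable borrow of one field and a shared borrow of another at the same time.
fn lookup(cache: &mut Vec<u32>, names: &[String], key: usize) -> bool {
    cache.push(key as u32);
    key < names.len()
}

fn main() {
    let mut ctx = Ctx { cache: Vec::new(), names: vec!["sun".into(), "flower".into()] };
    // Disjoint field borrows: `&mut ctx.cache` and `&ctx.names` may be passed together.
    let found = lookup(&mut ctx.cache, &ctx.names, 1);
    assert!(found);
    assert_eq!(ctx.cache, vec![1]);
}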
@@ -52,8 +42,8 @@ pub fn compute_docids<'ctx>(
         }
     };
 
-    let left_term = term_interner.get(left_term);
-    let right_term = term_interner.get(right_term);
+    let left_term = ctx.term_interner.get(left_term);
+    let right_term = ctx.term_interner.get(right_term);
 
     // e.g. for the simple words `sun .. flower`
     // the cost is 5
@@ -73,12 +63,14 @@ pub fn compute_docids<'ctx>(
 
     let mut docids = RoaringBitmap::new();
     if let Some(right_prefix) = right_term.use_prefix_db {
-        for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
+        for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
             compute_prefix_edges(
-                index,
-                txn,
-                db_cache,
-                word_interner,
+                ctx.index,
+                ctx.txn,
+                &mut ctx.db_cache,
+                &mut ctx.term_docids,
+                &ctx.word_interner,
+                &ctx.phrase_interner,
                 left_word,
                 right_prefix,
                 left_phrase,
@@ -99,13 +91,16 @@ pub fn compute_docids<'ctx>(
     // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
     // reached
-    for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
-        for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
+    for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
+        for (right_word, right_phrase) in first_word_of_term_iter(right_term, &ctx.phrase_interner)
+        {
             compute_non_prefix_edges(
-                index,
-                txn,
-                db_cache,
-                word_interner,
+                ctx.index,
+                ctx.txn,
+                &mut ctx.db_cache,
+                &mut ctx.term_docids,
+                &ctx.word_interner,
+                &ctx.phrase_interner,
                 left_word,
                 right_word,
                 &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
@@ -123,10 +118,12 @@ pub fn compute_docids<'ctx>(
 }
 
 fn compute_prefix_edges<'ctx>(
-    index: &mut &crate::Index,
+    index: &Index,
     txn: &'ctx RoTxn,
     db_cache: &mut DatabaseCache<'ctx>,
-    word_interner: &mut DedupInterner<String>,
+    term_docids: &mut QueryTermDocIdsCache,
+    word_interner: &DedupInterner<String>,
+    phrase_interner: &DedupInterner<Phrase>,
     left_word: Interned<String>,
     right_prefix: Interned<String>,
     left_phrase: Option<Interned<Phrase>>,
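Note the parameter change from `index: &mut &crate::Index` to `index: &Index`: the old shape matches what the removed `let SearchContext { index, .. } = ctx;` destructuring would produce, and a plain shared reference is all these read-only lookups need. A tiny standalone demonstration (hypothetical types, not milli's):

struct Index;

struct Ctx<'a> {
    index: &'a Index,
    counter: u32,
}

fn main() {
    let idx = Index;
    let mut ctx = Ctx { index: &idx, counter: 0 };
    // Destructuring through `&mut ctx` binds each field as a mutable reference,
    // so `index` ends up with type `&mut &Index`, the shape of the old signature.
    let Ctx { index, counter } = &mut ctx;
    let _: &mut &Index = index;
    *counter += 1;
    // Reading the field directly just copies the shared reference: a plain `&Index`.
    let direct: &Index = ctx.index;
    let _ = direct;
}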
@@ -137,10 +134,23 @@ fn compute_prefix_edges<'ctx>(
     used_words: &mut FxHashSet<Interned<String>>,
     used_phrases: &mut FxHashSet<Interned<Phrase>>,
 ) -> Result<()> {
+    let mut universe = universe.clone();
     if let Some(phrase) = left_phrase {
-        // TODO: compute the phrase, take the intersection between
-        // the phrase and the docids
-        used_phrases.insert(phrase); // This is not fully correct
+        let phrase_docids = term_docids.get_phrase_docids(
+            index,
+            txn,
+            db_cache,
+            word_interner,
+            phrase_interner,
+            phrase,
+        )?;
+        if !phrase_docids.is_empty() {
+            used_phrases.insert(phrase);
+        }
+        universe &= phrase_docids;
+        if universe.is_empty() {
+            return Ok(());
+        }
     }
 
     if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
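This hunk replaces the old "This is not fully correct" placeholder and appears to be one of the two bug fixes named in the commit message: the left phrase's docids are now actually resolved and intersected with a local copy of the universe, the phrase is only recorded as used when its docids are non-empty, and the function returns early once nothing can match. A minimal sketch of that pattern with roaring bitmaps (standalone, not using milli's caches):

use roaring::RoaringBitmap;

// Restrict `universe` to the documents containing the phrase and report whether the
// phrase contributed anything. `phrase_docids` stands in for what milli fetches
// through `term_docids.get_phrase_docids`.
fn restrict_to_phrase(universe: &mut RoaringBitmap, phrase_docids: &RoaringBitmap) -> bool {
    let useful = !phrase_docids.is_empty();
    *universe &= phrase_docids;
    useful
}

fn main() {
    let mut universe: RoaringBitmap = [1u32, 2, 3, 4].into_iter().collect();
    let phrase_docids: RoaringBitmap = [2u32, 4, 9].into_iter().collect();

    assert!(restrict_to_phrase(&mut universe, &phrase_docids));
    assert_eq!(universe.iter().collect::<Vec<_>>(), vec![2, 4]);

    // An empty intersection means no document can match this edge any more,
    // so the caller can stop early, as compute_prefix_edges now does.
    let disjoint: RoaringBitmap = [100u32, 200].into_iter().collect();
    restrict_to_phrase(&mut universe, &disjoint);
    assert!(universe.is_empty());
}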
@@ -151,7 +161,7 @@ fn compute_prefix_edges<'ctx>(
         right_prefix,
         forward_proximity,
     )? {
-        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
         if !new_docids.is_empty() {
             used_words.insert(left_word);
             used_words.insert(right_prefix);
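A side effect of `universe` now being an owned, locally cloned bitmap rather than a `&RoaringBitmap` parameter: the intersection is written `&universe & ...` so that only a borrow is consumed and the same `universe` remains available for the backward-proximity lookup further down. A tiny standalone illustration of the difference with roaring:

use roaring::RoaringBitmap;

fn main() {
    let universe: RoaringBitmap = [1u32, 2, 3, 4].into_iter().collect();
    let forward: RoaringBitmap = [2u32, 3].into_iter().collect();
    let backward: RoaringBitmap = [3u32, 4].into_iter().collect();

    // `&universe & forward` only borrows the left operand, so `universe` can be reused below.
    // Writing `universe & forward` instead would move `universe` and make the second
    // intersection a compile error.
    let first = &universe & forward;
    let second = &universe & backward;

    assert_eq!(first.iter().collect::<Vec<_>>(), vec![2, 3]);
    assert_eq!(second.iter().collect::<Vec<_>>(), vec![3, 4]);
}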
@@ -169,7 +179,7 @@ fn compute_prefix_edges<'ctx>(
         left_word,
         backward_proximity,
     )? {
-        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
         if !new_docids.is_empty() {
             used_words.insert(left_word);
             used_words.insert(right_prefix);
@@ -182,10 +192,12 @@ fn compute_prefix_edges<'ctx>(
 }
 
 fn compute_non_prefix_edges<'ctx>(
-    index: &mut &crate::Index,
+    index: &Index,
     txn: &'ctx RoTxn,
     db_cache: &mut DatabaseCache<'ctx>,
-    word_interner: &mut DedupInterner<String>,
+    term_docids: &mut QueryTermDocIdsCache,
+    word_interner: &DedupInterner<String>,
+    phrase_interner: &DedupInterner<Phrase>,
     word1: Interned<String>,
     word2: Interned<String>,
     phrases: &[Interned<Phrase>],
@@ -196,10 +208,23 @@ fn compute_non_prefix_edges<'ctx>(
     used_words: &mut FxHashSet<Interned<String>>,
     used_phrases: &mut FxHashSet<Interned<Phrase>>,
 ) -> Result<()> {
-    if !phrases.is_empty() {
-        // TODO: compute the docids associated with these phrases
-        // take their intersection with the new docids
-        used_phrases.extend(phrases); // This is not fully correct
+    let mut universe = universe.clone();
+    for phrase in phrases {
+        let phrase_docids = term_docids.get_phrase_docids(
+            index,
+            txn,
+            db_cache,
+            word_interner,
+            phrase_interner,
+            *phrase,
+        )?;
+        if !phrase_docids.is_empty() {
+            used_phrases.insert(*phrase);
+        }
+        universe &= phrase_docids;
+        if universe.is_empty() {
+            return Ok(());
+        }
     }
     if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
         index,
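The same correction is applied here for non-prefix edges, generalised to a list of phrases: each phrase's docids shrink the universe in turn, only phrases whose docids are non-empty are recorded as used (the old code recorded all of them via `used_phrases.extend(phrases)` without consulting any docids), and the loop bails out as soon as the universe becomes empty. A standalone sketch of that loop, with plain bitmaps standing in for milli's phrase cache:

use std::collections::HashSet;

use roaring::RoaringBitmap;

// Intersect `universe` with every phrase's docids, recording which phrases actually matched.
// Returns None as soon as nothing can match any more.
fn intersect_phrases(
    mut universe: RoaringBitmap,
    phrases: &[(usize, RoaringBitmap)], // (hypothetical phrase id, its docids)
    used_phrases: &mut HashSet<usize>,
) -> Option<RoaringBitmap> {
    for (phrase_id, phrase_docids) in phrases {
        if !phrase_docids.is_empty() {
            used_phrases.insert(*phrase_id);
        }
        universe &= phrase_docids;
        if universe.is_empty() {
            return None;
        }
    }
    Some(universe)
}

fn main() {
    let universe: RoaringBitmap = (1u32..=10).collect();
    let phrases = vec![
        (0, (2u32..=8).collect::<RoaringBitmap>()),
        (1, (4u32..=6).collect::<RoaringBitmap>()),
    ];
    let mut used = HashSet::new();

    let restricted = intersect_phrases(universe, &phrases, &mut used).unwrap();
    assert_eq!(restricted.iter().collect::<Vec<_>>(), vec![4, 5, 6]);
    assert_eq!(used, HashSet::from([0, 1]));
}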
@@ -209,7 +234,7 @@ fn compute_non_prefix_edges<'ctx>(
         word2,
         forward_proximity,
     )? {
-        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
         if !new_docids.is_empty() {
             used_words.insert(word1);
             used_words.insert(word2);
@@ -228,7 +253,7 @@ fn compute_non_prefix_edges<'ctx>(
         word1,
         backward_proximity,
     )? {
-        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
         if !new_docids.is_empty() {
             used_words.insert(word1);
             used_words.insert(word2);