From 01e24dd630d173986d5f7303f4dfe4f1df1ec9a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:59:06 +0200 Subject: [PATCH] Rewrite proximity ranking rule --- .../new/ranking_rule_graph/proximity/build.rs | 68 ++-- .../proximity/compute_docids.rs | 298 +++++++++--------- .../new/ranking_rule_graph/proximity/mod.rs | 36 +-- 3 files changed, 181 insertions(+), 221 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 4d42463e8..660d59b3e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -2,44 +2,26 @@ use super::ProximityCondition; use crate::search::new::interner::{DedupInterner, Interned}; -use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::LocatedQueryTerm; -use crate::search::new::{QueryNode, SearchContext}; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::search::new::SearchContext; use crate::Result; pub fn build_edges( _ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, - from_node: &QueryNode, - to_node: &QueryNode, -) -> Result>)>> { - let right_term = match &to_node.data { - QueryNodeData::End => return Ok(vec![(0, None)]), - QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]), - QueryNodeData::Term(term) => term, + left_term: Option<&LocatedQueryTermSubset>, + right_term: &LocatedQueryTermSubset, +) -> Result)>> { + let right_ngram_length = right_term.term_ids.len(); + + let Some(left_term) = left_term else { + return Ok(vec![( + (right_ngram_length - 1) as u32, + conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }), + )]) }; - let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term; - - let (right_start_position, right_ngram_length) = - (*right_positions.start(), right_positions.len()); - - let (left_term_interned, left_end_position) = match &from_node.data { - QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()), - QueryNodeData::Deleted => return Ok(vec![]), - QueryNodeData::Start => { - return Ok(vec![( - (right_ngram_length - 1) as u8, - Some( - conditions_interner - .insert(ProximityCondition::Term { term: *right_term_interned }), - ), - )]) - } - QueryNodeData::End => return Ok(vec![]), - }; - - if left_end_position + 1 != right_start_position { + if left_term.positions.end() + 1 != *right_term.positions.start() { // We want to ignore this pair of terms // Unconditionally walk through the edge without computing the docids // This can happen when, in a query like `the sun flowers are beautiful`, the term @@ -47,30 +29,26 @@ pub fn build_edges( // The remaining query graph represents `the sun .. are beautiful` // but `sun` and `are` have no proximity condition between them return Ok(vec![( - (right_ngram_length - 1) as u8, - Some( - conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }), - ), + (right_ngram_length - 1) as u32, + conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }), )]); } let mut conditions = vec![]; for cost in right_ngram_length..(7 + right_ngram_length) { - let cost = cost as u8; conditions.push(( - cost, - Some(conditions_interner.insert(ProximityCondition::Uninit { - left_term: left_term_interned, - right_term: *right_term_interned, - right_term_ngram_len: right_ngram_length as u8, - cost, - })), + cost as u32, + conditions_interner.insert(ProximityCondition::Uninit { + left_term: left_term.clone(), + right_term: right_term.clone(), + cost: cost as u8, + }), )) } conditions.push(( - (7 + right_ngram_length) as u8, - Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })), + (7 + right_ngram_length) as u32, + conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }), )); Ok(conditions) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 6f56e6221..8496054b7 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,49 +1,37 @@ #![allow(clippy::too_many_arguments)] -use std::iter::FromIterator; - use super::ProximityCondition; -use crate::search::new::db_cache::DatabaseCache; -use crate::search::new::interner::{DedupInterner, Interned}; -use crate::search::new::query_term::{Phrase, QueryTerm}; -use crate::search::new::resolve_query_graph::QueryTermDocIdsCache; +use crate::search::new::interner::Interned; +use crate::search::new::query_term::{Phrase, QueryTermSubset}; +use crate::search::new::ranking_rule_graph::ComputedCondition; +use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Index, Result}; -use fxhash::FxHashSet; -use heed::RoTxn; +use crate::{CboRoaringBitmapCodec, Result}; use roaring::RoaringBitmap; +use std::collections::BTreeSet; pub fn compute_docids( ctx: &mut SearchContext, condition: &ProximityCondition, universe: &RoaringBitmap, -) -> Result<(RoaringBitmap, FxHashSet>, FxHashSet>)> { - let (left_term, right_term, right_term_ngram_len, cost) = match condition { - ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => { - (*left_term, *right_term, *right_term_ngram_len, *cost) +) -> Result { + let (left_term, right_term, cost) = match condition { + ProximityCondition::Uninit { left_term, right_term, cost } => { + (left_term, right_term, *cost) } ProximityCondition::Term { term } => { - let term_v = ctx.term_interner.get(*term); - return Ok(( - ctx.term_docids - .get_query_term_docids( - ctx.index, - ctx.txn, - &mut ctx.db_cache, - &ctx.word_interner, - &ctx.term_interner, - &ctx.phrase_interner, - *term, - )? - .clone(), - FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()), - FxHashSet::from_iter(term_v.all_phrases()), - )); + let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; + docids &= universe; + return Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: term.clone(), + }); } }; - let left_term = ctx.term_interner.get(left_term); - let right_term = ctx.term_interner.get(right_term); + let right_term_ngram_len = right_term.term_ids.len() as u8; // e.g. for the simple words `sun .. flower` // the cost is 5 @@ -57,20 +45,13 @@ pub fn compute_docids( let forward_proximity = 1 + cost - right_term_ngram_len; let backward_proximity = cost - right_term_ngram_len; - let mut used_words = FxHashSet::default(); - let mut used_phrases = FxHashSet::default(); - let mut docids = RoaringBitmap::new(); - if let Some(right_prefix) = right_term.use_prefix_db { - for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) { + if let Some(right_prefix) = right_term.term_subset.use_prefix_db(ctx) { + for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? + { compute_prefix_edges( - ctx.index, - ctx.txn, - &mut ctx.db_cache, - &mut ctx.term_docids, - &ctx.word_interner, - &ctx.phrase_interner, + ctx, left_word, right_prefix, left_phrase, @@ -78,8 +59,6 @@ pub fn compute_docids( backward_proximity, &mut docids, universe, - &mut used_words, - &mut used_phrases, )?; } } @@ -91,39 +70,60 @@ pub fn compute_docids( // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been // reached - for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) { - for (right_word, right_phrase) in first_word_of_term_iter(right_term, &ctx.phrase_interner) - { + for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? { + // Before computing the edges, check that the left word and left phrase + // aren't disjoint with the universe, but only do it if there is more than + // one word derivation to the right. + // + // This is an optimisation to avoid checking for an excessive number of + // pairs. + // WAIT, NO. + // This should only be done once per node. + // Here, we'll potentially do is.. 16 times? + // Maybe we should do it at edge-build time instead. + // Same for the future attribute ranking rule. + let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?; + if right_derivs.len() > 1 { + let universe = &universe; + if let Some(left_phrase) = left_phrase { + if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { + continue; + } + } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? { + let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?; + if universe.is_disjoint(&left_word_docids) { + continue; + } + } + } + + for (right_word, right_phrase) in right_derivs { compute_non_prefix_edges( - ctx.index, - ctx.txn, - &mut ctx.db_cache, - &mut ctx.term_docids, - &ctx.word_interner, - &ctx.phrase_interner, + ctx, left_word, right_word, - &[left_phrase, right_phrase].iter().copied().flatten().collect::>(), + left_phrase, + right_phrase, forward_proximity, backward_proximity, &mut docids, universe, - &mut used_words, - &mut used_phrases, )?; } } - Ok((docids, used_words, used_phrases)) + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + // TODO: think about whether we want to reduce the subset, + // we probably should! + start_term_subset: Some(left_term.clone()), + end_term_subset: right_term.clone(), + }) } -fn compute_prefix_edges<'ctx>( - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - term_docids: &mut QueryTermDocIdsCache, - word_interner: &DedupInterner, - phrase_interner: &DedupInterner, +fn compute_prefix_edges( + ctx: &mut SearchContext, left_word: Interned, right_prefix: Interned, left_phrase: Option>, @@ -131,21 +131,16 @@ fn compute_prefix_edges<'ctx>( backward_proximity: u8, docids: &mut RoaringBitmap, universe: &RoaringBitmap, - used_words: &mut FxHashSet>, - used_phrases: &mut FxHashSet>, ) -> Result<()> { + let mut used_left_words = BTreeSet::new(); + let mut used_left_phrases = BTreeSet::new(); + let mut used_right_prefix = BTreeSet::new(); + let mut universe = universe.clone(); if let Some(phrase) = left_phrase { - let phrase_docids = term_docids.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - phrase_interner, - phrase, - )?; + let phrase_docids = ctx.get_phrase_docids(phrase)?; if !phrase_docids.is_empty() { - used_phrases.insert(phrase); + used_left_phrases.insert(phrase); } universe &= phrase_docids; if universe.is_empty() { @@ -153,36 +148,28 @@ fn compute_prefix_edges<'ctx>( } } - if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids( - index, - txn, - word_interner, - left_word, - right_prefix, - forward_proximity, - )? { + if let Some(new_docids) = + ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)? + { let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { - used_words.insert(left_word); - used_words.insert(right_prefix); + used_left_words.insert(left_word); + used_right_prefix.insert(right_prefix); *docids |= new_docids; } } // No swapping when computing the proximity between a phrase and a word if left_phrase.is_none() { - if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids( - index, - txn, - word_interner, + if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids( right_prefix, left_word, backward_proximity, )? { let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { - used_words.insert(left_word); - used_words.insert(right_prefix); + used_left_words.insert(left_word); + used_right_prefix.insert(right_prefix); *docids |= new_docids; } } @@ -191,72 +178,59 @@ fn compute_prefix_edges<'ctx>( Ok(()) } -fn compute_non_prefix_edges<'ctx>( - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - term_docids: &mut QueryTermDocIdsCache, - word_interner: &DedupInterner, - phrase_interner: &DedupInterner, +fn compute_non_prefix_edges( + ctx: &mut SearchContext, word1: Interned, word2: Interned, - phrases: &[Interned], + left_phrase: Option>, + right_phrase: Option>, forward_proximity: u8, backward_proximity: u8, docids: &mut RoaringBitmap, universe: &RoaringBitmap, - used_words: &mut FxHashSet>, - used_phrases: &mut FxHashSet>, ) -> Result<()> { + let mut used_left_phrases = BTreeSet::new(); + let mut used_right_phrases = BTreeSet::new(); + let mut used_left_words = BTreeSet::new(); + let mut used_right_words = BTreeSet::new(); + let mut universe = universe.clone(); - for phrase in phrases { - let phrase_docids = term_docids.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - phrase_interner, - *phrase, - )?; - if !phrase_docids.is_empty() { - used_phrases.insert(*phrase); - } + + for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { + let phrase_docids = ctx.get_phrase_docids(phrase)?; universe &= phrase_docids; if universe.is_empty() { return Ok(()); } } - if let Some(new_docids) = db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - word1, - word2, - forward_proximity, - )? { + if let Some(left_phrase) = left_phrase { + used_left_phrases.insert(left_phrase); + } + if let Some(right_phrase) = right_phrase { + used_right_phrases.insert(right_phrase); + } + + if let Some(new_docids) = + ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)? + { let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { - used_words.insert(word1); - used_words.insert(word2); + used_left_words.insert(word1); + used_right_words.insert(word2); *docids |= new_docids; } } if backward_proximity >= 1 // no swapping when either term is a phrase - && phrases.is_empty() + && left_phrase.is_none() && right_phrase.is_none() { - if let Some(new_docids) = db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - word2, - word1, - backward_proximity, - )? { + if let Some(new_docids) = + ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? + { let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?; if !new_docids.is_empty() { - used_words.insert(word1); - used_words.insert(word2); + used_left_words.insert(word2); + used_right_words.insert(word1); *docids |= new_docids; } } @@ -265,25 +239,41 @@ fn compute_non_prefix_edges<'ctx>( Ok(()) } -fn last_word_of_term_iter<'t>( - t: &'t QueryTerm, - phrase_interner: &'t DedupInterner, -) -> impl Iterator>, Interned)> + 't { - t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map( - move |p| { - let phrase = phrase_interner.get(p); - phrase.words.last().unwrap().map(|last| (Some(p), last)) - }, - )) +fn last_words_of_term_derivations( + ctx: &mut SearchContext, + t: &QueryTermSubset, +) -> Result>, Interned)>> { + let mut result = BTreeSet::new(); + + for w in t.all_single_words_except_prefix_db(ctx)? { + result.insert((None, w)); + } + for p in t.all_phrases(ctx)? { + let phrase = ctx.phrase_interner.get(p); + let last_term_of_phrase = phrase.words.last().unwrap(); + if let Some(last_word) = last_term_of_phrase { + result.insert((Some(p), *last_word)); + } + } + + Ok(result) } -fn first_word_of_term_iter<'t>( - t: &'t QueryTerm, - phrase_interner: &'t DedupInterner, -) -> impl Iterator, Option>)> + 't { - t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map( - move |p| { - let phrase = phrase_interner.get(p); - phrase.words.first().unwrap().map(|first| (first, Some(p))) - }, - )) +fn first_word_of_term_iter( + ctx: &mut SearchContext, + t: &QueryTermSubset, +) -> Result, Option>)>> { + let mut result = BTreeSet::new(); + let all_words = t.all_single_words_except_prefix_db(ctx)?; + for w in all_words { + result.insert((w, None)); + } + for p in t.all_phrases(ctx)? { + let phrase = ctx.phrase_interner.get(p); + let first_term_of_phrase = phrase.words.first().unwrap(); + if let Some(first_word) = first_term_of_phrase { + result.insert((*first_word, Some(p))); + } + } + + Ok(result) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 3b98ed5b5..81c99fd9a 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,27 +1,19 @@ pub mod build; pub mod compute_docids; -use fxhash::FxHashSet; use roaring::RoaringBitmap; -use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; +use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; -use crate::search::new::query_term::{Phrase, QueryTerm}; +use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ProximityCondition { - Uninit { - left_term: Interned, - right_term: Interned, - right_term_ngram_len: u8, - cost: u8, - }, - Term { - term: Interned, - }, + Uninit { left_term: LocatedQueryTermSubset, right_term: LocatedQueryTermSubset, cost: u8 }, + Term { term: LocatedQueryTermSubset }, } pub enum ProximityGraph {} @@ -33,18 +25,17 @@ impl RankingRuleGraphTrait for ProximityGraph { ctx: &mut SearchContext, condition: &Self::Condition, universe: &RoaringBitmap, - ) -> Result<(roaring::RoaringBitmap, FxHashSet>, FxHashSet>)> - { + ) -> Result { compute_docids::compute_docids(ctx, condition, universe) } fn build_edges( ctx: &mut SearchContext, conditions_interner: &mut DedupInterner, - source_node: &QueryNode, - dest_node: &QueryNode, - ) -> Result>)>> { - build::build_edges(ctx, conditions_interner, source_node, dest_node) + source_term: Option<&LocatedQueryTermSubset>, + dest_term: &LocatedQueryTermSubset, + ) -> Result)>> { + build::build_edges(ctx, conditions_interner, source_term, dest_term) } fn log_state( @@ -52,8 +43,8 @@ impl RankingRuleGraphTrait for ProximityGraph { paths: &[Vec>], dead_ends_cache: &DeadEndsCache, universe: &RoaringBitmap, - distances: &MappedInterner>, - cost: u16, + distances: &MappedInterner>, + cost: u64, logger: &mut dyn SearchLogger, ) { logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost); @@ -66,8 +57,9 @@ impl RankingRuleGraphTrait for ProximityGraph { Ok(format!("{cost}: cost")) } ProximityCondition::Term { term } => { - let term = ctx.term_interner.get(*term); - Ok(format!("{} : exists", ctx.word_interner.get(term.original))) + let original_term = ctx.term_interner.get(term.term_subset.original); + let original_word = ctx.word_interner.get(original_term.original); + Ok(format!("{original_word} : exists")) } } }