Rewrite proximity ranking rule

2025-06-14 12:01:36 +02:00 · 2023-03-30 11:59:06 +02:00 · 2023-03-30 11:59:06 +02:00 · 01e24dd630
commit 01e24dd630
parent ae6bb1ce17
3 changed files with 181 additions and 221 deletions
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@ -2,44 +2,26 @@
 use super::ProximityCondition;
 use crate::search::new::interner::{DedupInterner, Interned};
-use crate::search::new::query_graph::QueryNodeData;
+use crate::search::new::query_term::LocatedQueryTermSubset;
-use crate::search::new::query_term::LocatedQueryTerm;
+use crate::search::new::SearchContext;
 use crate::search::new::{QueryNode, SearchContext};
 use crate::Result;
 pub fn build_edges(
    _ctx: &mut SearchContext,
    conditions_interner: &mut DedupInterner<ProximityCondition>,
-    from_node: &QueryNode,
+    left_term: Option<&LocatedQueryTermSubset>,
-    to_node: &QueryNode,
+    right_term: &LocatedQueryTermSubset,
-) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
+) -> Result<Vec<(u32, Interned<ProximityCondition>)>> {
-    let right_term = match &to_node.data {
+    let right_ngram_length = right_term.term_ids.len();
        QueryNodeData::End => return Ok(vec![(0, None)]),
        QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
        QueryNodeData::Term(term) => term,
    };
-    let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
+    let Some(left_term) = left_term else {
    let (right_start_position, right_ngram_length) =
        (*right_positions.start(), right_positions.len());
    let (left_term_interned, left_end_position) = match &from_node.data {
        QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
        QueryNodeData::Deleted => return Ok(vec![]),
        QueryNodeData::Start => {
        return Ok(vec![(
-                (right_ngram_length - 1) as u8,
+            (right_ngram_length - 1) as u32,
-                Some(
+            conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
                    conditions_interner
                        .insert(ProximityCondition::Term { term: *right_term_interned }),
                ),
        )])
        }
        QueryNodeData::End => return Ok(vec![]),
    };
-    if left_end_position + 1 != right_start_position {
+    if left_term.positions.end() + 1 != *right_term.positions.start() {
        // We want to ignore this pair of terms
        // Unconditionally walk through the edge without computing the docids
        // This can happen when, in a query like `the sun flowers are beautiful`, the term
@ -47,30 +29,26 @@ pub fn build_edges(
        // The remaining query graph represents `the sun .. are beautiful`
        // but `sun` and `are` have no proximity condition between them
        return Ok(vec![(
-            (right_ngram_length - 1) as u8,
+            (right_ngram_length - 1) as u32,
-            Some(
+            conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
                conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
            ),
        )]);
    }
    let mut conditions = vec![];
    for cost in right_ngram_length..(7 + right_ngram_length) {
        let cost = cost as u8;
        conditions.push((
-            cost,
+            cost as u32,
-            Some(conditions_interner.insert(ProximityCondition::Uninit {
+            conditions_interner.insert(ProximityCondition::Uninit {
-                left_term: left_term_interned,
+                left_term: left_term.clone(),
-                right_term: *right_term_interned,
+                right_term: right_term.clone(),
-                right_term_ngram_len: right_ngram_length as u8,
+                cost: cost as u8,
-                cost,
+            }),
            })),
        ))
    }
    conditions.push((
-        (7 + right_ngram_length) as u8,
+        (7 + right_ngram_length) as u32,
-        Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
+        conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
    ));
    Ok(conditions)
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@ -1,49 +1,37 @@
 #![allow(clippy::too_many_arguments)]
 use std::iter::FromIterator;
 use super::ProximityCondition;
-use crate::search::new::db_cache::DatabaseCache;
+use crate::search::new::interner::Interned;
-use crate::search::new::interner::{DedupInterner, Interned};
+use crate::search::new::query_term::{Phrase, QueryTermSubset};
-use crate::search::new::query_term::{Phrase, QueryTerm};
+use crate::search::new::ranking_rule_graph::ComputedCondition;
-use crate::search::new::resolve_query_graph::QueryTermDocIdsCache;
+use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
 use crate::search::new::SearchContext;
-use crate::{CboRoaringBitmapCodec, Index, Result};
+use crate::{CboRoaringBitmapCodec, Result};
 use fxhash::FxHashSet;
 use heed::RoTxn;
 use roaring::RoaringBitmap;
 use std::collections::BTreeSet;
 pub fn compute_docids(
    ctx: &mut SearchContext,
    condition: &ProximityCondition,
    universe: &RoaringBitmap,
-) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
+) -> Result<ComputedCondition> {
-    let (left_term, right_term, right_term_ngram_len, cost) = match condition {
+    let (left_term, right_term, cost) = match condition {
-        ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
+        ProximityCondition::Uninit { left_term, right_term, cost } => {
-            (*left_term, *right_term, *right_term_ngram_len, *cost)
+            (left_term, right_term, *cost)
        }
        ProximityCondition::Term { term } => {
-            let term_v = ctx.term_interner.get(*term);
+            let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
-            return Ok((
+            docids &= universe;
-                ctx.term_docids
+            return Ok(ComputedCondition {
-                    .get_query_term_docids(
+                docids,
-                        ctx.index,
+                universe_len: universe.len(),
-                        ctx.txn,
+                start_term_subset: None,
-                        &mut ctx.db_cache,
+                end_term_subset: term.clone(),
-                        &ctx.word_interner,
+            });
                        &ctx.term_interner,
                        &ctx.phrase_interner,
                        *term,
                    )?
                    .clone(),
                FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
                FxHashSet::from_iter(term_v.all_phrases()),
            ));
        }
    };
-    let left_term = ctx.term_interner.get(left_term);
+    let right_term_ngram_len = right_term.term_ids.len() as u8;
    let right_term = ctx.term_interner.get(right_term);
    // e.g. for the simple words `sun .. flower`
    // the cost is 5
@ -57,20 +45,13 @@ pub fn compute_docids(
    let forward_proximity = 1 + cost - right_term_ngram_len;
    let backward_proximity = cost - right_term_ngram_len;
    let mut used_words = FxHashSet::default();
    let mut used_phrases = FxHashSet::default();
    let mut docids = RoaringBitmap::new();
-    if let Some(right_prefix) = right_term.use_prefix_db {
+    if let Some(right_prefix) = right_term.term_subset.use_prefix_db(ctx) {
-        for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
+        for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)?
        {
            compute_prefix_edges(
-                ctx.index,
+                ctx,
                ctx.txn,
                &mut ctx.db_cache,
                &mut ctx.term_docids,
                &ctx.word_interner,
                &ctx.phrase_interner,
                left_word,
                right_prefix,
                left_phrase,
@ -78,8 +59,6 @@ pub fn compute_docids(
                backward_proximity,
                &mut docids,
                universe,
                &mut used_words,
                &mut used_phrases,
            )?;
        }
    }
@ -91,39 +70,60 @@ pub fn compute_docids(
    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
    // reached
-    for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
+    for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
-        for (right_word, right_phrase) in first_word_of_term_iter(right_term, &ctx.phrase_interner)
+        // Before computing the edges, check that the left word and left phrase
-        {
+        // aren't disjoint with the universe, but only do it if there is more than
        // one word derivation to the right.
        //
        // This is an optimisation to avoid checking for an excessive number of
        // pairs.
        // WAIT, NO.
        // This should only be done once per node.
        // Here, we'll potentially do is.. 16 times?
        // Maybe we should do it at edge-build time instead.
        // Same for the future attribute ranking rule.
        let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
        if right_derivs.len() > 1 {
            let universe = &universe;
            if let Some(left_phrase) = left_phrase {
                if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
                    continue;
                }
            } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? {
                let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?;
                if universe.is_disjoint(&left_word_docids) {
                    continue;
                }
            }
        }
        for (right_word, right_phrase) in right_derivs {
            compute_non_prefix_edges(
-                ctx.index,
+                ctx,
                ctx.txn,
                &mut ctx.db_cache,
                &mut ctx.term_docids,
                &ctx.word_interner,
                &ctx.phrase_interner,
                left_word,
                right_word,
-                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
+                left_phrase,
                right_phrase,
                forward_proximity,
                backward_proximity,
                &mut docids,
                universe,
                &mut used_words,
                &mut used_phrases,
            )?;
        }
    }
-    Ok((docids, used_words, used_phrases))
+    Ok(ComputedCondition {
        docids,
        universe_len: universe.len(),
        // TODO: think about whether we want to reduce the subset,
        // we probably should!
        start_term_subset: Some(left_term.clone()),
        end_term_subset: right_term.clone(),
    })
 }
-fn compute_prefix_edges<'ctx>(
+fn compute_prefix_edges(
-    index: &Index,
+    ctx: &mut SearchContext,
    txn: &'ctx RoTxn,
    db_cache: &mut DatabaseCache<'ctx>,
    term_docids: &mut QueryTermDocIdsCache,
    word_interner: &DedupInterner<String>,
    phrase_interner: &DedupInterner<Phrase>,
    left_word: Interned<String>,
    right_prefix: Interned<String>,
    left_phrase: Option<Interned<Phrase>>,
@ -131,21 +131,16 @@ fn compute_prefix_edges<'ctx>(
    backward_proximity: u8,
    docids: &mut RoaringBitmap,
    universe: &RoaringBitmap,
    used_words: &mut FxHashSet<Interned<String>>,
    used_phrases: &mut FxHashSet<Interned<Phrase>>,
 ) -> Result<()> {
    let mut used_left_words = BTreeSet::new();
    let mut used_left_phrases = BTreeSet::new();
    let mut used_right_prefix = BTreeSet::new();
    let mut universe = universe.clone();
    if let Some(phrase) = left_phrase {
-        let phrase_docids = term_docids.get_phrase_docids(
+        let phrase_docids = ctx.get_phrase_docids(phrase)?;
            index,
            txn,
            db_cache,
            word_interner,
            phrase_interner,
            phrase,
        )?;
        if !phrase_docids.is_empty() {
-            used_phrases.insert(phrase);
+            used_left_phrases.insert(phrase);
        }
        universe &= phrase_docids;
        if universe.is_empty() {
@ -153,36 +148,28 @@ fn compute_prefix_edges<'ctx>(
        }
    }
-    if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
+    if let Some(new_docids) =
-        index,
+        ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)?
-        txn,
+    {
        word_interner,
        left_word,
        right_prefix,
        forward_proximity,
    )? {
        let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
        if !new_docids.is_empty() {
-            used_words.insert(left_word);
+            used_left_words.insert(left_word);
-            used_words.insert(right_prefix);
+            used_right_prefix.insert(right_prefix);
            *docids |= new_docids;
        }
    }
    // No swapping when computing the proximity between a phrase and a word
    if left_phrase.is_none() {
-        if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
+        if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
            index,
            txn,
            word_interner,
            right_prefix,
            left_word,
            backward_proximity,
        )? {
            let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
            if !new_docids.is_empty() {
-                used_words.insert(left_word);
+                used_left_words.insert(left_word);
-                used_words.insert(right_prefix);
+                used_right_prefix.insert(right_prefix);
                *docids |= new_docids;
            }
        }
@ -191,72 +178,59 @@ fn compute_prefix_edges<'ctx>(
    Ok(())
 }
-fn compute_non_prefix_edges<'ctx>(
+fn compute_non_prefix_edges(
-    index: &Index,
+    ctx: &mut SearchContext,
    txn: &'ctx RoTxn,
    db_cache: &mut DatabaseCache<'ctx>,
    term_docids: &mut QueryTermDocIdsCache,
    word_interner: &DedupInterner<String>,
    phrase_interner: &DedupInterner<Phrase>,
    word1: Interned<String>,
    word2: Interned<String>,
-    phrases: &[Interned<Phrase>],
+    left_phrase: Option<Interned<Phrase>>,
    right_phrase: Option<Interned<Phrase>>,
    forward_proximity: u8,
    backward_proximity: u8,
    docids: &mut RoaringBitmap,
    universe: &RoaringBitmap,
    used_words: &mut FxHashSet<Interned<String>>,
    used_phrases: &mut FxHashSet<Interned<Phrase>>,
 ) -> Result<()> {
    let mut used_left_phrases = BTreeSet::new();
    let mut used_right_phrases = BTreeSet::new();
    let mut used_left_words = BTreeSet::new();
    let mut used_right_words = BTreeSet::new();
    let mut universe = universe.clone();
-    for phrase in phrases {
+
-        let phrase_docids = term_docids.get_phrase_docids(
+    for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
-            index,
+        let phrase_docids = ctx.get_phrase_docids(phrase)?;
            txn,
            db_cache,
            word_interner,
            phrase_interner,
            *phrase,
        )?;
        if !phrase_docids.is_empty() {
            used_phrases.insert(*phrase);
        }
        universe &= phrase_docids;
        if universe.is_empty() {
            return Ok(());
        }
    }
-    if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
+    if let Some(left_phrase) = left_phrase {
-        index,
+        used_left_phrases.insert(left_phrase);
-        txn,
+    }
-        word_interner,
+    if let Some(right_phrase) = right_phrase {
-        word1,
+        used_right_phrases.insert(right_phrase);
-        word2,
+    }
-        forward_proximity,
+
-    )? {
+    if let Some(new_docids) =
        ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
    {
        let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
        if !new_docids.is_empty() {
-            used_words.insert(word1);
+            used_left_words.insert(word1);
-            used_words.insert(word2);
+            used_right_words.insert(word2);
            *docids |= new_docids;
        }
    }
    if backward_proximity >= 1
            // no swapping when either term is a phrase
-            && phrases.is_empty()
+            && left_phrase.is_none() && right_phrase.is_none()
    {
        if let Some(new_docids) =
            ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
        {
        if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
            index,
            txn,
            word_interner,
            word2,
            word1,
            backward_proximity,
        )? {
            let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
            if !new_docids.is_empty() {
-                used_words.insert(word1);
+                used_left_words.insert(word2);
-                used_words.insert(word2);
+                used_right_words.insert(word1);
                *docids |= new_docids;
            }
        }
@ -265,25 +239,41 @@ fn compute_non_prefix_edges<'ctx>(
    Ok(())
 }
-fn last_word_of_term_iter<'t>(
+fn last_words_of_term_derivations(
-    t: &'t QueryTerm,
+    ctx: &mut SearchContext,
-    phrase_interner: &'t DedupInterner<Phrase>,
+    t: &QueryTermSubset,
-) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
+) -> Result<BTreeSet<(Option<Interned<Phrase>>, Interned<String>)>> {
-    t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
+    let mut result = BTreeSet::new();
-        move |p| {
+
-            let phrase = phrase_interner.get(p);
+    for w in t.all_single_words_except_prefix_db(ctx)? {
-            phrase.words.last().unwrap().map(|last| (Some(p), last))
+        result.insert((None, w));
-        },
+    }
-    ))
+    for p in t.all_phrases(ctx)? {
        let phrase = ctx.phrase_interner.get(p);
        let last_term_of_phrase = phrase.words.last().unwrap();
        if let Some(last_word) = last_term_of_phrase {
            result.insert((Some(p), *last_word));
        }
    }
    Ok(result)
 }
-fn first_word_of_term_iter<'t>(
+fn first_word_of_term_iter(
-    t: &'t QueryTerm,
+    ctx: &mut SearchContext,
-    phrase_interner: &'t DedupInterner<Phrase>,
+    t: &QueryTermSubset,
-) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
+) -> Result<BTreeSet<(Interned<String>, Option<Interned<Phrase>>)>> {
-    t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
+    let mut result = BTreeSet::new();
-        move |p| {
+    let all_words = t.all_single_words_except_prefix_db(ctx)?;
-            let phrase = phrase_interner.get(p);
+    for w in all_words {
-            phrase.words.first().unwrap().map(|first| (first, Some(p)))
+        result.insert((w, None));
-        },
+    }
-    ))
+    for p in t.all_phrases(ctx)? {
        let phrase = ctx.phrase_interner.get(p);
        let first_term_of_phrase = phrase.words.first().unwrap();
        if let Some(first_word) = first_term_of_phrase {
            result.insert((*first_word, Some(p)));
        }
    }
    Ok(result)
 }
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@ -1,27 +1,19 @@
 pub mod build;
 pub mod compute_docids;
 use fxhash::FxHashSet;
 use roaring::RoaringBitmap;
-use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
+use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
 use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
 use crate::search::new::logger::SearchLogger;
-use crate::search::new::query_term::{Phrase, QueryTerm};
+use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum ProximityCondition {
-    Uninit {
+    Uninit { left_term: LocatedQueryTermSubset, right_term: LocatedQueryTermSubset, cost: u8 },
-        left_term: Interned<QueryTerm>,
+    Term { term: LocatedQueryTermSubset },
        right_term: Interned<QueryTerm>,
        right_term_ngram_len: u8,
        cost: u8,
    },
    Term {
        term: Interned<QueryTerm>,
    },
 }
 pub enum ProximityGraph {}
@ -33,18 +25,17 @@ impl RankingRuleGraphTrait for ProximityGraph {
        ctx: &mut SearchContext,
        condition: &Self::Condition,
        universe: &RoaringBitmap,
-    ) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
+    ) -> Result<ComputedCondition> {
    {
        compute_docids::compute_docids(ctx, condition, universe)
    }
    fn build_edges(
        ctx: &mut SearchContext,
        conditions_interner: &mut DedupInterner<Self::Condition>,
-        source_node: &QueryNode,
+        source_term: Option<&LocatedQueryTermSubset>,
-        dest_node: &QueryNode,
+        dest_term: &LocatedQueryTermSubset,
-    ) -> Result<Vec<(u8, Option<Interned<Self::Condition>>)>> {
+    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
-        build::build_edges(ctx, conditions_interner, source_node, dest_node)
+        build::build_edges(ctx, conditions_interner, source_term, dest_term)
    }
    fn log_state(
@ -52,8 +43,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
        paths: &[Vec<Interned<ProximityCondition>>],
        dead_ends_cache: &DeadEndsCache<Self::Condition>,
        universe: &RoaringBitmap,
-        distances: &MappedInterner<QueryNode, Vec<u16>>,
+        distances: &MappedInterner<QueryNode, Vec<u64>>,
-        cost: u16,
+        cost: u64,
        logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
        logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost);
@ -66,8 +57,9 @@ impl RankingRuleGraphTrait for ProximityGraph {
                Ok(format!("{cost}: cost"))
            }
            ProximityCondition::Term { term } => {
-                let term = ctx.term_interner.get(*term);
+                let original_term = ctx.term_interner.get(term.term_subset.original);
-                Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
+                let original_word = ctx.word_interner.get(original_term.original);
                Ok(format!("{original_word} : exists"))
            }
        }
    }