Rewrite proximity ranking rule

2025-06-13 19:41:37 +02:00 · 2023-03-30 11:59:06 +02:00 · 2023-03-30 11:59:06 +02:00 · 01e24dd630
commit 01e24dd630
parent ae6bb1ce17
3 changed files with 181 additions and 221 deletions
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@ -2,44 +2,26 @@

 use super::ProximityCondition;
 use crate::search::new::interner::{DedupInterner, Interned};
-use crate::search::new::query_graph::QueryNodeData;
-use crate::search::new::query_term::LocatedQueryTerm;
-use crate::search::new::{QueryNode, SearchContext};
+use crate::search::new::query_term::LocatedQueryTermSubset;
+use crate::search::new::SearchContext;
 use crate::Result;

 pub fn build_edges(
    _ctx: &mut SearchContext,
    conditions_interner: &mut DedupInterner<ProximityCondition>,
-    from_node: &QueryNode,
-    to_node: &QueryNode,
-) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
-    let right_term = match &to_node.data {
-        QueryNodeData::End => return Ok(vec![(0, None)]),
-        QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
-        QueryNodeData::Term(term) => term,
+    left_term: Option<&LocatedQueryTermSubset>,
+    right_term: &LocatedQueryTermSubset,
+) -> Result<Vec<(u32, Interned<ProximityCondition>)>> {
+    let right_ngram_length = right_term.term_ids.len();
+
+    let Some(left_term) = left_term else {
+        return Ok(vec![(
+            (right_ngram_length - 1) as u32,
+            conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
+        )])
    };

-    let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
-
-    let (right_start_position, right_ngram_length) =
-        (*right_positions.start(), right_positions.len());
-
-    let (left_term_interned, left_end_position) = match &from_node.data {
-        QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
-        QueryNodeData::Deleted => return Ok(vec![]),
-        QueryNodeData::Start => {
-            return Ok(vec![(
-                (right_ngram_length - 1) as u8,
-                Some(
-                    conditions_interner
-                        .insert(ProximityCondition::Term { term: *right_term_interned }),
-                ),
-            )])
-        }
-        QueryNodeData::End => return Ok(vec![]),
-    };
-
-    if left_end_position + 1 != right_start_position {
+    if left_term.positions.end() + 1 != *right_term.positions.start() {
        // We want to ignore this pair of terms
        // Unconditionally walk through the edge without computing the docids
        // This can happen when, in a query like `the sun flowers are beautiful`, the term
@ -47,30 +29,26 @@ pub fn build_edges(
        // The remaining query graph represents `the sun .. are beautiful`
        // but `sun` and `are` have no proximity condition between them
        return Ok(vec![(
-            (right_ngram_length - 1) as u8,
-            Some(
-                conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
-            ),
+            (right_ngram_length - 1) as u32,
+            conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
        )]);
    }

    let mut conditions = vec![];
    for cost in right_ngram_length..(7 + right_ngram_length) {
-        let cost = cost as u8;
        conditions.push((
-            cost,
-            Some(conditions_interner.insert(ProximityCondition::Uninit {
-                left_term: left_term_interned,
-                right_term: *right_term_interned,
-                right_term_ngram_len: right_ngram_length as u8,
-                cost,
-            })),
+            cost as u32,
+            conditions_interner.insert(ProximityCondition::Uninit {
+                left_term: left_term.clone(),
+                right_term: right_term.clone(),
+                cost: cost as u8,
+            }),
        ))
    }

    conditions.push((
-        (7 + right_ngram_length) as u8,
-        Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
+        (7 + right_ngram_length) as u32,
+        conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
    ));

    Ok(conditions)
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@ -1,49 +1,37 @@
 #![allow(clippy::too_many_arguments)]

-use std::iter::FromIterator;
-
 use super::ProximityCondition;
-use crate::search::new::db_cache::DatabaseCache;
-use crate::search::new::interner::{DedupInterner, Interned};
-use crate::search::new::query_term::{Phrase, QueryTerm};
-use crate::search::new::resolve_query_graph::QueryTermDocIdsCache;
+use crate::search::new::interner::Interned;
+use crate::search::new::query_term::{Phrase, QueryTermSubset};
+use crate::search::new::ranking_rule_graph::ComputedCondition;
+use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
 use crate::search::new::SearchContext;
-use crate::{CboRoaringBitmapCodec, Index, Result};
-use fxhash::FxHashSet;
-use heed::RoTxn;
+use crate::{CboRoaringBitmapCodec, Result};
 use roaring::RoaringBitmap;
+use std::collections::BTreeSet;

 pub fn compute_docids(
    ctx: &mut SearchContext,
    condition: &ProximityCondition,
    universe: &RoaringBitmap,
-) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
-    let (left_term, right_term, right_term_ngram_len, cost) = match condition {
-        ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
-            (*left_term, *right_term, *right_term_ngram_len, *cost)
+) -> Result<ComputedCondition> {
+    let (left_term, right_term, cost) = match condition {
+        ProximityCondition::Uninit { left_term, right_term, cost } => {
+            (left_term, right_term, *cost)
        }
        ProximityCondition::Term { term } => {
-            let term_v = ctx.term_interner.get(*term);
-            return Ok((
-                ctx.term_docids
-                    .get_query_term_docids(
-                        ctx.index,
-                        ctx.txn,
-                        &mut ctx.db_cache,
-                        &ctx.word_interner,
-                        &ctx.term_interner,
-                        &ctx.phrase_interner,
-                        *term,
-                    )?
-                    .clone(),
-                FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
-                FxHashSet::from_iter(term_v.all_phrases()),
-            ));
+            let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
+            docids &= universe;
+            return Ok(ComputedCondition {
+                docids,
+                universe_len: universe.len(),
+                start_term_subset: None,
+                end_term_subset: term.clone(),
+            });
        }
    };

-    let left_term = ctx.term_interner.get(left_term);
-    let right_term = ctx.term_interner.get(right_term);
+    let right_term_ngram_len = right_term.term_ids.len() as u8;

    // e.g. for the simple words `sun .. flower`
    // the cost is 5
@ -57,20 +45,13 @@ pub fn compute_docids(
    let forward_proximity = 1 + cost - right_term_ngram_len;
    let backward_proximity = cost - right_term_ngram_len;

-    let mut used_words = FxHashSet::default();
-    let mut used_phrases = FxHashSet::default();
-
    let mut docids = RoaringBitmap::new();

-    if let Some(right_prefix) = right_term.use_prefix_db {
-        for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
+    if let Some(right_prefix) = right_term.term_subset.use_prefix_db(ctx) {
+        for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)?
+        {
            compute_prefix_edges(
-                ctx.index,
-                ctx.txn,
-                &mut ctx.db_cache,
-                &mut ctx.term_docids,
-                &ctx.word_interner,
-                &ctx.phrase_interner,
+                ctx,
                left_word,
                right_prefix,
                left_phrase,
@ -78,8 +59,6 @@ pub fn compute_docids(
                backward_proximity,
                &mut docids,
                universe,
-                &mut used_words,
-                &mut used_phrases,
            )?;
        }
    }
@ -91,39 +70,60 @@ pub fn compute_docids(
    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
    // reached

-    for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
-        for (right_word, right_phrase) in first_word_of_term_iter(right_term, &ctx.phrase_interner)
-        {
+    for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
+        // Before computing the edges, check that the left word and left phrase
+        // aren't disjoint with the universe, but only do it if there is more than
+        // one word derivation to the right.
+        //
+        // This is an optimisation to avoid checking for an excessive number of
+        // pairs.
+        // WAIT, NO.
+        // This should only be done once per node.
+        // Here, we'll potentially do is.. 16 times?
+        // Maybe we should do it at edge-build time instead.
+        // Same for the future attribute ranking rule.
+        let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
+        if right_derivs.len() > 1 {
+            let universe = &universe;
+            if let Some(left_phrase) = left_phrase {
+                if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
+                    continue;
+                }
+            } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? {
+                let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?;
+                if universe.is_disjoint(&left_word_docids) {
+                    continue;
+                }
+            }
+        }
+
+        for (right_word, right_phrase) in right_derivs {
            compute_non_prefix_edges(
-                ctx.index,
-                ctx.txn,
-                &mut ctx.db_cache,
-                &mut ctx.term_docids,
-                &ctx.word_interner,
-                &ctx.phrase_interner,
+                ctx,
                left_word,
                right_word,
-                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
+                left_phrase,
+                right_phrase,
                forward_proximity,
                backward_proximity,
                &mut docids,
                universe,
-                &mut used_words,
-                &mut used_phrases,
            )?;
        }
    }

-    Ok((docids, used_words, used_phrases))
+    Ok(ComputedCondition {
+        docids,
+        universe_len: universe.len(),
+        // TODO: think about whether we want to reduce the subset,
+        // we probably should!
+        start_term_subset: Some(left_term.clone()),
+        end_term_subset: right_term.clone(),
+    })
 }

-fn compute_prefix_edges<'ctx>(
-    index: &Index,
-    txn: &'ctx RoTxn,
-    db_cache: &mut DatabaseCache<'ctx>,
-    term_docids: &mut QueryTermDocIdsCache,
-    word_interner: &DedupInterner<String>,
-    phrase_interner: &DedupInterner<Phrase>,
+fn compute_prefix_edges(
+    ctx: &mut SearchContext,
    left_word: Interned<String>,
    right_prefix: Interned<String>,
    left_phrase: Option<Interned<Phrase>>,
@ -131,21 +131,16 @@ fn compute_prefix_edges<'ctx>(
    backward_proximity: u8,
    docids: &mut RoaringBitmap,
    universe: &RoaringBitmap,
-    used_words: &mut FxHashSet<Interned<String>>,
-    used_phrases: &mut FxHashSet<Interned<Phrase>>,
 ) -> Result<()> {
+    let mut used_left_words = BTreeSet::new();
+    let mut used_left_phrases = BTreeSet::new();
+    let mut used_right_prefix = BTreeSet::new();
+
    let mut universe = universe.clone();
    if let Some(phrase) = left_phrase {
-        let phrase_docids = term_docids.get_phrase_docids(
-            index,
-            txn,
-            db_cache,
-            word_interner,
-            phrase_interner,
-            phrase,
-        )?;
+        let phrase_docids = ctx.get_phrase_docids(phrase)?;
        if !phrase_docids.is_empty() {
-            used_phrases.insert(phrase);
+            used_left_phrases.insert(phrase);
        }
        universe &= phrase_docids;
        if universe.is_empty() {
@ -153,36 +148,28 @@ fn compute_prefix_edges<'ctx>(
        }
    }

-    if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
-        index,
-        txn,
-        word_interner,
-        left_word,
-        right_prefix,
-        forward_proximity,
-    )? {
+    if let Some(new_docids) =
+        ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)?
+    {
        let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
        if !new_docids.is_empty() {
-            used_words.insert(left_word);
-            used_words.insert(right_prefix);
+            used_left_words.insert(left_word);
+            used_right_prefix.insert(right_prefix);
            *docids |= new_docids;
        }
    }

    // No swapping when computing the proximity between a phrase and a word
    if left_phrase.is_none() {
-        if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
-            index,
-            txn,
-            word_interner,
+        if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
            right_prefix,
            left_word,
            backward_proximity,
        )? {
            let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
            if !new_docids.is_empty() {
-                used_words.insert(left_word);
-                used_words.insert(right_prefix);
+                used_left_words.insert(left_word);
+                used_right_prefix.insert(right_prefix);
                *docids |= new_docids;
            }
        }
@ -191,72 +178,59 @@ fn compute_prefix_edges<'ctx>(
    Ok(())
 }

-fn compute_non_prefix_edges<'ctx>(
-    index: &Index,
-    txn: &'ctx RoTxn,
-    db_cache: &mut DatabaseCache<'ctx>,
-    term_docids: &mut QueryTermDocIdsCache,
-    word_interner: &DedupInterner<String>,
-    phrase_interner: &DedupInterner<Phrase>,
+fn compute_non_prefix_edges(
+    ctx: &mut SearchContext,
    word1: Interned<String>,
    word2: Interned<String>,
-    phrases: &[Interned<Phrase>],
+    left_phrase: Option<Interned<Phrase>>,
+    right_phrase: Option<Interned<Phrase>>,
    forward_proximity: u8,
    backward_proximity: u8,
    docids: &mut RoaringBitmap,
    universe: &RoaringBitmap,
-    used_words: &mut FxHashSet<Interned<String>>,
-    used_phrases: &mut FxHashSet<Interned<Phrase>>,
 ) -> Result<()> {
+    let mut used_left_phrases = BTreeSet::new();
+    let mut used_right_phrases = BTreeSet::new();
+    let mut used_left_words = BTreeSet::new();
+    let mut used_right_words = BTreeSet::new();
+
    let mut universe = universe.clone();
-    for phrase in phrases {
-        let phrase_docids = term_docids.get_phrase_docids(
-            index,
-            txn,
-            db_cache,
-            word_interner,
-            phrase_interner,
-            *phrase,
-        )?;
-        if !phrase_docids.is_empty() {
-            used_phrases.insert(*phrase);
-        }
+
+    for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
+        let phrase_docids = ctx.get_phrase_docids(phrase)?;
        universe &= phrase_docids;
        if universe.is_empty() {
            return Ok(());
        }
    }
-    if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
-        index,
-        txn,
-        word_interner,
-        word1,
-        word2,
-        forward_proximity,
-    )? {
+    if let Some(left_phrase) = left_phrase {
+        used_left_phrases.insert(left_phrase);
+    }
+    if let Some(right_phrase) = right_phrase {
+        used_right_phrases.insert(right_phrase);
+    }
+
+    if let Some(new_docids) =
+        ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
+    {
        let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
        if !new_docids.is_empty() {
-            used_words.insert(word1);
-            used_words.insert(word2);
+            used_left_words.insert(word1);
+            used_right_words.insert(word2);
            *docids |= new_docids;
        }
    }
    if backward_proximity >= 1
            // no swapping when either term is a phrase
-            && phrases.is_empty()
+            && left_phrase.is_none() && right_phrase.is_none()
    {
-        if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
-            index,
-            txn,
-            word_interner,
-            word2,
-            word1,
-            backward_proximity,
-        )? {
+        if let Some(new_docids) =
+            ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
+        {
            let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
            if !new_docids.is_empty() {
-                used_words.insert(word1);
-                used_words.insert(word2);
+                used_left_words.insert(word2);
+                used_right_words.insert(word1);
                *docids |= new_docids;
            }
        }
@ -265,25 +239,41 @@ fn compute_non_prefix_edges<'ctx>(
    Ok(())
 }

-fn last_word_of_term_iter<'t>(
-    t: &'t QueryTerm,
-    phrase_interner: &'t DedupInterner<Phrase>,
-) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
-    t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
-        move |p| {
-            let phrase = phrase_interner.get(p);
-            phrase.words.last().unwrap().map(|last| (Some(p), last))
-        },
-    ))
+fn last_words_of_term_derivations(
+    ctx: &mut SearchContext,
+    t: &QueryTermSubset,
+) -> Result<BTreeSet<(Option<Interned<Phrase>>, Interned<String>)>> {
+    let mut result = BTreeSet::new();
+
+    for w in t.all_single_words_except_prefix_db(ctx)? {
+        result.insert((None, w));
+    }
+    for p in t.all_phrases(ctx)? {
+        let phrase = ctx.phrase_interner.get(p);
+        let last_term_of_phrase = phrase.words.last().unwrap();
+        if let Some(last_word) = last_term_of_phrase {
+            result.insert((Some(p), *last_word));
+        }
+    }
+
+    Ok(result)
 }
-fn first_word_of_term_iter<'t>(
-    t: &'t QueryTerm,
-    phrase_interner: &'t DedupInterner<Phrase>,
-) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
-    t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
-        move |p| {
-            let phrase = phrase_interner.get(p);
-            phrase.words.first().unwrap().map(|first| (first, Some(p)))
-        },
-    ))
+fn first_word_of_term_iter(
+    ctx: &mut SearchContext,
+    t: &QueryTermSubset,
+) -> Result<BTreeSet<(Interned<String>, Option<Interned<Phrase>>)>> {
+    let mut result = BTreeSet::new();
+    let all_words = t.all_single_words_except_prefix_db(ctx)?;
+    for w in all_words {
+        result.insert((w, None));
+    }
+    for p in t.all_phrases(ctx)? {
+        let phrase = ctx.phrase_interner.get(p);
+        let first_term_of_phrase = phrase.words.first().unwrap();
+        if let Some(first_word) = first_term_of_phrase {
+            result.insert((*first_word, Some(p)));
+        }
+    }
+
+    Ok(result)
 }
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@ -1,27 +1,19 @@
 pub mod build;
 pub mod compute_docids;

-use fxhash::FxHashSet;
 use roaring::RoaringBitmap;

-use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
+use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
 use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
 use crate::search::new::logger::SearchLogger;
-use crate::search::new::query_term::{Phrase, QueryTerm};
+use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;

 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum ProximityCondition {
-    Uninit {
-        left_term: Interned<QueryTerm>,
-        right_term: Interned<QueryTerm>,
-        right_term_ngram_len: u8,
-        cost: u8,
-    },
-    Term {
-        term: Interned<QueryTerm>,
-    },
+    Uninit { left_term: LocatedQueryTermSubset, right_term: LocatedQueryTermSubset, cost: u8 },
+    Term { term: LocatedQueryTermSubset },
 }

 pub enum ProximityGraph {}
@ -33,18 +25,17 @@ impl RankingRuleGraphTrait for ProximityGraph {
        ctx: &mut SearchContext,
        condition: &Self::Condition,
        universe: &RoaringBitmap,
-    ) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
-    {
+    ) -> Result<ComputedCondition> {
        compute_docids::compute_docids(ctx, condition, universe)
    }

    fn build_edges(
        ctx: &mut SearchContext,
        conditions_interner: &mut DedupInterner<Self::Condition>,
-        source_node: &QueryNode,
-        dest_node: &QueryNode,
-    ) -> Result<Vec<(u8, Option<Interned<Self::Condition>>)>> {
-        build::build_edges(ctx, conditions_interner, source_node, dest_node)
+        source_term: Option<&LocatedQueryTermSubset>,
+        dest_term: &LocatedQueryTermSubset,
+    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
+        build::build_edges(ctx, conditions_interner, source_term, dest_term)
    }

    fn log_state(
@ -52,8 +43,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
        paths: &[Vec<Interned<ProximityCondition>>],
        dead_ends_cache: &DeadEndsCache<Self::Condition>,
        universe: &RoaringBitmap,
-        distances: &MappedInterner<QueryNode, Vec<u16>>,
-        cost: u16,
+        distances: &MappedInterner<QueryNode, Vec<u64>>,
+        cost: u64,
        logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
        logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost);
@ -66,8 +57,9 @@ impl RankingRuleGraphTrait for ProximityGraph {
                Ok(format!("{cost}: cost"))
            }
            ProximityCondition::Term { term } => {
-                let term = ctx.term_interner.get(*term);
-                Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
+                let original_term = ctx.term_interner.get(term.term_subset.original);
+                let original_word = ctx.word_interner.get(original_term.original);
+                Ok(format!("{original_word} : exists"))
            }
        }
    }