Implement the attribute ranking rule edge computation

2025-03-29 11:00:39 +01:00 · 2023-04-12 11:40:44 +02:00 · 2023-04-12 11:40:44 +02:00 · d6a7c28e4d
commit d6a7c28e4d
parent e55efc419e
4 changed files with 98 additions and 26 deletions
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@ -34,6 +34,9 @@ pub struct DatabaseCache<'ctx> {
    pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
    pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
    pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
    pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
    pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
    pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
 }
 impl<'ctx> DatabaseCache<'ctx> {
    fn get_value<'v, K1, KC>(
@ -284,4 +287,68 @@ impl<'ctx> SearchContext<'ctx> {
        .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
        .transpose()
    }
    /// Looks up the raw value stored in the `word_prefix_fid_docids` database
    /// for the pair (`word_prefix`, `fid`).
    ///
    /// The lookup goes through `DatabaseCache::get_value`, using
    /// `db_cache.word_prefix_fid_docids` as the backing cache and
    /// `(word_prefix, fid)` as the cache key. The value is handed back as
    /// undecoded bytes (the database is remapped to `ByteSlice` data).
    ///
    /// # Errors
    ///
    /// Forwards whatever error `DatabaseCache::get_value` reports.
    pub fn get_db_word_prefix_fid_docids(
        &mut self,
        word_prefix: Interned<String>,
        fid: u16,
    ) -> Result<Option<&'ctx [u8]>> {
        // Resolve the interned prefix to the string used as the database key.
        let prefix_str = self.word_interner.get(word_prefix).as_str();
        // Only the data type is remapped: values come back as raw bytes.
        let raw_database = self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>();
        let cache = &mut self.db_cache.word_prefix_fid_docids;

        DatabaseCache::get_value(
            self.txn,
            (word_prefix, fid),
            &(prefix_str, fid),
            cache,
            raw_database,
        )
    }
    /// Returns the ids of every field for which the `word_fid_docids` database
    /// holds an entry for `word`.
    ///
    /// The list is memoised in `db_cache.word_fids`, so the database is scanned at
    /// most once per word. While scanning, each `(word, fid)` value encountered is
    /// also stored in `db_cache.word_fid_docids` so that later per-field lookups
    /// are served from the cache.
    ///
    /// # Errors
    ///
    /// Propagates any error raised when opening the prefix iterator or when
    /// reading/decoding one of its entries.
    pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
        let fids = match self.db_cache.word_fids.entry(word) {
            Entry::Occupied(fids) => fids.get().clone(),
            Entry::Vacant(entry) => {
                // The scan must only match keys belonging to exactly `word`.
                // Using the bare word bytes as the prefix would also match every
                // longer word starting with it (e.g. "he" would match "hello"),
                // returning foreign field ids and caching foreign docids under
                // `(word, fid)`. Appending the separator byte restricts the scan.
                // NOTE(review): assumes `StrBEU16Codec` encodes keys as
                // `<word bytes> 0 <big-endian u16>` — confirm against the codec.
                let mut key = self.word_interner.get(word).as_bytes().to_owned();
                key.push(0);
                let mut fids = vec![];
                let remap_key_type = self
                    .index
                    .word_fid_docids
                    .remap_types::<ByteSlice, ByteSlice>()
                    .prefix_iter(self.txn, &key)?
                    .remap_key_type::<StrBEU16Codec>();
                for result in remap_key_type {
                    let ((_, fid), value) = result?;
                    // filling other caches to avoid searching for them again
                    self.db_cache.word_fid_docids.insert((word, fid), Some(value));
                    fids.push(fid);
                }
                entry.insert(fids.clone());
                fids
            }
        };
        Ok(fids)
    }
    /// Returns the ids of every field for which the `word_prefix_fid_docids`
    /// database holds an entry for `word_prefix`.
    ///
    /// The list is memoised in `db_cache.word_prefix_fids`, so the database is
    /// scanned at most once per prefix. While scanning, each `(word_prefix, fid)`
    /// value encountered is also stored in `db_cache.word_prefix_fid_docids` so
    /// that later per-field lookups are served from the cache.
    ///
    /// # Errors
    ///
    /// Propagates any error raised when opening the prefix iterator or when
    /// reading/decoding one of its entries.
    pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned<String>) -> Result<Vec<u16>> {
        let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) {
            Entry::Occupied(fids) => fids.get().clone(),
            Entry::Vacant(entry) => {
                // The scan must only match keys stored for exactly `word_prefix`.
                // Using the bare bytes as the scan prefix would also match every
                // longer stored prefix starting with it, returning foreign field
                // ids and caching foreign docids under `(word_prefix, fid)`.
                // Appending the separator byte restricts the scan.
                // NOTE(review): assumes `StrBEU16Codec` encodes keys as
                // `<string bytes> 0 <big-endian u16>` — confirm against the codec.
                let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
                key.push(0);
                let mut fids = vec![];
                let remap_key_type = self
                    .index
                    .word_prefix_fid_docids
                    .remap_types::<ByteSlice, ByteSlice>()
                    .prefix_iter(self.txn, &key)?
                    .remap_key_type::<StrBEU16Codec>();
                for result in remap_key_type {
                    let ((_, fid), value) = result?;
                    // filling other caches to avoid searching for them again
                    self.db_cache.word_prefix_fid_docids.insert((word_prefix, fid), Some(value));
                    fids.push(fid);
                }
                entry.insert(fids.clone());
                fids
            }
        };
        Ok(fids)
    }
 }
--- a/milli/src/search/new/query_term/phrase.rs
+++ b/milli/src/search/new/query_term/phrase.rs
@ -13,4 +13,8 @@ impl Interned<Phrase> {
        let p = ctx.phrase_interner.get(self);
        p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ")
    }
    /// Returns an owned copy of this phrase's `words` list, as stored in the
    /// context's phrase interner.
    ///
    /// Entries may be `None`; they are returned as-is, in their original order.
    pub fn words(self, ctx: &SearchContext) -> Vec<Option<Interned<String>>> {
        ctx.phrase_interner.get(self).words.clone()
    }
 }
--- a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs
@ -1,3 +1,4 @@
 use fxhash::FxHashSet;
 use roaring::RoaringBitmap;
 use super::{ComputedCondition, RankingRuleGraphTrait};
@ -10,7 +11,7 @@ use crate::Result;
 /// Condition stored on an edge of the `attribute` ranking rule graph: a located
 /// query term subset together with the field id the edge was built for.
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct AttributeCondition {
    term: LocatedQueryTermSubset,
-    nbr_typos: u8,
+    fid: u16,
 }
 pub enum AttributeGraph {}
@ -44,39 +45,37 @@ impl RankingRuleGraphTrait for AttributeGraph {
    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
        let term = to_term;
-        let mut edges = vec![];
+        let mut all_fields = FxHashSet::default();
        for word in term.term_subset.all_single_words_except_prefix_db(ctx)? {
-            // ...
+            let fields = ctx.get_db_word_fids(word)?;
            all_fields.extend(fields);
        }
-        // Ngrams have a base typo cost
+        for phrase in term.term_subset.all_phrases(ctx)? {
-        // 2-gram -> equivalent to 1 typo
+            for &word in phrase.words(ctx).iter().flatten() {
-        // 3-gram -> equivalent to 2 typos
+                let fields = ctx.get_db_word_fids(word)?;
-        let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
+                all_fields.extend(fields);
            }
        }
-        for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
+        if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) {
-            let mut term = term.clone();
+            let fields = ctx.get_db_word_prefix_fids(word_prefix)?;
-            match nbr_typos {
+            all_fields.extend(fields);
-                0 => {
+        }
                    term.term_subset.clear_one_typo_subset();
                    term.term_subset.clear_two_typo_subset();
                }
                1 => {
                    term.term_subset.clear_zero_typo_subset();
                    term.term_subset.clear_two_typo_subset();
                }
                2 => {
                    term.term_subset.clear_zero_typo_subset();
                    term.term_subset.clear_one_typo_subset();
                }
                _ => panic!(),
            };
        let mut edges = vec![];
        for fid in all_fields {
            // TODO: We can improve performances and relevancy by storing
            //       the term subsets associated to each field ids fetched.
            edges.push((
-                nbr_typos as u32 + base_cost,
+                fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
-                conditions_interner.insert(AttributeCondition { term, nbr_typos }),
+                conditions_interner.insert(AttributeCondition {
                    term: term.clone(), // TODO remove this ugly clone
                    fid,
                }),
            ));
        }
        Ok(edges)
    }
 }
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@ -16,6 +16,8 @@ mod exactness;
 mod proximity;
 /// Implementation of the `typo` ranking rule
 mod typo;
 /// Implementation of the `attribute` ranking rule
 mod attribute;
 use std::hash::Hash;