From d6a7c28e4d902f0b6bb46845c2431f562400d5d4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 12 Apr 2023 11:40:44 +0200 Subject: [PATCH] Implement the attribute ranking rule edge computation --- milli/src/search/new/db_cache.rs | 67 +++++++++++++++++++ milli/src/search/new/query_term/phrase.rs | 4 ++ .../new/ranking_rule_graph/attribute/mod.rs | 51 +++++++------- .../src/search/new/ranking_rule_graph/mod.rs | 2 + 4 files changed, 98 insertions(+), 26 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 6193f4c58..c32c7ba79 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -34,6 +34,9 @@ pub struct DatabaseCache<'ctx> { pub words_fst: Option>>, pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_prefix_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_fids: FxHashMap, Vec>, + pub word_prefix_fids: FxHashMap, Vec>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( @@ -284,4 +287,68 @@ impl<'ctx> SearchContext<'ctx> { .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) .transpose() } + + pub fn get_db_word_prefix_fid_docids( + &mut self, + word_prefix: Interned, + fid: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word_prefix, fid), + &(self.word_interner.get(word_prefix).as_str(), fid), + &mut self.db_cache.word_prefix_fid_docids, + self.index.word_prefix_fid_docids.remap_data_type::(), + ) + } + + pub fn get_db_word_fids(&mut self, word: Interned) -> Result> { + let fids = match self.db_cache.word_fids.entry(word) { + Entry::Occupied(fids) => fids.get().clone(), + Entry::Vacant(entry) => { + let key = self.word_interner.get(word).as_bytes(); + let mut fids = vec![]; + let remap_key_type = self + .index + .word_fid_docids + .remap_types::() + .prefix_iter(self.txn, key)? + .remap_key_type::(); + for result in remap_key_type { + let ((_, fid), value) = result?; + // filling other caches to avoid searching for them again + self.db_cache.word_fid_docids.insert((word, fid), Some(value)); + fids.push(fid); + } + entry.insert(fids.clone()); + fids + } + }; + Ok(fids) + } + + pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned) -> Result> { + let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) { + Entry::Occupied(fids) => fids.get().clone(), + Entry::Vacant(entry) => { + let key = self.word_interner.get(word_prefix).as_bytes(); + let mut fids = vec![]; + let remap_key_type = self + .index + .word_prefix_fid_docids + .remap_types::() + .prefix_iter(self.txn, key)? + .remap_key_type::(); + for result in remap_key_type { + let ((_, fid), value) = result?; + // filling other caches to avoid searching for them again + self.db_cache.word_prefix_fid_docids.insert((word_prefix, fid), Some(value)); + fids.push(fid); + } + entry.insert(fids.clone()); + fids + } + }; + Ok(fids) + } } diff --git a/milli/src/search/new/query_term/phrase.rs b/milli/src/search/new/query_term/phrase.rs index 2ea8e0d39..033c5cf12 100644 --- a/milli/src/search/new/query_term/phrase.rs +++ b/milli/src/search/new/query_term/phrase.rs @@ -13,4 +13,8 @@ impl Interned { let p = ctx.phrase_interner.get(self); p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ") } + pub fn words(self, ctx: &SearchContext) -> Vec>> { + let p = ctx.phrase_interner.get(self); + p.words.clone() + } } diff --git a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs index cfa8a1fbc..2b25adc7e 100644 --- a/milli/src/search/new/ranking_rule_graph/attribute/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/attribute/mod.rs @@ -1,3 +1,4 @@ +use fxhash::FxHashSet; use roaring::RoaringBitmap; use super::{ComputedCondition, RankingRuleGraphTrait}; @@ -10,7 +11,7 @@ use crate::Result; #[derive(Clone, PartialEq, Eq, Hash)] pub struct AttributeCondition { term: LocatedQueryTermSubset, - nbr_typos: u8, + fid: u16, } pub enum AttributeGraph {} @@ -44,39 +45,37 @@ impl RankingRuleGraphTrait for AttributeGraph { ) -> Result)>> { let term = to_term; - let mut edges = vec![]; + let mut all_fields = FxHashSet::default(); for word in term.term_subset.all_single_words_except_prefix_db(ctx)? { - // ... + let fields = ctx.get_db_word_fids(word)?; + all_fields.extend(fields); } - // Ngrams have a base typo cost - // 2-gram -> equivalent to 1 typo - // 3-gram -> equivalent to 2 typos - let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; + for phrase in term.term_subset.all_phrases(ctx)? { + for &word in phrase.words(ctx).iter().flatten() { + let fields = ctx.get_db_word_fids(word)?; + all_fields.extend(fields); + } + } - for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { - let mut term = term.clone(); - match nbr_typos { - 0 => { - term.term_subset.clear_one_typo_subset(); - term.term_subset.clear_two_typo_subset(); - } - 1 => { - term.term_subset.clear_zero_typo_subset(); - term.term_subset.clear_two_typo_subset(); - } - 2 => { - term.term_subset.clear_zero_typo_subset(); - term.term_subset.clear_one_typo_subset(); - } - _ => panic!(), - }; + if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) { + let fields = ctx.get_db_word_prefix_fids(word_prefix)?; + all_fields.extend(fields); + } + let mut edges = vec![]; + for fid in all_fields { + // TODO: We can improve performances and relevancy by storing + // the term subsets associated to each field ids fetched. edges.push(( - nbr_typos as u32 + base_cost, - conditions_interner.insert(AttributeCondition { term, nbr_typos }), + fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. + conditions_interner.insert(AttributeCondition { + term: term.clone(), // TODO remove this ugly clone + fid, + }), )); } + Ok(edges) } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 6a9bfff93..cccc0643a 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -16,6 +16,8 @@ mod exactness; mod proximity; /// Implementation of the `typo` ranking rule mod typo; +/// Implementation of the `attribute` ranking rule +mod attribute; use std::hash::Hash;