From 05fe856e6e75ce4e9d39c7f3123da85611edb4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Mar 2023 13:02:55 +0100 Subject: [PATCH] Merge forward and backward proximity conditions in proximity graph --- milli/src/search/new/mod.rs | 4 +- .../new/ranking_rule_graph/proximity/build.rs | 85 ++++++++----------- .../proximity/compute_docids.rs | 10 +-- .../new/ranking_rule_graph/proximity/mod.rs | 9 +- 4 files changed, 50 insertions(+), 58 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 02cd7b1de..d893691b8 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -303,7 +303,7 @@ mod tests { let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, - "releases from poison by the government", + "which a the releases from poison by the government", // "sun flower s are the best", // "zero config", TermsMatchingStrategy::Last, @@ -359,7 +359,7 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("releases from poison by the government"); + s.query("which a the releases from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index c7eaa5d0c..8ae634fbf 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -94,7 +94,7 @@ pub fn build_edges<'ctx>( )]); } - let mut cost_proximity_word_pairs = BTreeMap::>>::new(); + let mut cost_word_pairs = BTreeMap::>::new(); if let Some(right_prefix) = right_term.use_prefix_db { for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) { @@ -106,7 +106,7 @@ pub fn build_edges<'ctx>( right_ngram_length, left_word, right_prefix, - &mut cost_proximity_word_pairs, + &mut cost_word_pairs, left_phrase, )?; } @@ -129,28 +129,22 @@ pub fn build_edges<'ctx>( right_ngram_length, left_word, right_word, - &mut cost_proximity_word_pairs, + &mut cost_word_pairs, &[left_phrase, right_phrase].iter().copied().flatten().collect::>(), )?; } } - let mut new_edges = cost_proximity_word_pairs + let mut new_edges = cost_word_pairs .into_iter() - .flat_map(|(cost, proximity_word_pairs)| { - let mut edges = vec![]; - for (proximity, word_pairs) in proximity_word_pairs { - edges.push(( - cost, - EdgeCondition::Conditional(conditions_interner.insert( - ProximityCondition::Pairs { - pairs: word_pairs.into_boxed_slice(), - proximity, - }, - )), - )) - } - edges + .map(|(cost, word_pairs)| { + ( + cost, + EdgeCondition::Conditional( + conditions_interner + .insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }), + ), + ) }) .collect::>(); new_edges.push(( @@ -170,7 +164,7 @@ fn add_prefix_edges<'ctx>( right_ngram_length: usize, left_word: Interned, right_prefix: Interned, - cost_proximity_word_pairs: &mut BTreeMap>>, + cost_proximity_word_pairs: &mut BTreeMap>, left_phrase: Option>, ) -> Result<()> { for proximity in 1..=(8 - right_ngram_length) { @@ -188,16 +182,12 @@ fn add_prefix_edges<'ctx>( )? .is_some() { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::WordPrefix { - phrases: left_phrase.into_iter().collect(), - left: left_word, - right_prefix, - }); + cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefix { + phrases: left_phrase.into_iter().collect(), + left: left_word, + right_prefix, + proximity: proximity as u8, + }); } // No swapping when computing the proximity between a phrase and a word @@ -213,12 +203,11 @@ fn add_prefix_edges<'ctx>( )? .is_some() { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::WordPrefixSwapped { left_prefix: right_prefix, right: left_word }); + cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefixSwapped { + left_prefix: right_prefix, + right: left_word, + proximity: proximity as u8 - 1, + }); } } Ok(()) @@ -232,7 +221,7 @@ fn add_non_prefix_edges<'ctx>( right_ngram_length: usize, word1: Interned, word2: Interned, - cost_proximity_word_pairs: &mut BTreeMap>>, + cost_proximity_word_pairs: &mut BTreeMap>, phrases: &[Interned], ) -> Result<()> { for proximity in 1..=(8 - right_ngram_length) { @@ -248,12 +237,12 @@ fn add_non_prefix_edges<'ctx>( )? .is_some() { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8) - .or_default() - .push(WordPair::Words { phrases: phrases.to_vec(), left: word1, right: word2 }); + cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words { + phrases: phrases.to_vec(), + left: word1, + right: word2, + proximity: proximity as u8, + }); } if proximity > 1 // no swapping when either term is a phrase @@ -269,12 +258,12 @@ fn add_non_prefix_edges<'ctx>( )? .is_some() { - cost_proximity_word_pairs - .entry(cost) - .or_default() - .entry(proximity as u8 - 1) - .or_default() - .push(WordPair::Words { phrases: vec![], left: word2, right: word1 }); + cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words { + phrases: vec![], + left: word2, + right: word1, + proximity: proximity as u8 - 1, + }); } } Ok(()) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 1123692f3..cdf167cb0 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -18,7 +18,7 @@ pub fn compute_docids<'ctx>( phrase_interner, term_interner, } = ctx; - let (pairs, proximity) = match edge { + let pairs = match edge { ProximityCondition::Term { term } => { return term_docids .get_query_term_docids( @@ -32,12 +32,12 @@ pub fn compute_docids<'ctx>( ) .cloned() } - ProximityCondition::Pairs { pairs, proximity } => (pairs, proximity), + ProximityCondition::Pairs { pairs } => pairs, }; let mut pair_docids = RoaringBitmap::new(); for pair in pairs.iter() { let pair = match pair { - WordPair::Words { phrases, left, right } => { + WordPair::Words { phrases, left, right, proximity } => { let mut docids = db_cache .get_word_pair_proximity_docids( index, @@ -64,7 +64,7 @@ pub fn compute_docids<'ctx>( } docids } - WordPair::WordPrefix { phrases, left, right_prefix } => { + WordPair::WordPrefix { phrases, left, right_prefix, proximity } => { let mut docids = db_cache .get_word_prefix_pair_proximity_docids( index, @@ -91,7 +91,7 @@ pub fn compute_docids<'ctx>( } docids } - WordPair::WordPrefixSwapped { left_prefix, right } => db_cache + WordPair::WordPrefixSwapped { left_prefix, right, proximity } => db_cache .get_prefix_word_pair_proximity_docids( index, txn, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 427a1e904..65c282108 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -18,22 +18,25 @@ pub enum WordPair { phrases: Vec>, left: Interned, right: Interned, + proximity: u8, }, WordPrefix { phrases: Vec>, left: Interned, right_prefix: Interned, + proximity: u8, }, WordPrefixSwapped { left_prefix: Interned, right: Interned, + proximity: u8, }, } #[derive(Clone, PartialEq, Eq, Hash)] pub enum ProximityCondition { Term { term: Interned }, - Pairs { pairs: Box<[WordPair]>, proximity: u8 }, + Pairs { pairs: Box<[WordPair]> }, } pub enum ProximityGraph {} @@ -46,8 +49,8 @@ impl RankingRuleGraphTrait for ProximityGraph { ProximityCondition::Term { term } => { format!("term {term}") } - ProximityCondition::Pairs { pairs, proximity } => { - format!("prox {proximity}, {} pairs", pairs.len()) + ProximityCondition::Pairs { pairs } => { + format!("pairs {}", pairs.len()) } } }