From 18d578dfc439db736892d58ccda6c60d323dca26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 15 Sep 2022 09:34:35 +0200 Subject: [PATCH] Adjust some algorithms using DBs of word pair proximities --- milli/src/search/criteria/exactness.rs | 1 + milli/src/search/criteria/mod.rs | 183 ++++++++++++++++++++----- 2 files changed, 153 insertions(+), 31 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index e7775423c..5327f13e4 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -226,6 +226,7 @@ fn resolve_state( } // compute intersection on pair of words with a proximity of 0. Phrase(phrase) => { + // TODO: use resolve_phrase here let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); for words in phrase.windows(2) { if let [left, right] = words { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 86cec1ddc..cefc071ee 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -71,6 +71,7 @@ pub trait Context<'c> { fn exact_word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; fn exact_word_prefix_docids(&self, word: &str) -> heed::Result>; + fn word_pair_proximity_docids( &self, left: &str, @@ -83,6 +84,12 @@ pub trait Context<'c> { right: &str, proximity: u8, ) -> heed::Result>; + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; fn docid_words_positions( @@ -111,6 +118,68 @@ pub struct CriteriaBuilder<'t> { words_prefixes_fst: fst::Set>, } +/// Return the docids for the following word pairs and proximities using [`Context::word_pair_proximity_docids`]. +/// * `left, right, prox` (leftward proximity) +/// * `right, left, prox-1` (rightward proximity) +/// +/// ## Example +/// For a document with the text `the good fox eats the apple`, we have: +/// * `rightward_proximity(the, eats) = 3` +/// * `leftward_proximity(eats, the) = 1` +/// +/// So both the expressions `word_pair_overall_proximity_docids(ctx, the, eats, 3)` +/// and `word_pair_overall_proximity_docids(ctx, the, eats, 2)` would return a bitmap containing +/// the id of this document. +fn word_pair_overall_proximity_docids( + ctx: &dyn Context, + left: &str, + right: &str, + prox: u8, +) -> heed::Result> { + let rightward = ctx.word_pair_proximity_docids(left, right, prox)?; + let leftward = + if prox > 1 { ctx.word_pair_proximity_docids(right, left, prox - 1)? } else { None }; + if let Some(mut all) = rightward { + if let Some(leftward) = leftward { + all |= leftward; + } + Ok(Some(all)) + } else { + Ok(leftward) + } +} + +/// This function works identically to [`word_pair_overall_proximity_docids`] except that the +/// right word is replaced by a prefix string. +/// +/// It will return None if no documents were found or if the prefix does not exist in the +/// `word_prefix_pair_proximity_docids` database. +fn word_prefix_pair_overall_proximity_docids( + ctx: &dyn Context, + left: &str, + prefix: &str, + proximity: u8, +) -> heed::Result> { + // We retrieve the docids for the original and swapped word pairs: + // A: word1 prefix2 proximity + // B: prefix2 word1 proximity-1 + let rightward = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?; + + let leftward = if proximity > 1 { + ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)? + } else { + None + }; + if let Some(mut all) = rightward { + if let Some(leftward) = leftward { + all |= leftward; + } + Ok(Some(all)) + } else { + Ok(leftward) + } +} + impl<'c> Context<'c> for CriteriaBuilder<'c> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) @@ -138,18 +207,24 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (proximity, left, right); - self.index.word_pair_proximity_docids.get(self.rtxn, &key) + self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right)) } fn word_prefix_pair_proximity_docids( &self, left: &str, + prefix: &str, + proximity: u8, + ) -> heed::Result> { + self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix)) + } + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, right: &str, proximity: u8, ) -> heed::Result> { - let key = (proximity, left, right); - self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) + self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right)) } fn words_fst<'t>(&self) -> &'t fst::Set> { @@ -353,17 +428,34 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), + if s1 == s2 { + continue; + } + if dist == 0 { + match ctx.word_pair_proximity_docids(s1, s2, 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this pair, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } else { + let mut bitmap = RoaringBitmap::new(); + for dist in 0..=dist { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmap |= m, + None => {} + } + } + if bitmap.is_empty() { + return Ok(bitmap); + } else { + bitmaps.push(bitmap); + } } } } @@ -387,7 +479,7 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result, U: AsRef>( +fn all_word_pair_overall_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], right_words: &[(U, u8)], @@ -396,9 +488,9 @@ fn all_word_pair_proximity_docids, U: AsRef>( let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { - let current_docids = ctx - .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? - .unwrap_or_default(); + let current_docids = + word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)? + .unwrap_or_default(); docids |= current_docids; } } @@ -472,7 +564,8 @@ fn query_pair_proximity_docids( match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix { - match ctx.word_prefix_pair_proximity_docids( + match word_prefix_pair_overall_proximity_docids( + ctx, left.as_str(), right.as_str(), proximity, @@ -480,7 +573,12 @@ fn query_pair_proximity_docids( Some(docids) => Ok(docids), None => { let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids( + ctx, + &[(left, 0)], + &r_words, + proximity, + ) } } } else { @@ -495,7 +593,8 @@ fn query_pair_proximity_docids( if prefix { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let current_docids = match ctx.word_prefix_pair_proximity_docids( + let current_docids = match word_prefix_pair_overall_proximity_docids( + ctx, left.as_str(), right.as_str(), proximity, @@ -504,19 +603,24 @@ fn query_pair_proximity_docids( None => { let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids( + ctx, + &[(left, 0)], + &r_words, + proximity, + ) } }?; docids |= current_docids; } Ok(docids) } else { - all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) + all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } } (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } ( QueryKind::Tolerant { typo: l_typo, word: left }, @@ -525,7 +629,7 @@ fn query_pair_proximity_docids( let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + all_word_pair_overall_proximity_docids(ctx, &l_words, &r_words, proximity) } } } @@ -552,6 +656,7 @@ pub mod test { exact_word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, docid_words: HashMap>, } @@ -588,13 +693,22 @@ pub mod test { fn word_prefix_pair_proximity_docids( &self, - left: &str, - right: &str, + word: &str, + prefix: &str, proximity: u8, ) -> heed::Result> { - let key = (left.to_string(), right.to_string(), proximity.into()); + let key = (word.to_string(), prefix.to_string(), proximity.into()); Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) } + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + word: &str, + proximity: u8, + ) -> heed::Result> { + let key = (prefix.to_string(), word.to_string(), proximity.into()); + Ok(self.prefix_word_pair_proximity_docids.get(&key).cloned()) + } fn words_fst<'t>(&self) -> &'t fst::Set> { &self.words_fst @@ -708,6 +822,8 @@ pub mod test { let mut word_pair_proximity_docids = HashMap::new(); let mut word_prefix_pair_proximity_docids = HashMap::new(); + let mut prefix_word_pair_proximity_docids = HashMap::new(); + for (lword, lcandidates) in &word_docids { for (rword, rcandidates) in &word_docids { if lword == rword { @@ -740,15 +856,19 @@ pub mod test { let lposition = docid_words.iter().position(|w| w == lword).unwrap(); let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); - let key = if lposition < rposition { - (s(lword), s(pword), (rposition - lposition) as i32) + if lposition < rposition { + let key = (s(lword), s(pword), (rposition - lposition) as i32); + let docids = word_prefix_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); } else { - (s(lword), s(pword), (lposition - rposition + 1) as i32) + let key = (s(lword), s(pword), (lposition - rposition) as i32); + let docids = prefix_word_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); }; - let docids = word_prefix_pair_proximity_docids - .entry(key) - .or_insert(RoaringBitmap::new()); - docids.push(candidate); } } } @@ -766,6 +886,7 @@ pub mod test { exact_word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, docid_words, } }