Adjust some algorithms using DBs of word pair proximities

This commit is contained in:
Loïc Lecrenier 2022-09-15 09:34:35 +02:00 committed by Loïc Lecrenier
parent 072b576514
commit 18d578dfc4
2 changed files with 153 additions and 31 deletions

View File

@ -226,6 +226,7 @@ fn resolve_state(
} }
// compute intersection on pair of words with a proximity of 0. // compute intersection on pair of words with a proximity of 0.
Phrase(phrase) => { Phrase(phrase) => {
// TODO: use resolve_phrase here
let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1));
for words in phrase.windows(2) { for words in phrase.windows(2) {
if let [left, right] = words { if let [left, right] = words {

View File

@ -71,6 +71,7 @@ pub trait Context<'c> {
fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_pair_proximity_docids( fn word_pair_proximity_docids(
&self, &self,
left: &str, left: &str,
@ -83,6 +84,12 @@ pub trait Context<'c> {
right: &str, right: &str,
proximity: u8, proximity: u8,
) -> heed::Result<Option<RoaringBitmap>>; ) -> heed::Result<Option<RoaringBitmap>>;
/// Look up the document ids recorded for the pair (`prefix`, `right`) at the given
/// `proximity`, i.e. with the prefix on the left-hand side of the pair.
///
/// This is the counterpart of `word_prefix_pair_proximity_docids`, where the prefix is
/// on the right-hand side instead.
///
/// The implementations visible in this file perform a single keyed lookup in a
/// `prefix_word_pair_proximity_docids` store and return `Ok(None)` when the key is absent.
// NOTE(review): presumably `proximity` is expected to be >= 1, as for the other
// pair-proximity lookups — confirm against the indexing code.
fn prefix_word_pair_proximity_docids(
&self,
prefix: &str,
right: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>>;
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
fn in_prefix_cache(&self, word: &str) -> bool; fn in_prefix_cache(&self, word: &str) -> bool;
fn docid_words_positions( fn docid_words_positions(
@ -111,6 +118,68 @@ pub struct CriteriaBuilder<'t> {
words_prefixes_fst: fst::Set<Cow<'t, [u8]>>, words_prefixes_fst: fst::Set<Cow<'t, [u8]>>,
} }
/// Collect the docids matching `left` and `right` at an overall proximity of `prox`,
/// whichever of the two words comes first in the document.
///
/// Two lookups are made through [`Context::word_pair_proximity_docids`] and their
/// results are unioned:
/// * `(left, right, prox)`: the words appear in query order;
/// * `(right, left, prox - 1)`: the words appear in swapped order. This lookup is
///   skipped when `prox <= 1`.
///
/// Returns `Ok(None)` only when neither lookup yields a bitmap.
///
/// ## Example
/// For a document with the text `the good fox eats the apple`:
/// * the pair `(the, eats)` is found at proximity `3`;
/// * the pair `(eats, the)` is found at proximity `1`.
///
/// So both `word_pair_overall_proximity_docids(ctx, the, eats, 3)` and
/// `word_pair_overall_proximity_docids(ctx, the, eats, 2)` return a bitmap containing
/// the id of this document.
fn word_pair_overall_proximity_docids(
    ctx: &dyn Context,
    left: &str,
    right: &str,
    prox: u8,
) -> heed::Result<Option<RoaringBitmap>> {
    // Words in query order.
    let in_order = ctx.word_pair_proximity_docids(left, right, prox)?;

    // Words in swapped order: no smaller proximity exists to query once `prox` is 0 or 1.
    let swapped = match prox {
        0 | 1 => None,
        _ => ctx.word_pair_proximity_docids(right, left, prox - 1)?,
    };

    Ok(match (in_order, swapped) {
        (Some(mut docids), Some(extra)) => {
            docids |= extra;
            Some(docids)
        }
        (Some(docids), None) => Some(docids),
        (None, swapped) => swapped,
    })
}
/// Same as [`word_pair_overall_proximity_docids`], except that the right-hand word is a
/// prefix string.
///
/// Two lookups are unioned:
/// * `(left, prefix, proximity)` through `Context::word_prefix_pair_proximity_docids`;
/// * `(prefix, left, proximity - 1)` through `Context::prefix_word_pair_proximity_docids`.
///   This lookup is skipped when `proximity <= 1`.
///
/// Returns `Ok(None)` when neither lookup yields a bitmap, i.e. no documents were found
/// or the prefix does not exist in the prefix pair-proximity databases.
fn word_prefix_pair_overall_proximity_docids(
    ctx: &dyn Context,
    left: &str,
    prefix: &str,
    proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
    // Word first, then prefix, at the requested proximity.
    let word_then_prefix = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?;

    // Prefix first, then word, one step closer; nothing to query for proximities 0 and 1.
    let prefix_then_word = if proximity <= 1 {
        None
    } else {
        ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)?
    };

    let merged = match word_then_prefix {
        None => prefix_then_word,
        Some(mut docids) => {
            if let Some(extra) = prefix_then_word {
                docids |= extra;
            }
            Some(docids)
        }
    };
    Ok(merged)
}
impl<'c> Context<'c> for CriteriaBuilder<'c> { impl<'c> Context<'c> for CriteriaBuilder<'c> {
fn documents_ids(&self) -> heed::Result<RoaringBitmap> { fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
self.index.documents_ids(self.rtxn) self.index.documents_ids(self.rtxn)
@ -138,18 +207,24 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
right: &str, right: &str,
proximity: u8, proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> { ) -> heed::Result<Option<RoaringBitmap>> {
let key = (proximity, left, right); self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right))
self.index.word_pair_proximity_docids.get(self.rtxn, &key)
} }
fn word_prefix_pair_proximity_docids( fn word_prefix_pair_proximity_docids(
&self, &self,
left: &str, left: &str,
prefix: &str,
proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix))
}
fn prefix_word_pair_proximity_docids(
&self,
prefix: &str,
right: &str, right: &str,
proximity: u8, proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> { ) -> heed::Result<Option<RoaringBitmap>> {
let key = (proximity, left, right); self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right))
self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)
} }
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> { fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
@ -353,17 +428,34 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBit
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let mut first_iter = true; let mut first_iter = true;
let winsize = phrase.len().min(7); let winsize = phrase.len().min(7);
for win in phrase.windows(winsize) { for win in phrase.windows(winsize) {
// Get all the documents with the matching distance for each word pairs. // Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2)); let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win.iter().enumerate() { for (offset, s1) in win.iter().enumerate() {
for (dist, s2) in win.iter().skip(offset + 1).enumerate() { for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { if s1 == s2 {
Some(m) => bitmaps.push(m), continue;
// If there are no document for this distance, there will be no }
// results for the phrase query. if dist == 0 {
None => return Ok(RoaringBitmap::new()), match ctx.word_pair_proximity_docids(s1, s2, 1)? {
Some(m) => bitmaps.push(m),
// If there are no documents for this pair, there will be no
// results for the phrase query.
None => return Ok(RoaringBitmap::new()),
}
} else {
let mut bitmap = RoaringBitmap::new();
for dist in 0..=dist {
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
Some(m) => bitmap |= m,
None => {}
}
}
if bitmap.is_empty() {
return Ok(bitmap);
} else {
bitmaps.push(bitmap);
}
} }
} }
} }
@ -387,7 +479,7 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBit
Ok(candidates) Ok(candidates)
} }
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>( fn all_word_pair_overall_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
ctx: &dyn Context, ctx: &dyn Context,
left_words: &[(T, u8)], left_words: &[(T, u8)],
right_words: &[(U, u8)], right_words: &[(U, u8)],
@ -396,9 +488,9 @@ fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for (left, _l_typo) in left_words { for (left, _l_typo) in left_words {
for (right, _r_typo) in right_words { for (right, _r_typo) in right_words {
let current_docids = ctx let current_docids =
.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)?
.unwrap_or_default(); .unwrap_or_default();
docids |= current_docids; docids |= current_docids;
} }
} }
@ -472,7 +564,8 @@ fn query_pair_proximity_docids(
match (&left.kind, &right.kind) { match (&left.kind, &right.kind) {
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
if prefix { if prefix {
match ctx.word_prefix_pair_proximity_docids( match word_prefix_pair_overall_proximity_docids(
ctx,
left.as_str(), left.as_str(),
right.as_str(), right.as_str(),
proximity, proximity,
@ -480,7 +573,12 @@ fn query_pair_proximity_docids(
Some(docids) => Ok(docids), Some(docids) => Ok(docids),
None => { None => {
let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) all_word_pair_overall_proximity_docids(
ctx,
&[(left, 0)],
&r_words,
proximity,
)
} }
} }
} else { } else {
@ -495,7 +593,8 @@ fn query_pair_proximity_docids(
if prefix { if prefix {
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for (left, _) in l_words { for (left, _) in l_words {
let current_docids = match ctx.word_prefix_pair_proximity_docids( let current_docids = match word_prefix_pair_overall_proximity_docids(
ctx,
left.as_str(), left.as_str(),
right.as_str(), right.as_str(),
proximity, proximity,
@ -504,19 +603,24 @@ fn query_pair_proximity_docids(
None => { None => {
let r_words = let r_words =
word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) all_word_pair_overall_proximity_docids(
ctx,
&[(left, 0)],
&r_words,
proximity,
)
} }
}?; }?;
docids |= current_docids; docids |= current_docids;
} }
Ok(docids) Ok(docids)
} else { } else {
all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
} }
} }
(QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
} }
( (
QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: l_typo, word: left },
@ -525,7 +629,7 @@ fn query_pair_proximity_docids(
let l_words = let l_words =
word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned();
let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?;
all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) all_word_pair_overall_proximity_docids(ctx, &l_words, &r_words, proximity)
} }
} }
} }
@ -552,6 +656,7 @@ pub mod test {
exact_word_prefix_docids: HashMap<String, RoaringBitmap>, exact_word_prefix_docids: HashMap<String, RoaringBitmap>,
word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
docid_words: HashMap<u32, Vec<String>>, docid_words: HashMap<u32, Vec<String>>,
} }
@ -588,13 +693,22 @@ pub mod test {
fn word_prefix_pair_proximity_docids( fn word_prefix_pair_proximity_docids(
&self, &self,
left: &str, word: &str,
right: &str, prefix: &str,
proximity: u8, proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> { ) -> heed::Result<Option<RoaringBitmap>> {
let key = (left.to_string(), right.to_string(), proximity.into()); let key = (word.to_string(), prefix.to_string(), proximity.into());
Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
} }
/// Test-context lookup: fetch a copy of the bitmap stored under
/// `(prefix, word, proximity)` in the in-memory `prefix_word_pair_proximity_docids` map,
/// or `None` when no such entry exists. Never returns an error.
fn prefix_word_pair_proximity_docids(
    &self,
    prefix: &str,
    word: &str,
    proximity: u8,
) -> heed::Result<Option<RoaringBitmap>> {
    // The map is keyed by owned strings and an `i32` proximity.
    let lookup = (prefix.to_owned(), word.to_owned(), i32::from(proximity));
    let docids = self.prefix_word_pair_proximity_docids.get(&lookup).cloned();
    Ok(docids)
}
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> { fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
&self.words_fst &self.words_fst
@ -708,6 +822,8 @@ pub mod test {
let mut word_pair_proximity_docids = HashMap::new(); let mut word_pair_proximity_docids = HashMap::new();
let mut word_prefix_pair_proximity_docids = HashMap::new(); let mut word_prefix_pair_proximity_docids = HashMap::new();
let mut prefix_word_pair_proximity_docids = HashMap::new();
for (lword, lcandidates) in &word_docids { for (lword, lcandidates) in &word_docids {
for (rword, rcandidates) in &word_docids { for (rword, rcandidates) in &word_docids {
if lword == rword { if lword == rword {
@ -740,15 +856,19 @@ pub mod test {
let lposition = docid_words.iter().position(|w| w == lword).unwrap(); let lposition = docid_words.iter().position(|w| w == lword).unwrap();
let rposition = let rposition =
docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); docid_words.iter().position(|w| w.starts_with(pword)).unwrap();
let key = if lposition < rposition { if lposition < rposition {
(s(lword), s(pword), (rposition - lposition) as i32) let key = (s(lword), s(pword), (rposition - lposition) as i32);
let docids = word_prefix_pair_proximity_docids
.entry(key)
.or_insert(RoaringBitmap::new());
docids.push(candidate);
} else { } else {
(s(lword), s(pword), (lposition - rposition + 1) as i32) let key = (s(lword), s(pword), (lposition - rposition) as i32);
let docids = prefix_word_pair_proximity_docids
.entry(key)
.or_insert(RoaringBitmap::new());
docids.push(candidate);
}; };
let docids = word_prefix_pair_proximity_docids
.entry(key)
.or_insert(RoaringBitmap::new());
docids.push(candidate);
} }
} }
} }
@ -766,6 +886,7 @@ pub mod test {
exact_word_prefix_docids, exact_word_prefix_docids,
word_pair_proximity_docids, word_pair_proximity_docids,
word_prefix_pair_proximity_docids, word_prefix_pair_proximity_docids,
prefix_word_pair_proximity_docids,
docid_words, docid_words,
} }
} }