Improve the mDFS performance and return the proximity

This commit is contained in:
Kerollmops 2020-10-05 16:40:33 +02:00
parent bb15f16d8c
commit e9e03259c1
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 30 additions and 25 deletions

View File

@ -32,18 +32,18 @@ impl<'a> Mdfs<'a> {
} }
impl<'a> Iterator for Mdfs<'a> { impl<'a> Iterator for Mdfs<'a> {
type Item = anyhow::Result<RoaringBitmap>; type Item = anyhow::Result<(u32, RoaringBitmap)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
// If there is less or only one word therefore the only // If there is less or only one word therefore the only
// possible documents that we can return are the candidates. // possible documents that we can return are the candidates.
if self.words.len() <= 1 { if self.words.len() <= 1 {
if self.candidates.is_empty() { return None } if self.candidates.is_empty() { return None }
return Some(Ok(mem::take(&mut self.candidates))); return Some(Ok((0, mem::take(&mut self.candidates))));
} }
let mut answer = RoaringBitmap::new();
while self.mana <= self.max_mana { while self.mana <= self.max_mana {
let mut answer = RoaringBitmap::new();
let result = mdfs_step( let result = mdfs_step(
&self.index, &self.index,
&self.rtxn, &self.rtxn,
@ -52,24 +52,25 @@ impl<'a> Iterator for Mdfs<'a> {
&self.candidates, &self.candidates,
&self.candidates, &self.candidates,
&mut self.union_cache, &mut self.union_cache,
&mut answer,
); );
match result { match result {
Ok(Some(a)) => { Ok(()) => {
// We remove the answered documents from the list of // We always increase the mana for the next loop.
// candidates to be sure we don't search for them again. let proximity = self.mana;
self.candidates.difference_with(&a);
answer.union_with(&a);
},
Ok(None) => {
// We found the last iteration for this amount of mana that gives nothing,
// we can now store that the next mana to use for the loop is incremented.
self.mana = self.mana + 1; self.mana = self.mana + 1;
// If the answer is empty it means that we found nothing for this amount
// of mana therefore we continue with a bigger mana. // If no documents were found we must not return and continue
// the search with more mana.
if !answer.is_empty() { if !answer.is_empty() {
// Otherwise we return the answer.
return Some(Ok(answer)); // We remove the answered documents from the list of
// candidates to be sure we don't search for them again.
self.candidates.difference_with(&answer);
// We return the answer.
return Some(Ok((proximity, answer)));
} }
}, },
Err(e) => return Some(Err(e)), Err(e) => return Some(Err(e)),
@ -88,7 +89,8 @@ fn mdfs_step(
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
parent_docids: &RoaringBitmap, parent_docids: &RoaringBitmap,
union_cache: &mut HashMap<(usize, u8), RoaringBitmap>, union_cache: &mut HashMap<(usize, u8), RoaringBitmap>,
) -> anyhow::Result<Option<RoaringBitmap>> answer: &mut RoaringBitmap,
) -> anyhow::Result<()>
{ {
use std::cmp::{min, max}; use std::cmp::{min, max};
@ -126,19 +128,22 @@ fn mdfs_step(
} }
}; };
// We must be sure that we only return docids that are present in the candidates.
docids.intersect_with(parent_docids); docids.intersect_with(parent_docids);
if !docids.is_empty() { if !docids.is_empty() {
let mana = mana.checked_sub(proximity as u32).unwrap(); let mana = mana.checked_sub(proximity as u32).unwrap();
// We are the last pair, we return without recursing as we don't have any child. if tail.len() < 2 {
if tail.len() < 2 { return Ok(Some(docids)) } // We are the last pair, we return without recursing as we don't have any child.
if let Some(di) = mdfs_step(index, rtxn, mana, tail, candidates, &docids, union_cache)? { answer.union_with(&docids);
return Ok(Some(di)) return Ok(());
} else {
return mdfs_step(index, rtxn, mana, tail, candidates, &docids, union_cache, answer);
} }
} }
} }
Ok(None) Ok(())
} }
fn words_pair_combinations<'h>( fn words_pair_combinations<'h>(

View File

@ -164,8 +164,8 @@ impl<'a> Search<'a> {
// We execute the Mdfs iterator until we find enough documents. // We execute the Mdfs iterator until we find enough documents.
while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 { while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 {
match mdfs.next().transpose()? { match mdfs.next().transpose()? {
Some(answer) => { Some((proximity, answer)) => {
debug!("answer: {:?}", answer); debug!("answer with a proximity of {}: {:?}", proximity, answer);
documents.push(answer); documents.push(answer);
}, },
None => break, None => break,

View File

@ -90,7 +90,7 @@ fn is_chinese(c: char) -> bool {
/// length of the found key. Otherwise `None` is returned. /// length of the found key. Otherwise `None` is returned.
/// ///
/// This can be used to e.g. build tokenizing functions. /// This can be used to e.g. build tokenizing functions.
// // Copyright @llogiq
// https://github.com/BurntSushi/fst/pull/104 // https://github.com/BurntSushi/fst/pull/104
#[inline] #[inline]
fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> { fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> {