Mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-01-23 19:57:30 +01:00
Introduce the Mdfs iterator that explores the proximity graph using a mana DFS
parent d4e80407e5
commit 007e647462
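The idea behind the mana DFS: with N query words there are N - 1 adjacent word pairs, each pair is assigned a proximity cost between 1 and 8 (a proximity of 8 is handled specially in mdfs_step and falls back to the whole candidates set), and the iterator explores total budgets ("mana") from N - 1 up to 8 * (N - 1), so the closest combinations come out first. Below is a minimal, self-contained sketch of that bookkeeping; the helper names (mana_range, proximity_bounds) are illustrative only and not part of this commit, they simply mirror the bounds computed in mdfs_step further down.

// Sketch only: illustrative helpers mirroring the mana bounds used by the DFS.
fn mana_range(nb_words: usize) -> (u32, u32) {
    // N words give N - 1 pairs; each pair costs between 1 and 8 mana.
    let nb_pairs = nb_words.saturating_sub(1) as u32;
    (nb_pairs, nb_pairs * 8)
}

fn proximity_bounds(mana: u32, nb_children: u32) -> (u8, u8) {
    // A pair must consume at least 1 mana, plus whatever its children cannot
    // absorb (at most 8 each), and at most 8 or what is left after giving
    // every child its minimum of 1 mana.
    let min = std::cmp::max(1, mana.saturating_sub(nb_children * 8)) as u8;
    let max = std::cmp::min(8, mana - nb_children) as u8;
    (min, max)
}

fn main() {
    assert_eq!(mana_range(3), (2, 16));          // 3 words: budgets explored from 2 to 16
    assert_eq!(proximity_bounds(2, 1), (1, 1));  // lowest budget: both pairs at proximity 1
    assert_eq!(proximity_bounds(16, 1), (8, 8)); // highest budget: both pairs at proximity 8
}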
@@ -1,4 +1,5 @@
mod criterion;
mod mdfs;
mod query_tokens;
mod search;
pub mod heed_codec;
158 src/mdfs.rs (new file)
@@ -0,0 +1,158 @@
use std::collections::hash_map::Entry::{Occupied, Vacant};
use std::collections::HashMap;
use std::mem;

use roaring::RoaringBitmap;
use crate::Index;

/// A mana depth first search implementation.
pub struct Mdfs<'a> {
    index: &'a Index,
    rtxn: &'a heed::RoTxn,
    words: &'a [(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
    union_cache: HashMap<(usize, u8), RoaringBitmap>,
    candidates: RoaringBitmap,
    mana: u32,
    max_mana: u32,
}

impl<'a> Mdfs<'a> {
    pub fn new(
        index: &'a Index,
        rtxn: &'a heed::RoTxn,
        words: &'a [(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
        candidates: RoaringBitmap,
    ) -> Mdfs<'a>
    {
        // Compute the number of pairs (windows) we have for this list of words.
        let mana = words.len().checked_sub(1).unwrap_or(0) as u32;
        let max_mana = mana * 8;
        Mdfs { index, rtxn, words, union_cache: HashMap::new(), candidates, mana, max_mana }
    }
}

impl<'a> Iterator for Mdfs<'a> {
    type Item = anyhow::Result<RoaringBitmap>;

    fn next(&mut self) -> Option<Self::Item> {
        // If there are no words, or only one, the only possible
        // documents that we can return are the candidates.
        if self.words.len() <= 1 {
            if self.candidates.is_empty() { return None }
            return Some(Ok(mem::take(&mut self.candidates)));
        }

        let mut answer = RoaringBitmap::new();
        while self.mana <= self.max_mana {
            let result = mdfs_step(
                &self.index,
                &self.rtxn,
                self.mana,
                self.words,
                &self.candidates,
                &self.candidates,
                &mut self.union_cache,
            );

            match result {
                Ok(Some(a)) => {
                    // We remove the answered documents from the list of
                    // candidates to be sure we don't search for them again.
                    self.candidates.difference_with(&a);
                    answer.union_with(&a);
                },
                Ok(None) => {
                    // Nothing more can be found for this amount of mana, so we record
                    // that the next iteration of the loop must use an incremented mana.
                    self.mana = self.mana + 1;
                    // If the answer is empty it means that we found nothing for this amount
                    // of mana and we must continue with a bigger mana.
                    if !answer.is_empty() {
                        // Otherwise we return the accumulated answer.
                        return Some(Ok(answer));
                    }
                },
                Err(e) => return Some(Err(e)),
            }
        }

        None
    }
}

fn mdfs_step(
    index: &Index,
    rtxn: &heed::RoTxn,
    mana: u32,
    words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
    candidates: &RoaringBitmap,
    parent_docids: &RoaringBitmap,
    union_cache: &mut HashMap<(usize, u8), RoaringBitmap>,
) -> anyhow::Result<Option<RoaringBitmap>>
{
    use std::cmp::{min, max};

    let (words1, words2) = (&words[0].0, &words[1].0);
    let pairs = words_pair_combinations(words1, words2);
    let tail = &words[1..];
    let nb_children = tail.len() as u32 - 1;

    // The minimum amount of mana that you must consume is at least 1, and at least
    // the mana that your children cannot absorb (each child consumes at most 8).
    // Because the last child must consume the remaining mana, it is mandatory that
    // not too much mana is left at the end.
    let min_proximity = max(1, mana.saturating_sub(nb_children * 8)) as u8;

    // The maximum amount of mana that you can use is 8, or the remaining mana minus
    // one per child: you can't just consume all the mana yourself, as each of your
    // children must have at least 1 mana.
    let max_proximity = min(8, mana - nb_children) as u8;

    for proximity in min_proximity..=max_proximity {
        let mut docids = match union_cache.entry((words.len(), proximity)) {
            Occupied(entry) => entry.get().clone(),
            Vacant(entry) => {
                let mut docids = RoaringBitmap::new();
                if proximity == 8 {
                    docids = candidates.clone();
                } else {
                    for (w1, w2) in pairs.iter().cloned() {
                        let key = (w1, w2, proximity);
                        if let Some(di) = index.word_pair_proximity_docids.get(rtxn, &key)? {
                            docids.union_with(&di);
                        }
                    }
                }
                entry.insert(docids).clone()
            }
        };

        docids.intersect_with(parent_docids);

        if !docids.is_empty() {
            let mana = mana.checked_sub(proximity as u32).unwrap();
            // We are the last pair, we return without recursing as we don't have any child.
            if tail.len() < 2 { return Ok(Some(docids)) }
            if let Some(di) = mdfs_step(index, rtxn, mana, tail, candidates, &docids, union_cache)? {
                return Ok(Some(di))
            }
        }
    }

    Ok(None)
}

fn words_pair_combinations<'h>(
    w1: &'h HashMap<String, (u8, RoaringBitmap)>,
    w2: &'h HashMap<String, (u8, RoaringBitmap)>,
) -> Vec<(&'h str, &'h str)>
{
    let mut pairs = Vec::new();
    for (w1, (_typos, docids1)) in w1 {
        for (w2, (_typos, docids2)) in w2 {
            if !docids1.is_disjoint(&docids2) {
                pairs.push((w1.as_str(), w2.as_str()));
            }
        }
    }
    pairs
}
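The search code in the next file drains this iterator until enough documents have been collected. As a rough, self-contained sketch of that driving pattern (collect_up_to and the stand-in buckets are illustrative names of mine; a real Mdfs additionally needs an open Index and a heed::RoTxn, and this sketch only assumes the roaring and anyhow crates):

use roaring::RoaringBitmap;

// Drain any iterator of result buckets until `limit` documents are collected.
fn collect_up_to(
    mut results: impl Iterator<Item = anyhow::Result<RoaringBitmap>>,
    limit: u64,
) -> anyhow::Result<Vec<RoaringBitmap>> {
    let mut documents = Vec::new();
    while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit {
        match results.next().transpose()? {
            Some(answer) => documents.push(answer),
            None => break,
        }
    }
    Ok(documents)
}

fn main() -> anyhow::Result<()> {
    // Stand-in buckets: each one plays the role of the documents found
    // for one amount of mana, ordered by increasing proximity.
    let buckets: Vec<anyhow::Result<RoaringBitmap>> = vec![
        Ok((1u32..4).collect()),
        Ok((10u32..15).collect()),
    ];
    let documents = collect_up_to(buckets.into_iter(), 5)?;
    assert_eq!(documents.len(), 2);
    Ok(())
}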
132 src/search.rs
@@ -1,5 +1,4 @@
use std::collections::{HashMap, HashSet};
use std::collections::hash_map::Entry::{Occupied, Vacant};

use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
@@ -9,6 +8,7 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;

use crate::query_tokens::{QueryTokens, QueryToken};
use crate::mdfs::Mdfs;
use crate::{Index, DocumentId};

// Building these factories is not free.
@@ -132,111 +132,6 @@ impl<'a> Search<'a> {
        candidates
    }

    // TODO Move this elsewhere!
    fn mana_depth_first_search(
        &self,
        words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
        candidates: &RoaringBitmap,
        union_cache: &mut HashMap<(usize, u8), RoaringBitmap>,
    ) -> anyhow::Result<Option<RoaringBitmap>>
    {
        fn words_pair_combinations<'h>(
            w1: &'h HashMap<String, (u8, RoaringBitmap)>,
            w2: &'h HashMap<String, (u8, RoaringBitmap)>,
        ) -> Vec<(&'h str, &'h str)>
        {
            let mut pairs = Vec::new();
            for (w1, (_typos, docids1)) in w1 {
                for (w2, (_typos, docids2)) in w2 {
                    if !docids1.is_disjoint(&docids2) {
                        pairs.push((w1.as_str(), w2.as_str()));
                    }
                }
            }
            pairs
        }

        fn mdfs(
            index: &Index,
            rtxn: &heed::RoTxn,
            mana: u32,
            words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
            candidates: &RoaringBitmap,
            parent_docids: &RoaringBitmap,
            union_cache: &mut HashMap<(usize, u8), RoaringBitmap>,
        ) -> anyhow::Result<Option<RoaringBitmap>>
        {
            use std::cmp::{min, max};

            let (words1, words2) = (&words[0].0, &words[1].0);
            let pairs = words_pair_combinations(words1, words2);
            let tail = &words[1..];
            let nb_children = tail.len() as u32 - 1;

            // The minimum amount of mana that you must consume is at least 1 and the
            // amount of mana that your children can consume. Because the last child must
            // consume the remaining mana, it is mandatory that there not too much at the end.
            let min_proximity = max(1, mana.saturating_sub(nb_children * 8)) as u8;

            // The maximum amount of mana that you can use is 8 or the remaining amount of
            // mana minus your children, as you can't just consume all the mana,
            // your children must have at least 1 mana.
            let max_proximity = min(8, mana - nb_children) as u8;

            for proximity in min_proximity..=max_proximity {
                let mut docids = match union_cache.entry((words.len(), proximity)) {
                    Occupied(entry) => entry.get().clone(),
                    Vacant(entry) => {
                        let mut docids = RoaringBitmap::new();
                        if proximity == 8 {
                            docids = candidates.clone();
                        } else {
                            for (w1, w2) in pairs.iter().cloned() {
                                let key = (w1, w2, proximity);
                                if let Some(di) = index.word_pair_proximity_docids.get(rtxn, &key)? {
                                    docids.union_with(&di);
                                }
                            }
                        }
                        entry.insert(docids).clone()
                    }
                };

                docids.intersect_with(parent_docids);

                if !docids.is_empty() {
                    let mana = mana.checked_sub(proximity as u32).unwrap();
                    // We are the last pair, we return without recursing as we don't have any child.
                    if tail.len() < 2 { return Ok(Some(docids)) }
                    if let Some(di) = mdfs(index, rtxn, mana, tail, candidates, &docids, union_cache)? {
                        return Ok(Some(di))
                    }
                }
            }

            Ok(None)
        }

        // Compute the number of pairs (windows) we have for this list of words.
        // If there only is one word therefore the only possible documents are the candidates.
        let initial_mana = match words.len().checked_sub(1) {
            Some(nb_windows) if nb_windows != 0 => nb_windows as u32,
            _ => return Ok(Some(candidates.clone())),
        };

        // TODO We must keep track of where we are in terms of mana and that should either be
        // handled by an Iterator or by the caller. Keeping track of the amount of mana
        // is an optimization, it makes this mdfs to only be called with the next valid
        // mana and not called with all of the previous mana values.
        for mana in initial_mana..=initial_mana * 8 {
            if let Some(answer) = mdfs(&self.index, &self.rtxn, mana, words, candidates, candidates, union_cache)? {
                return Ok(Some(answer));
            }
        }

        Ok(None)
    }

    pub fn execute(&self) -> anyhow::Result<SearchResult> {
        let limit = self.limit;

@@ -257,29 +152,24 @@ impl<'a> Search<'a> {
        }

        let derived_words = self.fetch_words_docids(&fst, dfas)?;
        let mut candidates = Self::compute_candidates(&derived_words);
        let candidates = Self::compute_candidates(&derived_words);

        debug!("candidates: {:?}", candidates);

        // The mana depth first search is a revised DFS that explores
        // solutions in the order of their proximities.
        let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates);
        let mut documents = Vec::new();
        let mut union_cache = HashMap::new();

        // We execute the DFS until we find enough documents, we run it with the
        // candidates list and remove the found documents from this list at each iteration.
        // We execute the Mdfs iterator until we find enough documents.
        while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 {
            let answer = self.mana_depth_first_search(&derived_words, &candidates, &mut union_cache)?;

            let answer = match answer {
                Some(answer) if !answer.is_empty() => answer,
                _ => break,
            };

            match mdfs.next().transpose()? {
                Some(answer) => {
                    debug!("answer: {:?}", answer);

                    // We remove the answered documents from the list of
                    // candidates to be sure we don't search for them again.
                    candidates.difference_with(&answer);
                    documents.push(answer);
                },
                None => break,
            }
        }

        let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();