mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Implement attribute criterion for small amounts of candidates
This commit is contained in:
parent
0f4c0beffd
commit
4ff67ec2ee
@ -1,10 +1,13 @@
|
||||
use log::debug;
|
||||
use std::collections::{BTreeMap, HashMap, btree_map};
|
||||
use std::mem::take;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{search::build_dfa};
|
||||
use crate::search::criteria::Query;
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::search::query_tree::{Operation, QueryKind};
|
||||
use crate::search::WordDerivationsCache;
|
||||
use super::{Criterion, CriterionResult, Context};
|
||||
use super::{Criterion, CriterionResult, Context, resolve_query_tree};
|
||||
|
||||
pub struct Attribute<'t> {
|
||||
ctx: &'t dyn Context,
|
||||
@ -12,6 +15,8 @@ pub struct Attribute<'t> {
|
||||
candidates: Option<RoaringBitmap>,
|
||||
bucket_candidates: RoaringBitmap,
|
||||
parent: Option<Box<dyn Criterion + 't>>,
|
||||
flattened_query_tree: Option<Vec<Vec<Query>>>,
|
||||
current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
|
||||
}
|
||||
|
||||
impl<'t> Attribute<'t> {
|
||||
@ -27,6 +32,8 @@ impl<'t> Attribute<'t> {
|
||||
candidates,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: None,
|
||||
flattened_query_tree: None,
|
||||
current_buckets: None,
|
||||
}
|
||||
}
|
||||
|
||||
@ -37,6 +44,8 @@ impl<'t> Attribute<'t> {
|
||||
candidates: None,
|
||||
bucket_candidates: RoaringBitmap::new(),
|
||||
parent: Some(parent),
|
||||
flattened_query_tree: None,
|
||||
current_buckets: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -44,12 +53,153 @@ impl<'t> Attribute<'t> {
|
||||
impl<'t> Criterion for Attribute<'t> {
|
||||
#[logging_timer::time("Attribute::{}")]
|
||||
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
|
||||
todo!("Attribute")
|
||||
loop {
|
||||
match (&self.query_tree, &mut self.candidates) {
|
||||
(_, Some(candidates)) if candidates.is_empty() => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: self.candidates.take(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(Some(qt), Some(candidates)) => {
|
||||
let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| flatten_query_tree(&qt));
|
||||
let current_buckets = if let Some(current_buckets) = self.current_buckets.as_mut() {
|
||||
current_buckets
|
||||
} else {
|
||||
let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?;
|
||||
self.current_buckets.get_or_insert(new_buckets.into_iter())
|
||||
};
|
||||
|
||||
let found_candidates = if let Some((_score, candidates)) = current_buckets.next() {
|
||||
candidates
|
||||
} else {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: self.candidates.take(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
};
|
||||
candidates.difference_with(&found_candidates);
|
||||
|
||||
let bucket_candidates = match self.parent {
|
||||
Some(_) => take(&mut self.bucket_candidates),
|
||||
None => found_candidates.clone(),
|
||||
};
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: Some(found_candidates),
|
||||
bucket_candidates: bucket_candidates,
|
||||
}));
|
||||
},
|
||||
(Some(qt), None) => {
|
||||
let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?;
|
||||
self.bucket_candidates.union_with(&query_tree_candidates);
|
||||
self.candidates = Some(query_tree_candidates);
|
||||
},
|
||||
(None, Some(_)) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.take(),
|
||||
candidates: self.candidates.take(),
|
||||
bucket_candidates: take(&mut self.bucket_candidates),
|
||||
}));
|
||||
},
|
||||
(None, None) => {
|
||||
match self.parent.as_mut() {
|
||||
Some(parent) => {
|
||||
match parent.next(wdcache)? {
|
||||
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||
self.query_tree = query_tree;
|
||||
self.candidates = candidates;
|
||||
self.bucket_candidates.union_with(&bucket_candidates);
|
||||
self.flattened_query_tree = None;
|
||||
self.current_buckets = None;
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
None => return Ok(None),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn linear_compute_candidates(
|
||||
ctx: &dyn Context,
|
||||
branches: &Vec<Vec<Query>>,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>>
|
||||
{
|
||||
fn compute_candidate_rank(branches: &Vec<Vec<Query>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
|
||||
let mut min_rank = u64::max_value();
|
||||
for branch in branches {
|
||||
let mut branch_rank = 0;
|
||||
for Query { prefix, kind } in branch {
|
||||
// find the best position of the current word in the document.
|
||||
let position = match kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if *prefix {
|
||||
word_derivations(word, true, 0, &words_positions)
|
||||
.flat_map(|positions| positions.iter().next()).min()
|
||||
} else {
|
||||
words_positions.get(word)
|
||||
.map(|positions| positions.iter().next())
|
||||
.flatten()
|
||||
}
|
||||
},
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
word_derivations(word, *prefix, *typo, &words_positions)
|
||||
.flat_map(|positions| positions.iter().next()).min()
|
||||
},
|
||||
};
|
||||
|
||||
// if a position is found, we add it to the branch score,
|
||||
// otherwise the branch is considered as unfindable in this document and we break.
|
||||
if let Some(position) = position {
|
||||
branch_rank += position as u64;
|
||||
} else {
|
||||
branch_rank = u64::max_value();
|
||||
break;
|
||||
}
|
||||
}
|
||||
min_rank = min_rank.min(branch_rank);
|
||||
}
|
||||
|
||||
min_rank
|
||||
}
|
||||
|
||||
fn word_derivations<'a>(
|
||||
word: &str,
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap>
|
||||
{
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||
use levenshtein_automata::Distance;
|
||||
match dfa.eval(document_word) {
|
||||
Distance::Exact(_) => Some(positions),
|
||||
Distance::AtLeast(_) => None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
let mut candidates = BTreeMap::new();
|
||||
for docid in allowed_candidates {
|
||||
let words_positions = ctx.docid_words_positions(docid)?;
|
||||
let rank = compute_candidate_rank(branches, words_positions);
|
||||
candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid);
|
||||
}
|
||||
|
||||
Ok(candidates)
|
||||
}
|
||||
|
||||
// TODO can we keep refs of Query
|
||||
fn explode_query_tree(query_tree: &Operation) -> Vec<Vec<Query>> {
|
||||
fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Query>> {
|
||||
use crate::search::criteria::Operation::{And, Or, Consecutive};
|
||||
|
||||
fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Query>> {
|
||||
@ -91,7 +241,7 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple_explode_query_tree() {
|
||||
fn simple_flatten_query_tree() {
|
||||
let query_tree = Operation::Or(false, vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
|
||||
Operation::And(vec![
|
||||
@ -127,7 +277,7 @@ mod tests {
|
||||
],
|
||||
];
|
||||
|
||||
let result = explode_query_tree(&query_tree);
|
||||
let result = flatten_query_tree(&query_tree);
|
||||
assert_eq!(expected, result);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user