mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Implement attribute criterion for small amounts of candidates
This commit is contained in:
parent
0f4c0beffd
commit
4ff67ec2ee
@ -1,10 +1,13 @@
|
|||||||
use log::debug;
|
use std::collections::{BTreeMap, HashMap, btree_map};
|
||||||
|
use std::mem::take;
|
||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::{search::build_dfa};
|
||||||
use crate::search::criteria::Query;
|
use crate::search::criteria::Query;
|
||||||
use crate::search::query_tree::Operation;
|
use crate::search::query_tree::{Operation, QueryKind};
|
||||||
use crate::search::WordDerivationsCache;
|
use crate::search::WordDerivationsCache;
|
||||||
use super::{Criterion, CriterionResult, Context};
|
use super::{Criterion, CriterionResult, Context, resolve_query_tree};
|
||||||
|
|
||||||
pub struct Attribute<'t> {
|
pub struct Attribute<'t> {
|
||||||
ctx: &'t dyn Context,
|
ctx: &'t dyn Context,
|
||||||
@ -12,6 +15,8 @@ pub struct Attribute<'t> {
|
|||||||
candidates: Option<RoaringBitmap>,
|
candidates: Option<RoaringBitmap>,
|
||||||
bucket_candidates: RoaringBitmap,
|
bucket_candidates: RoaringBitmap,
|
||||||
parent: Option<Box<dyn Criterion + 't>>,
|
parent: Option<Box<dyn Criterion + 't>>,
|
||||||
|
flattened_query_tree: Option<Vec<Vec<Query>>>,
|
||||||
|
current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Attribute<'t> {
|
impl<'t> Attribute<'t> {
|
||||||
@ -27,6 +32,8 @@ impl<'t> Attribute<'t> {
|
|||||||
candidates,
|
candidates,
|
||||||
bucket_candidates: RoaringBitmap::new(),
|
bucket_candidates: RoaringBitmap::new(),
|
||||||
parent: None,
|
parent: None,
|
||||||
|
flattened_query_tree: None,
|
||||||
|
current_buckets: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -37,6 +44,8 @@ impl<'t> Attribute<'t> {
|
|||||||
candidates: None,
|
candidates: None,
|
||||||
bucket_candidates: RoaringBitmap::new(),
|
bucket_candidates: RoaringBitmap::new(),
|
||||||
parent: Some(parent),
|
parent: Some(parent),
|
||||||
|
flattened_query_tree: None,
|
||||||
|
current_buckets: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -44,12 +53,153 @@ impl<'t> Attribute<'t> {
|
|||||||
impl<'t> Criterion for Attribute<'t> {
|
impl<'t> Criterion for Attribute<'t> {
|
||||||
#[logging_timer::time("Attribute::{}")]
|
#[logging_timer::time("Attribute::{}")]
|
||||||
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
|
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
|
||||||
todo!("Attribute")
|
loop {
|
||||||
|
match (&self.query_tree, &mut self.candidates) {
|
||||||
|
(_, Some(candidates)) if candidates.is_empty() => {
|
||||||
|
return Ok(Some(CriterionResult {
|
||||||
|
query_tree: self.query_tree.take(),
|
||||||
|
candidates: self.candidates.take(),
|
||||||
|
bucket_candidates: take(&mut self.bucket_candidates),
|
||||||
|
}));
|
||||||
|
},
|
||||||
|
(Some(qt), Some(candidates)) => {
|
||||||
|
let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| flatten_query_tree(&qt));
|
||||||
|
let current_buckets = if let Some(current_buckets) = self.current_buckets.as_mut() {
|
||||||
|
current_buckets
|
||||||
|
} else {
|
||||||
|
let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?;
|
||||||
|
self.current_buckets.get_or_insert(new_buckets.into_iter())
|
||||||
|
};
|
||||||
|
|
||||||
|
let found_candidates = if let Some((_score, candidates)) = current_buckets.next() {
|
||||||
|
candidates
|
||||||
|
} else {
|
||||||
|
return Ok(Some(CriterionResult {
|
||||||
|
query_tree: self.query_tree.take(),
|
||||||
|
candidates: self.candidates.take(),
|
||||||
|
bucket_candidates: take(&mut self.bucket_candidates),
|
||||||
|
}));
|
||||||
|
};
|
||||||
|
candidates.difference_with(&found_candidates);
|
||||||
|
|
||||||
|
let bucket_candidates = match self.parent {
|
||||||
|
Some(_) => take(&mut self.bucket_candidates),
|
||||||
|
None => found_candidates.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
|
return Ok(Some(CriterionResult {
|
||||||
|
query_tree: self.query_tree.clone(),
|
||||||
|
candidates: Some(found_candidates),
|
||||||
|
bucket_candidates: bucket_candidates,
|
||||||
|
}));
|
||||||
|
},
|
||||||
|
(Some(qt), None) => {
|
||||||
|
let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?;
|
||||||
|
self.bucket_candidates.union_with(&query_tree_candidates);
|
||||||
|
self.candidates = Some(query_tree_candidates);
|
||||||
|
},
|
||||||
|
(None, Some(_)) => {
|
||||||
|
return Ok(Some(CriterionResult {
|
||||||
|
query_tree: self.query_tree.take(),
|
||||||
|
candidates: self.candidates.take(),
|
||||||
|
bucket_candidates: take(&mut self.bucket_candidates),
|
||||||
|
}));
|
||||||
|
},
|
||||||
|
(None, None) => {
|
||||||
|
match self.parent.as_mut() {
|
||||||
|
Some(parent) => {
|
||||||
|
match parent.next(wdcache)? {
|
||||||
|
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
||||||
|
self.query_tree = query_tree;
|
||||||
|
self.candidates = candidates;
|
||||||
|
self.bucket_candidates.union_with(&bucket_candidates);
|
||||||
|
self.flattened_query_tree = None;
|
||||||
|
self.current_buckets = None;
|
||||||
|
},
|
||||||
|
None => return Ok(None),
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => return Ok(None),
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn linear_compute_candidates(
|
||||||
|
ctx: &dyn Context,
|
||||||
|
branches: &Vec<Vec<Query>>,
|
||||||
|
allowed_candidates: &RoaringBitmap,
|
||||||
|
) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>>
|
||||||
|
{
|
||||||
|
fn compute_candidate_rank(branches: &Vec<Vec<Query>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
|
||||||
|
let mut min_rank = u64::max_value();
|
||||||
|
for branch in branches {
|
||||||
|
let mut branch_rank = 0;
|
||||||
|
for Query { prefix, kind } in branch {
|
||||||
|
// find the best position of the current word in the document.
|
||||||
|
let position = match kind {
|
||||||
|
QueryKind::Exact { word, .. } => {
|
||||||
|
if *prefix {
|
||||||
|
word_derivations(word, true, 0, &words_positions)
|
||||||
|
.flat_map(|positions| positions.iter().next()).min()
|
||||||
|
} else {
|
||||||
|
words_positions.get(word)
|
||||||
|
.map(|positions| positions.iter().next())
|
||||||
|
.flatten()
|
||||||
|
}
|
||||||
|
},
|
||||||
|
QueryKind::Tolerant { typo, word } => {
|
||||||
|
word_derivations(word, *prefix, *typo, &words_positions)
|
||||||
|
.flat_map(|positions| positions.iter().next()).min()
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
// if a position is found, we add it to the branch score,
|
||||||
|
// otherwise the branch is considered as unfindable in this document and we break.
|
||||||
|
if let Some(position) = position {
|
||||||
|
branch_rank += position as u64;
|
||||||
|
} else {
|
||||||
|
branch_rank = u64::max_value();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
min_rank = min_rank.min(branch_rank);
|
||||||
|
}
|
||||||
|
|
||||||
|
min_rank
|
||||||
|
}
|
||||||
|
|
||||||
|
fn word_derivations<'a>(
|
||||||
|
word: &str,
|
||||||
|
is_prefix: bool,
|
||||||
|
max_typo: u8,
|
||||||
|
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||||
|
) -> impl Iterator<Item = &'a RoaringBitmap>
|
||||||
|
{
|
||||||
|
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||||
|
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||||
|
use levenshtein_automata::Distance;
|
||||||
|
match dfa.eval(document_word) {
|
||||||
|
Distance::Exact(_) => Some(positions),
|
||||||
|
Distance::AtLeast(_) => None,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut candidates = BTreeMap::new();
|
||||||
|
for docid in allowed_candidates {
|
||||||
|
let words_positions = ctx.docid_words_positions(docid)?;
|
||||||
|
let rank = compute_candidate_rank(branches, words_positions);
|
||||||
|
candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(candidates)
|
||||||
|
}
|
||||||
|
|
||||||
// TODO can we keep refs of Query
|
// TODO can we keep refs of Query
|
||||||
fn explode_query_tree(query_tree: &Operation) -> Vec<Vec<Query>> {
|
fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Query>> {
|
||||||
use crate::search::criteria::Operation::{And, Or, Consecutive};
|
use crate::search::criteria::Operation::{And, Or, Consecutive};
|
||||||
|
|
||||||
fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Query>> {
|
fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Query>> {
|
||||||
@ -91,7 +241,7 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_explode_query_tree() {
|
fn simple_flatten_query_tree() {
|
||||||
let query_tree = Operation::Or(false, vec![
|
let query_tree = Operation::Or(false, vec![
|
||||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
|
||||||
Operation::And(vec![
|
Operation::And(vec![
|
||||||
@ -127,7 +277,7 @@ mod tests {
|
|||||||
],
|
],
|
||||||
];
|
];
|
||||||
|
|
||||||
let result = explode_query_tree(&query_tree);
|
let result = flatten_query_tree(&query_tree);
|
||||||
assert_eq!(expected, result);
|
assert_eq!(expected, result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user