mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 13:40:31 +01:00
Introduce a special word_derivations function for Proximity
This commit is contained in:
parent
facfb4b615
commit
d301859bbd
@ -67,7 +67,7 @@ pub trait Context {
|
||||
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
||||
fn in_prefix_cache(&self, word: &str) -> bool;
|
||||
fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>;
|
||||
}
|
||||
pub struct CriteriaBuilder<'t> {
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
@ -107,9 +107,13 @@ impl<'a> Context for CriteriaBuilder<'a> {
|
||||
self.words_prefixes_fst.contains(word)
|
||||
}
|
||||
|
||||
fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>> {
|
||||
let key = (docid, word);
|
||||
self.index.docid_word_positions.get(self.rtxn, &key)
|
||||
/// Collects the positions of every word stored for `docid` into a map keyed by the word.
///
/// Iterates the `docid_word_positions` database using `(docid, "")` as the prefix and
/// copies each yielded word into an owned `String` that becomes the map key, with the
/// associated positions as the value.
///
/// # Errors
///
/// Returns the `heed` error raised while opening the prefix iterator or while reading
/// any of its entries.
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||
let mut words_positions = HashMap::new();
|
||||
// The empty word is used as the prefix — presumably so that every word of this
// document matches; TODO confirm against the key codec of `docid_word_positions`.
for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? {
|
||||
// The document id part of the key is ignored; only the word and its positions are kept.
let ((_, word), positions) = result?;
|
||||
words_positions.insert(word.to_string(), positions);
|
||||
}
|
||||
Ok(words_positions)
|
||||
}
|
||||
}
|
||||
|
||||
@ -391,7 +395,7 @@ pub mod test {
|
||||
self.word_prefix_docids.contains_key(&word.to_string())
|
||||
}
|
||||
|
||||
fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result<Option<RoaringBitmap>> {
|
||||
/// Stub of `docid_words_positions` for the `test` module: it is not implemented,
/// ignores its argument and panics through `todo!()` when called.
fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,13 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::btree_map::{self, BTreeMap};
|
||||
use std::collections::hash_map::{HashMap, Entry};
|
||||
use std::collections::hash_map::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
use log::debug;
|
||||
|
||||
use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}};
|
||||
use crate::{DocumentId, Position, search::{query_tree::QueryKind}};
|
||||
use crate::search::query_tree::{maximum_proximity, Operation, Query};
|
||||
use crate::search::WordDerivationsCache;
|
||||
use crate::search::{build_dfa, WordDerivationsCache};
|
||||
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
|
||||
|
||||
pub struct Proximity<'t> {
|
||||
@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates(
|
||||
docid: DocumentId,
|
||||
consecutive: bool,
|
||||
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
||||
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>,
|
||||
words_positions: &HashMap<String, RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
||||
{
|
||||
@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates(
|
||||
let mut groups_positions = Vec::with_capacity(groups_len);
|
||||
|
||||
for operation in operations {
|
||||
let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?;
|
||||
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
|
||||
groups_positions.push(positions.into_iter());
|
||||
}
|
||||
|
||||
@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates(
|
||||
query_tree: &'a Operation,
|
||||
docid: DocumentId,
|
||||
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
||||
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>,
|
||||
words_positions: &HashMap<String, RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
||||
{
|
||||
@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates(
|
||||
}
|
||||
|
||||
let result = match query_tree {
|
||||
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?,
|
||||
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?,
|
||||
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
|
||||
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
|
||||
Or(_, ops) => {
|
||||
let mut result = Vec::new();
|
||||
for op in ops {
|
||||
result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?)
|
||||
result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?)
|
||||
}
|
||||
|
||||
result.sort_unstable();
|
||||
result
|
||||
},
|
||||
Operation::Query(Query { prefix, kind }) => {
|
||||
let fst = ctx.words_fst();
|
||||
let words = match kind {
|
||||
let mut result = Vec::new();
|
||||
match kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if *prefix {
|
||||
Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?)
|
||||
let iter = word_derivations(word, true, 0, &words_positions)
|
||||
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
||||
result.extend(iter);
|
||||
} else {
|
||||
Cow::Owned(vec![(word.to_string(), 0)])
|
||||
if let Some(positions) = words_positions.get(word) {
|
||||
result.extend(positions.iter().map(|p| (p, 0, p)));
|
||||
}
|
||||
}
|
||||
},
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?)
|
||||
}
|
||||
};
|
||||
|
||||
let mut result = Vec::new();
|
||||
for (word, _) in words.as_ref() {
|
||||
let positions = match dwpcache.entry(word.to_string()) {
|
||||
Entry::Occupied(entry) => entry.into_mut(),
|
||||
Entry::Vacant(entry) => {
|
||||
let positions = ctx.docid_word_positions(docid, word)?;
|
||||
entry.insert(positions)
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(positions) = positions {
|
||||
let iter = positions.iter().map(|p| (p, 0, p));
|
||||
let iter = word_derivations(word, *prefix, *typo, &words_positions)
|
||||
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
||||
result.extend(iter);
|
||||
}
|
||||
}
|
||||
@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates(
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
let mut word_positions_cache = HashMap::new();
|
||||
/// Returns the position bitmaps of the words in `words_positions` that are accepted
/// by the automaton built from `word`.
///
/// The automaton comes from `build_dfa(word, max_typo, is_prefix)`. Each key of
/// `words_positions` is evaluated against it: the entry is kept when the evaluation
/// yields `Distance::Exact(_)` and dropped when it yields `Distance::AtLeast(_)`.
///
/// The returned iterator is lazy and borrows the bitmaps from `words_positions`.
/// Bitmaps are yielded in the iteration order of the `HashMap`, which is unspecified.
fn word_derivations<'a>(
|
||||
word: &str,
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap>
|
||||
{
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
// `move` hands ownership of the DFA to the closure so that the returned iterator
// does not borrow from this function's stack frame.
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||
use levenshtein_automata::Distance;
|
||||
match dfa.eval(document_word) {
|
||||
Distance::Exact(_) => Some(positions),
|
||||
Distance::AtLeast(_) => None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
let mut resolve_operation_cache = HashMap::new();
|
||||
let mut candidates = BTreeMap::new();
|
||||
for docid in allowed_candidates {
|
||||
word_positions_cache.clear();
|
||||
let words_positions = ctx.docid_words_positions(docid)?;
|
||||
resolve_operation_cache.clear();
|
||||
let positions = resolve_operation(
|
||||
ctx,
|
||||
query_tree,
|
||||
docid,
|
||||
&mut resolve_operation_cache,
|
||||
&mut word_positions_cache,
|
||||
&words_positions,
|
||||
wdcache,
|
||||
)?;
|
||||
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
|
||||
|
Loading…
x
Reference in New Issue
Block a user