mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-12 06:24:29 +01:00
Introduce a special word_derivations function for Proximity
This commit is contained in:
parent
facfb4b615
commit
d301859bbd
@ -67,7 +67,7 @@ pub trait Context {
|
|||||||
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
|
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
|
||||||
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
|
||||||
fn in_prefix_cache(&self, word: &str) -> bool;
|
fn in_prefix_cache(&self, word: &str) -> bool;
|
||||||
fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>;
|
||||||
}
|
}
|
||||||
pub struct CriteriaBuilder<'t> {
|
pub struct CriteriaBuilder<'t> {
|
||||||
rtxn: &'t heed::RoTxn<'t>,
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
@ -107,9 +107,13 @@ impl<'a> Context for CriteriaBuilder<'a> {
|
|||||||
self.words_prefixes_fst.contains(word)
|
self.words_prefixes_fst.contains(word)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>> {
|
fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||||
let key = (docid, word);
|
let mut words_positions = HashMap::new();
|
||||||
self.index.docid_word_positions.get(self.rtxn, &key)
|
for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? {
|
||||||
|
let ((_, word), positions) = result?;
|
||||||
|
words_positions.insert(word.to_string(), positions);
|
||||||
|
}
|
||||||
|
Ok(words_positions)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -391,7 +395,7 @@ pub mod test {
|
|||||||
self.word_prefix_docids.contains_key(&word.to_string())
|
self.word_prefix_docids.contains_key(&word.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result<Option<RoaringBitmap>> {
|
fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,14 +1,13 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
use std::collections::btree_map::{self, BTreeMap};
|
use std::collections::btree_map::{self, BTreeMap};
|
||||||
use std::collections::hash_map::{HashMap, Entry};
|
use std::collections::hash_map::HashMap;
|
||||||
use std::mem::take;
|
use std::mem::take;
|
||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
|
|
||||||
use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}};
|
use crate::{DocumentId, Position, search::{query_tree::QueryKind}};
|
||||||
use crate::search::query_tree::{maximum_proximity, Operation, Query};
|
use crate::search::query_tree::{maximum_proximity, Operation, Query};
|
||||||
use crate::search::WordDerivationsCache;
|
use crate::search::{build_dfa, WordDerivationsCache};
|
||||||
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
|
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
|
||||||
|
|
||||||
pub struct Proximity<'t> {
|
pub struct Proximity<'t> {
|
||||||
@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates(
|
|||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
consecutive: bool,
|
consecutive: bool,
|
||||||
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
||||||
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>,
|
words_positions: &HashMap<String, RoaringBitmap>,
|
||||||
wdcache: &mut WordDerivationsCache,
|
wdcache: &mut WordDerivationsCache,
|
||||||
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
||||||
{
|
{
|
||||||
@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates(
|
|||||||
let mut groups_positions = Vec::with_capacity(groups_len);
|
let mut groups_positions = Vec::with_capacity(groups_len);
|
||||||
|
|
||||||
for operation in operations {
|
for operation in operations {
|
||||||
let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?;
|
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
|
||||||
groups_positions.push(positions.into_iter());
|
groups_positions.push(positions.into_iter());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates(
|
|||||||
query_tree: &'a Operation,
|
query_tree: &'a Operation,
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
||||||
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>,
|
words_positions: &HashMap<String, RoaringBitmap>,
|
||||||
wdcache: &mut WordDerivationsCache,
|
wdcache: &mut WordDerivationsCache,
|
||||||
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
||||||
{
|
{
|
||||||
@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates(
|
|||||||
}
|
}
|
||||||
|
|
||||||
let result = match query_tree {
|
let result = match query_tree {
|
||||||
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?,
|
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
|
||||||
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?,
|
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
|
||||||
Or(_, ops) => {
|
Or(_, ops) => {
|
||||||
let mut result = Vec::new();
|
let mut result = Vec::new();
|
||||||
for op in ops {
|
for op in ops {
|
||||||
result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?)
|
result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
result.sort_unstable();
|
result.sort_unstable();
|
||||||
result
|
result
|
||||||
},
|
},
|
||||||
Operation::Query(Query { prefix, kind }) => {
|
Operation::Query(Query { prefix, kind }) => {
|
||||||
let fst = ctx.words_fst();
|
let mut result = Vec::new();
|
||||||
let words = match kind {
|
match kind {
|
||||||
QueryKind::Exact { word, .. } => {
|
QueryKind::Exact { word, .. } => {
|
||||||
if *prefix {
|
if *prefix {
|
||||||
Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?)
|
let iter = word_derivations(word, true, 0, &words_positions)
|
||||||
|
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
||||||
|
result.extend(iter);
|
||||||
} else {
|
} else {
|
||||||
Cow::Owned(vec![(word.to_string(), 0)])
|
if let Some(positions) = words_positions.get(word) {
|
||||||
|
result.extend(positions.iter().map(|p| (p, 0, p)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
QueryKind::Tolerant { typo, word } => {
|
QueryKind::Tolerant { typo, word } => {
|
||||||
Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?)
|
let iter = word_derivations(word, *prefix, *typo, &words_positions)
|
||||||
}
|
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
||||||
};
|
|
||||||
|
|
||||||
let mut result = Vec::new();
|
|
||||||
for (word, _) in words.as_ref() {
|
|
||||||
let positions = match dwpcache.entry(word.to_string()) {
|
|
||||||
Entry::Occupied(entry) => entry.into_mut(),
|
|
||||||
Entry::Vacant(entry) => {
|
|
||||||
let positions = ctx.docid_word_positions(docid, word)?;
|
|
||||||
entry.insert(positions)
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Some(positions) = positions {
|
|
||||||
let iter = positions.iter().map(|p| (p, 0, p));
|
|
||||||
result.extend(iter);
|
result.extend(iter);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates(
|
|||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut word_positions_cache = HashMap::new();
|
fn word_derivations<'a>(
|
||||||
|
word: &str,
|
||||||
|
is_prefix: bool,
|
||||||
|
max_typo: u8,
|
||||||
|
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||||
|
) -> impl Iterator<Item = &'a RoaringBitmap>
|
||||||
|
{
|
||||||
|
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||||
|
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||||
|
use levenshtein_automata::Distance;
|
||||||
|
match dfa.eval(document_word) {
|
||||||
|
Distance::Exact(_) => Some(positions),
|
||||||
|
Distance::AtLeast(_) => None,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
let mut resolve_operation_cache = HashMap::new();
|
let mut resolve_operation_cache = HashMap::new();
|
||||||
let mut candidates = BTreeMap::new();
|
let mut candidates = BTreeMap::new();
|
||||||
for docid in allowed_candidates {
|
for docid in allowed_candidates {
|
||||||
word_positions_cache.clear();
|
let words_positions = ctx.docid_words_positions(docid)?;
|
||||||
resolve_operation_cache.clear();
|
resolve_operation_cache.clear();
|
||||||
let positions = resolve_operation(
|
let positions = resolve_operation(
|
||||||
ctx,
|
ctx,
|
||||||
query_tree,
|
query_tree,
|
||||||
docid,
|
docid,
|
||||||
&mut resolve_operation_cache,
|
&mut resolve_operation_cache,
|
||||||
&mut word_positions_cache,
|
&words_positions,
|
||||||
wdcache,
|
wdcache,
|
||||||
)?;
|
)?;
|
||||||
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
|
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user