mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-25 14:10:06 +01:00
Avoid a prefix-related worst-case scenario in the proximity criterion
This commit is contained in:
parent
a8defb585b
commit
777b387dc4
@ -17,6 +17,7 @@ use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
|
|||||||
use super::CriterionImplementationStrategy;
|
use super::CriterionImplementationStrategy;
|
||||||
use crate::search::criteria::geo::Geo;
|
use crate::search::criteria::geo::Geo;
|
||||||
use crate::search::{word_derivations, Distinct, WordDerivationsCache};
|
use crate::search::{word_derivations, Distinct, WordDerivationsCache};
|
||||||
|
use crate::update::{MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB};
|
||||||
use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result};
|
use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result};
|
||||||
|
|
||||||
mod asc_desc;
|
mod asc_desc;
|
||||||
@ -653,14 +654,30 @@ fn query_pair_proximity_docids(
|
|||||||
match (&left.kind, &right.kind) {
|
match (&left.kind, &right.kind) {
|
||||||
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
|
(QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
|
||||||
if prefix {
|
if prefix {
|
||||||
match word_prefix_pair_overall_proximity_docids(
|
// There are three distinct cases which we need to distinguish regarding the prefix `right`:
|
||||||
ctx,
|
//
|
||||||
left.as_str(),
|
// 1. `right` is not in any prefix cache because it is not the prefix of many words
|
||||||
right.as_str(),
|
// (and thus, it doesn't have many word derivations)
|
||||||
proximity,
|
// 2. `right` is in the prefix cache but cannot be found in the "word prefix pair proximity" databases either
|
||||||
)? {
|
// because it is too long or because the given proximity is too high.
|
||||||
Some(docids) => Ok(docids),
|
// 3. `right` is in the prefix cache and can be found in the "word prefix pair proximity" databases
|
||||||
None => {
|
//
|
||||||
|
// The three cases are handled as follows:
|
||||||
|
// 1. We manually retrieve all the word derivations of `right` and check the `word_pair_proximity`
|
||||||
|
// database for each of them.
|
||||||
|
// 2. It would be too expensive to apply the same strategy as (1), therefore, we "disable" the
|
||||||
|
// proximity ranking rule for the prefixes of the right word. This is done as follows:
|
||||||
|
// 1. Only find the documents where left is in proximity to the exact (i.e. non-prefix) right word
|
||||||
|
// 2. Otherwise, assume that their proximity in all the documents in which they coexist is >= 8
|
||||||
|
//
|
||||||
|
// 3. Query the prefix proximity databases.
|
||||||
|
match (
|
||||||
|
ctx.in_prefix_cache(right),
|
||||||
|
right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB
|
||||||
|
&& proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
|
||||||
|
) {
|
||||||
|
// Case 1: not in prefix cache
|
||||||
|
(false, _) => {
|
||||||
let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?;
|
let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?;
|
||||||
all_word_pair_overall_proximity_docids(
|
all_word_pair_overall_proximity_docids(
|
||||||
ctx,
|
ctx,
|
||||||
@ -669,10 +686,35 @@ fn query_pair_proximity_docids(
|
|||||||
proximity,
|
proximity,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
// Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to
|
||||||
|
// query the prefix proximity databases.
|
||||||
|
(true, false) => {
|
||||||
|
// To "save" the relevancy a little bit, we still find the documents where the
|
||||||
|
// exact (i.e. non-prefix) right word is in the given proximity to the left word.
|
||||||
|
Ok(word_pair_overall_proximity_docids(
|
||||||
|
ctx,
|
||||||
|
left.as_str(),
|
||||||
|
right.as_str(),
|
||||||
|
proximity,
|
||||||
|
)?
|
||||||
|
.unwrap_or_default())
|
||||||
|
}
|
||||||
|
// Case 3: in prefix cache, short enough, and proximity is low enough
|
||||||
|
(true, true) => Ok(word_prefix_pair_overall_proximity_docids(
|
||||||
|
ctx,
|
||||||
|
left.as_str(),
|
||||||
|
right.as_str(),
|
||||||
|
proximity,
|
||||||
|
)?
|
||||||
|
.unwrap_or_default()),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
Ok(ctx
|
Ok(word_pair_overall_proximity_docids(
|
||||||
.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
|
ctx,
|
||||||
|
left.as_str(),
|
||||||
|
right.as_str(),
|
||||||
|
proximity,
|
||||||
|
)?
|
||||||
.unwrap_or_default())
|
.unwrap_or_default())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -680,29 +722,55 @@ fn query_pair_proximity_docids(
|
|||||||
let l_words =
|
let l_words =
|
||||||
word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
|
word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
|
||||||
if prefix {
|
if prefix {
|
||||||
|
// The logic here is almost identical to the one in the previous match branch.
|
||||||
|
// The difference is that we fetch the docids for each derivation of the left word.
|
||||||
|
match (
|
||||||
|
ctx.in_prefix_cache(right),
|
||||||
|
right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB
|
||||||
|
&& proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
|
||||||
|
) {
|
||||||
|
// Case 1: not in prefix cache
|
||||||
|
(false, _) => {
|
||||||
let mut docids = RoaringBitmap::new();
|
let mut docids = RoaringBitmap::new();
|
||||||
|
let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?;
|
||||||
for (left, _) in l_words {
|
for (left, _) in l_words {
|
||||||
let current_docids = match word_prefix_pair_overall_proximity_docids(
|
docids |= all_word_pair_overall_proximity_docids(
|
||||||
ctx,
|
|
||||||
left.as_str(),
|
|
||||||
right.as_str(),
|
|
||||||
proximity,
|
|
||||||
)? {
|
|
||||||
Some(docids) => Ok(docids),
|
|
||||||
None => {
|
|
||||||
let r_words =
|
|
||||||
word_derivations(right, true, 0, ctx.words_fst(), wdcache)?;
|
|
||||||
all_word_pair_overall_proximity_docids(
|
|
||||||
ctx,
|
ctx,
|
||||||
&[(left, 0)],
|
&[(left, 0)],
|
||||||
r_words,
|
r_words,
|
||||||
proximity,
|
proximity,
|
||||||
)
|
)?;
|
||||||
}
|
|
||||||
}?;
|
|
||||||
docids |= current_docids;
|
|
||||||
}
|
}
|
||||||
Ok(docids)
|
Ok(docids)
|
||||||
|
}
|
||||||
|
// Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to
|
||||||
|
// query the prefix proximity databases.
|
||||||
|
(true, false) => {
|
||||||
|
// To "save" the relevancy a little bit, we still find the documents where the
|
||||||
|
// exact (i.e. non-prefix) right word is in proximity to any derivation of the left word.
|
||||||
|
let mut candidates = RoaringBitmap::new();
|
||||||
|
for (left, _) in l_words {
|
||||||
|
candidates |= ctx
|
||||||
|
.word_pair_proximity_docids(&left, right, proximity)?
|
||||||
|
.unwrap_or_default();
|
||||||
|
}
|
||||||
|
Ok(candidates)
|
||||||
|
}
|
||||||
|
// Case 3: in prefix cache, short enough, and proximity is low enough
|
||||||
|
(true, true) => {
|
||||||
|
let mut docids = RoaringBitmap::new();
|
||||||
|
for (left, _) in l_words {
|
||||||
|
docids |= word_prefix_pair_overall_proximity_docids(
|
||||||
|
ctx,
|
||||||
|
left.as_str(),
|
||||||
|
right.as_str(),
|
||||||
|
proximity,
|
||||||
|
)?
|
||||||
|
.unwrap_or_default();
|
||||||
|
}
|
||||||
|
Ok(docids)
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
|
||||||
}
|
}
|
||||||
|
@ -590,3 +590,6 @@ fn resolve_plane_sweep_candidates(
|
|||||||
|
|
||||||
Ok(candidates)
|
Ok(candidates)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {}
|
||||||
|
@ -7,7 +7,10 @@ pub use self::index_documents::{
|
|||||||
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||||
};
|
};
|
||||||
pub use self::indexer_config::IndexerConfig;
|
pub use self::indexer_config::IndexerConfig;
|
||||||
pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids;
|
pub use self::prefix_word_pairs::{
|
||||||
|
PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
|
||||||
|
MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
|
||||||
|
};
|
||||||
pub use self::settings::{Setting, Settings};
|
pub use self::settings::{Setting, Settings};
|
||||||
pub use self::update_step::UpdateIndexingStep;
|
pub use self::update_step::UpdateIndexingStep;
|
||||||
pub use self::word_prefix_docids::WordPrefixDocids;
|
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||||
|
@ -14,6 +14,9 @@ mod word_prefix;
|
|||||||
pub use prefix_word::index_prefix_word_database;
|
pub use prefix_word::index_prefix_word_database;
|
||||||
pub use word_prefix::index_word_prefix_database;
|
pub use word_prefix::index_word_prefix_database;
|
||||||
|
|
||||||
|
pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
|
||||||
|
pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;
|
||||||
|
|
||||||
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
@ -32,31 +35,12 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
|||||||
Self {
|
Self {
|
||||||
wtxn,
|
wtxn,
|
||||||
index,
|
index,
|
||||||
max_proximity: 4,
|
max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
|
||||||
max_prefix_length: 2,
|
max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
|
||||||
chunk_compression_type,
|
chunk_compression_type,
|
||||||
chunk_compression_level,
|
chunk_compression_level,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/// Set the maximum proximity required to make a prefix be part of the words prefixes
|
|
||||||
/// database. If two words are farther apart than the threshold, the associated documents will
|
|
||||||
/// not be part of the prefix database.
|
|
||||||
///
|
|
||||||
/// Default value is 4. This value must be lower than or equal to 7 and will be clamped
|
|
||||||
/// to this bound otherwise.
|
|
||||||
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
|
|
||||||
self.max_proximity = value.max(7);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
|
|
||||||
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
|
|
||||||
/// will not be part of the prefix database.
|
|
||||||
///
|
|
||||||
/// Default value is 2.
|
|
||||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
|
||||||
self.max_prefix_length = value;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
||||||
pub fn execute<'a>(
|
pub fn execute<'a>(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user