mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-06-19 13:17:35 +02:00
127 lines
4.4 KiB
Rust
127 lines
4.4 KiB
Rust
use fxhash::{FxHashMap, FxHashSet};
|
|
use roaring::RoaringBitmap;
|
|
|
|
use super::{ComputedCondition, RankingRuleGraphTrait};
|
|
use crate::search::new::interner::{DedupInterner, Interned};
|
|
use crate::search::new::query_term::LocatedQueryTermSubset;
|
|
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_position;
|
|
use crate::search::new::SearchContext;
|
|
use crate::Result;
|
|
|
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
|
pub struct PositionCondition {
|
|
term: LocatedQueryTermSubset,
|
|
positions: Vec<u16>,
|
|
}
|
|
|
|
/// Uninhabited marker type: it has no variants and exists to carry the
/// `RankingRuleGraphTrait` implementation whose conditions are
/// `PositionCondition`s.
pub enum PositionGraph {}
|
|
|
|
impl RankingRuleGraphTrait for PositionGraph {
|
|
type Condition = PositionCondition;
|
|
|
|
fn resolve_condition(
|
|
ctx: &mut SearchContext,
|
|
condition: &Self::Condition,
|
|
universe: &RoaringBitmap,
|
|
) -> Result<ComputedCondition> {
|
|
let PositionCondition { term, positions } = condition;
|
|
let mut docids = RoaringBitmap::new();
|
|
for position in positions {
|
|
// maybe compute_query_term_subset_docids_within_position should accept a universe as argument
|
|
docids |= universe
|
|
& compute_query_term_subset_docids_within_position(
|
|
ctx,
|
|
&term.term_subset,
|
|
*position,
|
|
)?;
|
|
}
|
|
Ok(ComputedCondition {
|
|
docids,
|
|
universe_len: universe.len(),
|
|
start_term_subset: None,
|
|
end_term_subset: term.clone(),
|
|
})
|
|
}
|
|
|
|
fn build_edges(
|
|
ctx: &mut SearchContext,
|
|
conditions_interner: &mut DedupInterner<Self::Condition>,
|
|
_from: Option<&LocatedQueryTermSubset>,
|
|
to_term: &LocatedQueryTermSubset,
|
|
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
|
let term = to_term;
|
|
|
|
let mut all_positions = FxHashSet::default();
|
|
for word in term.term_subset.all_single_words_except_prefix_db(ctx)? {
|
|
let positions = ctx.get_db_word_positions(word.interned())?;
|
|
all_positions.extend(positions);
|
|
}
|
|
|
|
for phrase in term.term_subset.all_phrases(ctx)? {
|
|
// Only check the position of the first word in the phrase
|
|
// this is not correct, but it is the best we can do, since
|
|
// it is difficult/impossible to know the expected position
|
|
// of a word in a phrase.
|
|
// There is probably a more correct way to do it though.
|
|
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
|
let positions = ctx.get_db_word_positions(*word)?;
|
|
all_positions.extend(positions);
|
|
}
|
|
}
|
|
|
|
if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) {
|
|
let positions = ctx.get_db_word_prefix_positions(word_prefix.interned())?;
|
|
all_positions.extend(positions);
|
|
}
|
|
|
|
let mut positions_for_costs = FxHashMap::<u32, Vec<u16>>::default();
|
|
|
|
for position in all_positions {
|
|
let cost = {
|
|
let mut cost = 0;
|
|
for i in 0..term.term_ids.len() {
|
|
// This is actually not fully correct and slightly penalises ngrams unfairly.
|
|
// Because if two words are in the same bucketed position (e.g. 32) and consecutive,
|
|
// then their position cost will be 32+32=64, but an ngram of these two words at the
|
|
// same position will have a cost of 32+32+1=65
|
|
cost += cost_from_position(position as u32 + i as u32);
|
|
}
|
|
cost
|
|
};
|
|
positions_for_costs.entry(cost).or_default().push(position);
|
|
}
|
|
|
|
let mut edges = vec![];
|
|
|
|
for (cost, positions) in positions_for_costs {
|
|
// TODO: We can improve performances and relevancy by storing
|
|
// the term subsets associated to each position fetched
|
|
edges.push((
|
|
cost,
|
|
conditions_interner.insert(PositionCondition {
|
|
term: term.clone(), // TODO remove this ugly clone
|
|
positions,
|
|
}),
|
|
));
|
|
}
|
|
|
|
Ok(edges)
|
|
}
|
|
}
|
|
|
|
/// Converts a position into its bucketed cost.
///
/// The buckets widen as the position grows: position 0 costs 0, position 1
/// costs 1, and the cost saturates at 10 for anything above 1024.
///
/// Despite the parameter name, the caller in this file passes a single
/// position plus a word offset, not a sum over several positions.
fn cost_from_position(sum_positions: u32) -> u32 {
    match sum_positions {
        0 => 0,
        1 => 1,
        2..=4 => 2,
        5..=7 => 3,
        8..=11 => 4,
        12..=16 => 5,
        17..=24 => 6,
        25..=64 => 7,
        65..=256 => 8,
        257..=1024 => 9,
        _ => 10,
    }
}
|