mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Update typo ranking rule to use new query term structure
This commit is contained in:
parent
fa81381865
commit
728710d63a
@ -68,8 +68,8 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
paths: &[Vec<Interned<ProximityCondition>>],
|
||||
dead_ends_cache: &DeadEndsCache<ProximityCondition>,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &MappedInterner<QueryNode, Vec<u16>>,
|
||||
cost: u16,
|
||||
distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||
cost: u64,
|
||||
);
|
||||
|
||||
/// Logs the internal state of the typo ranking rule
|
||||
@ -79,8 +79,8 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
paths: &[Vec<Interned<TypoCondition>>],
|
||||
dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &MappedInterner<QueryNode, Vec<u16>>,
|
||||
cost: u16,
|
||||
distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||
cost: u64,
|
||||
);
|
||||
}
|
||||
|
||||
@ -139,8 +139,8 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
_paths_map: &[Vec<Interned<ProximityCondition>>],
|
||||
_dead_ends_cache: &DeadEndsCache<ProximityCondition>,
|
||||
_universe: &RoaringBitmap,
|
||||
_distances: &MappedInterner<QueryNode, Vec<u16>>,
|
||||
_cost: u16,
|
||||
_distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||
_cost: u64,
|
||||
) {
|
||||
}
|
||||
|
||||
@ -150,8 +150,8 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
_paths: &[Vec<Interned<TypoCondition>>],
|
||||
_dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
||||
_universe: &RoaringBitmap,
|
||||
_distances: &MappedInterner<QueryNode, Vec<u16>>,
|
||||
_cost: u16,
|
||||
_distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||
_cost: u64,
|
||||
) {
|
||||
}
|
||||
}
|
||||
|
@ -1,21 +1,17 @@
|
||||
// use std::collections::HashSet;
|
||||
use std::fmt::Write;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use fxhash::FxHashSet;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
||||
use crate::search::new::logger::SearchLogger;
|
||||
use crate::search::new::query_graph::QueryNodeData;
|
||||
use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm};
|
||||
use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset};
|
||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
|
||||
use crate::Result;
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct TypoCondition {
|
||||
term: Interned<QueryTerm>,
|
||||
term: LocatedQueryTermSubset,
|
||||
nbr_typos: u8,
|
||||
}
|
||||
|
||||
pub enum TypoGraph {}
|
||||
@ -23,203 +19,82 @@ pub enum TypoGraph {}
|
||||
impl RankingRuleGraphTrait for TypoGraph {
|
||||
type Condition = TypoCondition;
|
||||
|
||||
fn resolve_condition<'db_cache>(
|
||||
fn resolve_condition(
|
||||
ctx: &mut SearchContext,
|
||||
condition: &Self::Condition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
|
||||
let SearchContext {
|
||||
index,
|
||||
txn,
|
||||
db_cache,
|
||||
word_interner,
|
||||
phrase_interner,
|
||||
term_interner,
|
||||
term_docids: query_term_docids,
|
||||
} = ctx;
|
||||
) -> Result<ComputedCondition> {
|
||||
let TypoCondition { term, .. } = condition;
|
||||
// maybe compute_query_term_subset_docids should accept a universe as argument
|
||||
let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
|
||||
docids &= universe;
|
||||
|
||||
let docids = universe
|
||||
& query_term_docids.get_query_term_docids(
|
||||
index,
|
||||
txn,
|
||||
db_cache,
|
||||
word_interner,
|
||||
term_interner,
|
||||
phrase_interner,
|
||||
condition.term,
|
||||
)?;
|
||||
|
||||
let term = term_interner.get(condition.term);
|
||||
Ok((
|
||||
Ok(ComputedCondition {
|
||||
docids,
|
||||
FxHashSet::from_iter(term.all_single_words_except_prefix_db()),
|
||||
FxHashSet::from_iter(term.all_phrases()),
|
||||
))
|
||||
universe_len: universe.len(),
|
||||
start_term_subset: None,
|
||||
end_term_subset: term.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn build_edges(
|
||||
ctx: &mut SearchContext,
|
||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||
_from_node: &QueryNode,
|
||||
to_node: &QueryNode,
|
||||
) -> Result<Vec<(u8, Option<Interned<Self::Condition>>)>> {
|
||||
let SearchContext { term_interner, .. } = ctx;
|
||||
match &to_node.data {
|
||||
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
|
||||
_from: Option<&LocatedQueryTermSubset>,
|
||||
to_term: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||
let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term;
|
||||
let original_full_term = ctx.term_interner.get(term.term_subset.original);
|
||||
|
||||
let mut edges = vec![];
|
||||
// Ngrams have a base typo cost
|
||||
// 2-gram -> equivalent to 1 typo
|
||||
// 3-gram -> equivalent to 2 typos
|
||||
let base_cost = positions.len().min(2) as u8;
|
||||
let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
|
||||
|
||||
for nbr_typos in 0..=2 {
|
||||
let term = term_interner.get(*value).clone();
|
||||
let new_term = match nbr_typos {
|
||||
0 => QueryTerm {
|
||||
original: term.original,
|
||||
is_prefix: term.is_prefix,
|
||||
zero_typo: term.zero_typo,
|
||||
prefix_of: term.prefix_of,
|
||||
// TOOD: debatable
|
||||
synonyms: term.synonyms,
|
||||
split_words: None,
|
||||
one_typo: Box::new([]),
|
||||
two_typos: Box::new([]),
|
||||
use_prefix_db: term.use_prefix_db,
|
||||
is_ngram: term.is_ngram,
|
||||
phrase: term.phrase,
|
||||
},
|
||||
1 => {
|
||||
// What about split words and synonyms here?
|
||||
QueryTerm {
|
||||
original: term.original,
|
||||
is_prefix: false,
|
||||
zero_typo: None,
|
||||
prefix_of: Box::new([]),
|
||||
synonyms: Box::new([]),
|
||||
split_words: term.split_words,
|
||||
one_typo: term.one_typo,
|
||||
two_typos: Box::new([]),
|
||||
use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
|
||||
is_ngram: term.is_ngram,
|
||||
phrase: None,
|
||||
for nbr_typos in 0..=original_full_term.max_nbr_typos {
|
||||
let mut term = term.clone();
|
||||
match nbr_typos {
|
||||
0 => {
|
||||
term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
|
||||
term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
1 => {
|
||||
term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
|
||||
term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
2 => {
|
||||
// What about split words and synonyms here?
|
||||
QueryTerm {
|
||||
original: term.original,
|
||||
zero_typo: None,
|
||||
is_prefix: false,
|
||||
prefix_of: Box::new([]),
|
||||
synonyms: Box::new([]),
|
||||
split_words: None,
|
||||
one_typo: Box::new([]),
|
||||
two_typos: term.two_typos,
|
||||
use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
|
||||
is_ngram: term.is_ngram,
|
||||
phrase: None,
|
||||
}
|
||||
term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
|
||||
term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
_ => panic!(),
|
||||
};
|
||||
if !new_term.is_empty() {
|
||||
|
||||
edges.push((
|
||||
nbr_typos as u8 + base_cost,
|
||||
Some(
|
||||
conditions_interner
|
||||
.insert(TypoCondition { term: term_interner.push(new_term) }),
|
||||
),
|
||||
))
|
||||
}
|
||||
nbr_typos as u32 + base_cost,
|
||||
conditions_interner.insert(TypoCondition { term, nbr_typos }),
|
||||
));
|
||||
}
|
||||
Ok(edges)
|
||||
}
|
||||
QueryNodeData::End => Ok(vec![(0, None)]),
|
||||
QueryNodeData::Deleted | QueryNodeData::Start => panic!(),
|
||||
}
|
||||
}
|
||||
|
||||
fn log_state(
|
||||
graph: &RankingRuleGraph<Self>,
|
||||
paths: &[Vec<Interned<TypoCondition>>],
|
||||
dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &MappedInterner<QueryNode, Vec<u16>>,
|
||||
cost: u16,
|
||||
distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||
cost: u64,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost);
|
||||
}
|
||||
|
||||
fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
|
||||
let TypoCondition { term } = condition;
|
||||
let term = ctx.term_interner.get(*term);
|
||||
let QueryTerm {
|
||||
original: _,
|
||||
is_ngram: _,
|
||||
is_prefix: _,
|
||||
phrase,
|
||||
zero_typo,
|
||||
prefix_of,
|
||||
synonyms,
|
||||
split_words,
|
||||
one_typo,
|
||||
two_typos,
|
||||
use_prefix_db,
|
||||
} = term;
|
||||
let mut s = String::new();
|
||||
if let Some(phrase) = phrase {
|
||||
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
|
||||
writeln!(&mut s, "\"{phrase}\" : phrase").unwrap();
|
||||
}
|
||||
if let Some(w) = zero_typo {
|
||||
let w = ctx.word_interner.get(*w);
|
||||
writeln!(&mut s, "\"{w}\" : 0 typo").unwrap();
|
||||
}
|
||||
for w in prefix_of.iter() {
|
||||
let w = ctx.word_interner.get(*w);
|
||||
writeln!(&mut s, "\"{w}\" : prefix").unwrap();
|
||||
}
|
||||
for w in one_typo.iter() {
|
||||
let w = ctx.word_interner.get(*w);
|
||||
writeln!(&mut s, "\"{w}\" : 1 typo").unwrap();
|
||||
}
|
||||
for w in two_typos.iter() {
|
||||
let w = ctx.word_interner.get(*w);
|
||||
writeln!(&mut s, "\"{w}\" : 2 typos").unwrap();
|
||||
}
|
||||
if let Some(phrase) = split_words {
|
||||
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
|
||||
writeln!(&mut s, "\"{phrase}\" : split words").unwrap();
|
||||
}
|
||||
for phrase in synonyms.iter() {
|
||||
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
|
||||
writeln!(&mut s, "\"{phrase}\" : synonym").unwrap();
|
||||
}
|
||||
if let Some(w) = use_prefix_db {
|
||||
let w = ctx.word_interner.get(*w);
|
||||
writeln!(&mut s, "\"{w}\" : use prefix db").unwrap();
|
||||
}
|
||||
let TypoCondition { term, nbr_typos } = condition;
|
||||
let original_term = ctx.term_interner.get(term.term_subset.original);
|
||||
let original = ctx.word_interner.get(original_term.original);
|
||||
|
||||
Ok(s)
|
||||
Ok(format!("{original}: {nbr_typos}"))
|
||||
}
|
||||
|
||||
// fn words_used_by_condition<'ctx>(
|
||||
// ctx: &mut SearchContext<'ctx>,
|
||||
// condition: &Self::Condition,
|
||||
// ) -> Result<HashSet<Interned<String>>> {
|
||||
// let TypoCondition { term, .. } = condition;
|
||||
// let term = ctx.term_interner.get(*term);
|
||||
// Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
|
||||
// }
|
||||
|
||||
// fn phrases_used_by_condition<'ctx>(
|
||||
// ctx: &mut SearchContext<'ctx>,
|
||||
// condition: &Self::Condition,
|
||||
// ) -> Result<HashSet<Interned<Phrase>>> {
|
||||
// let TypoCondition { term, .. } = condition;
|
||||
// let term = ctx.term_interner.get(*term);
|
||||
// Ok(HashSet::from_iter(term.all_phrases()))
|
||||
// }
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user