mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 13:40:31 +01:00
Update typo ranking rule to use new query term structure
This commit is contained in:
parent
fa81381865
commit
728710d63a
@ -68,8 +68,8 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
|||||||
paths: &[Vec<Interned<ProximityCondition>>],
|
paths: &[Vec<Interned<ProximityCondition>>],
|
||||||
dead_ends_cache: &DeadEndsCache<ProximityCondition>,
|
dead_ends_cache: &DeadEndsCache<ProximityCondition>,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
distances: &MappedInterner<QueryNode, Vec<u16>>,
|
distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||||
cost: u16,
|
cost: u64,
|
||||||
);
|
);
|
||||||
|
|
||||||
/// Logs the internal state of the typo ranking rule
|
/// Logs the internal state of the typo ranking rule
|
||||||
@ -79,8 +79,8 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
|||||||
paths: &[Vec<Interned<TypoCondition>>],
|
paths: &[Vec<Interned<TypoCondition>>],
|
||||||
dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
distances: &MappedInterner<QueryNode, Vec<u16>>,
|
distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||||
cost: u16,
|
cost: u64,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -139,8 +139,8 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
|||||||
_paths_map: &[Vec<Interned<ProximityCondition>>],
|
_paths_map: &[Vec<Interned<ProximityCondition>>],
|
||||||
_dead_ends_cache: &DeadEndsCache<ProximityCondition>,
|
_dead_ends_cache: &DeadEndsCache<ProximityCondition>,
|
||||||
_universe: &RoaringBitmap,
|
_universe: &RoaringBitmap,
|
||||||
_distances: &MappedInterner<QueryNode, Vec<u16>>,
|
_distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||||
_cost: u16,
|
_cost: u64,
|
||||||
) {
|
) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,8 +150,8 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
|||||||
_paths: &[Vec<Interned<TypoCondition>>],
|
_paths: &[Vec<Interned<TypoCondition>>],
|
||||||
_dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
_dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
||||||
_universe: &RoaringBitmap,
|
_universe: &RoaringBitmap,
|
||||||
_distances: &MappedInterner<QueryNode, Vec<u16>>,
|
_distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||||
_cost: u16,
|
_cost: u64,
|
||||||
) {
|
) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,21 +1,17 @@
|
|||||||
// use std::collections::HashSet;
|
|
||||||
use std::fmt::Write;
|
|
||||||
use std::iter::FromIterator;
|
|
||||||
|
|
||||||
use fxhash::FxHashSet;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
||||||
use crate::search::new::logger::SearchLogger;
|
use crate::search::new::logger::SearchLogger;
|
||||||
use crate::search::new::query_graph::QueryNodeData;
|
use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset};
|
||||||
use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm};
|
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||||
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
|
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
pub struct TypoCondition {
|
pub struct TypoCondition {
|
||||||
term: Interned<QueryTerm>,
|
term: LocatedQueryTermSubset,
|
||||||
|
nbr_typos: u8,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub enum TypoGraph {}
|
pub enum TypoGraph {}
|
||||||
@ -23,121 +19,63 @@ pub enum TypoGraph {}
|
|||||||
impl RankingRuleGraphTrait for TypoGraph {
|
impl RankingRuleGraphTrait for TypoGraph {
|
||||||
type Condition = TypoCondition;
|
type Condition = TypoCondition;
|
||||||
|
|
||||||
fn resolve_condition<'db_cache>(
|
fn resolve_condition(
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
condition: &Self::Condition,
|
condition: &Self::Condition,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
|
) -> Result<ComputedCondition> {
|
||||||
let SearchContext {
|
let TypoCondition { term, .. } = condition;
|
||||||
index,
|
// maybe compute_query_term_subset_docids should accept a universe as argument
|
||||||
txn,
|
let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
|
||||||
db_cache,
|
docids &= universe;
|
||||||
word_interner,
|
|
||||||
phrase_interner,
|
|
||||||
term_interner,
|
|
||||||
term_docids: query_term_docids,
|
|
||||||
} = ctx;
|
|
||||||
|
|
||||||
let docids = universe
|
Ok(ComputedCondition {
|
||||||
& query_term_docids.get_query_term_docids(
|
|
||||||
index,
|
|
||||||
txn,
|
|
||||||
db_cache,
|
|
||||||
word_interner,
|
|
||||||
term_interner,
|
|
||||||
phrase_interner,
|
|
||||||
condition.term,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let term = term_interner.get(condition.term);
|
|
||||||
Ok((
|
|
||||||
docids,
|
docids,
|
||||||
FxHashSet::from_iter(term.all_single_words_except_prefix_db()),
|
universe_len: universe.len(),
|
||||||
FxHashSet::from_iter(term.all_phrases()),
|
start_term_subset: None,
|
||||||
))
|
end_term_subset: term.clone(),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_edges(
|
fn build_edges(
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||||
_from_node: &QueryNode,
|
_from: Option<&LocatedQueryTermSubset>,
|
||||||
to_node: &QueryNode,
|
to_term: &LocatedQueryTermSubset,
|
||||||
) -> Result<Vec<(u8, Option<Interned<Self::Condition>>)>> {
|
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||||
let SearchContext { term_interner, .. } = ctx;
|
let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term;
|
||||||
match &to_node.data {
|
let original_full_term = ctx.term_interner.get(term.term_subset.original);
|
||||||
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
|
|
||||||
let mut edges = vec![];
|
|
||||||
// Ngrams have a base typo cost
|
|
||||||
// 2-gram -> equivalent to 1 typo
|
|
||||||
// 3-gram -> equivalent to 2 typos
|
|
||||||
let base_cost = positions.len().min(2) as u8;
|
|
||||||
|
|
||||||
for nbr_typos in 0..=2 {
|
let mut edges = vec![];
|
||||||
let term = term_interner.get(*value).clone();
|
// Ngrams have a base typo cost
|
||||||
let new_term = match nbr_typos {
|
// 2-gram -> equivalent to 1 typo
|
||||||
0 => QueryTerm {
|
// 3-gram -> equivalent to 2 typos
|
||||||
original: term.original,
|
let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
|
||||||
is_prefix: term.is_prefix,
|
|
||||||
zero_typo: term.zero_typo,
|
for nbr_typos in 0..=original_full_term.max_nbr_typos {
|
||||||
prefix_of: term.prefix_of,
|
let mut term = term.clone();
|
||||||
// TOOD: debatable
|
match nbr_typos {
|
||||||
synonyms: term.synonyms,
|
0 => {
|
||||||
split_words: None,
|
term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
|
||||||
one_typo: Box::new([]),
|
term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
|
||||||
two_typos: Box::new([]),
|
|
||||||
use_prefix_db: term.use_prefix_db,
|
|
||||||
is_ngram: term.is_ngram,
|
|
||||||
phrase: term.phrase,
|
|
||||||
},
|
|
||||||
1 => {
|
|
||||||
// What about split words and synonyms here?
|
|
||||||
QueryTerm {
|
|
||||||
original: term.original,
|
|
||||||
is_prefix: false,
|
|
||||||
zero_typo: None,
|
|
||||||
prefix_of: Box::new([]),
|
|
||||||
synonyms: Box::new([]),
|
|
||||||
split_words: term.split_words,
|
|
||||||
one_typo: term.one_typo,
|
|
||||||
two_typos: Box::new([]),
|
|
||||||
use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
|
|
||||||
is_ngram: term.is_ngram,
|
|
||||||
phrase: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
2 => {
|
|
||||||
// What about split words and synonyms here?
|
|
||||||
QueryTerm {
|
|
||||||
original: term.original,
|
|
||||||
zero_typo: None,
|
|
||||||
is_prefix: false,
|
|
||||||
prefix_of: Box::new([]),
|
|
||||||
synonyms: Box::new([]),
|
|
||||||
split_words: None,
|
|
||||||
one_typo: Box::new([]),
|
|
||||||
two_typos: term.two_typos,
|
|
||||||
use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
|
|
||||||
is_ngram: term.is_ngram,
|
|
||||||
phrase: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => panic!(),
|
|
||||||
};
|
|
||||||
if !new_term.is_empty() {
|
|
||||||
edges.push((
|
|
||||||
nbr_typos as u8 + base_cost,
|
|
||||||
Some(
|
|
||||||
conditions_interner
|
|
||||||
.insert(TypoCondition { term: term_interner.push(new_term) }),
|
|
||||||
),
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Ok(edges)
|
1 => {
|
||||||
}
|
term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
|
||||||
QueryNodeData::End => Ok(vec![(0, None)]),
|
term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
|
||||||
QueryNodeData::Deleted | QueryNodeData::Start => panic!(),
|
}
|
||||||
|
2 => {
|
||||||
|
term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
|
||||||
|
term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
|
||||||
|
}
|
||||||
|
_ => panic!(),
|
||||||
|
};
|
||||||
|
|
||||||
|
edges.push((
|
||||||
|
nbr_typos as u32 + base_cost,
|
||||||
|
conditions_interner.insert(TypoCondition { term, nbr_typos }),
|
||||||
|
));
|
||||||
}
|
}
|
||||||
|
Ok(edges)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn log_state(
|
fn log_state(
|
||||||
@ -145,81 +83,18 @@ impl RankingRuleGraphTrait for TypoGraph {
|
|||||||
paths: &[Vec<Interned<TypoCondition>>],
|
paths: &[Vec<Interned<TypoCondition>>],
|
||||||
dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
dead_ends_cache: &DeadEndsCache<TypoCondition>,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
distances: &MappedInterner<QueryNode, Vec<u16>>,
|
distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||||
cost: u16,
|
cost: u64,
|
||||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
) {
|
) {
|
||||||
logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost);
|
logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
|
fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
|
||||||
let TypoCondition { term } = condition;
|
let TypoCondition { term, nbr_typos } = condition;
|
||||||
let term = ctx.term_interner.get(*term);
|
let original_term = ctx.term_interner.get(term.term_subset.original);
|
||||||
let QueryTerm {
|
let original = ctx.word_interner.get(original_term.original);
|
||||||
original: _,
|
|
||||||
is_ngram: _,
|
|
||||||
is_prefix: _,
|
|
||||||
phrase,
|
|
||||||
zero_typo,
|
|
||||||
prefix_of,
|
|
||||||
synonyms,
|
|
||||||
split_words,
|
|
||||||
one_typo,
|
|
||||||
two_typos,
|
|
||||||
use_prefix_db,
|
|
||||||
} = term;
|
|
||||||
let mut s = String::new();
|
|
||||||
if let Some(phrase) = phrase {
|
|
||||||
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
|
|
||||||
writeln!(&mut s, "\"{phrase}\" : phrase").unwrap();
|
|
||||||
}
|
|
||||||
if let Some(w) = zero_typo {
|
|
||||||
let w = ctx.word_interner.get(*w);
|
|
||||||
writeln!(&mut s, "\"{w}\" : 0 typo").unwrap();
|
|
||||||
}
|
|
||||||
for w in prefix_of.iter() {
|
|
||||||
let w = ctx.word_interner.get(*w);
|
|
||||||
writeln!(&mut s, "\"{w}\" : prefix").unwrap();
|
|
||||||
}
|
|
||||||
for w in one_typo.iter() {
|
|
||||||
let w = ctx.word_interner.get(*w);
|
|
||||||
writeln!(&mut s, "\"{w}\" : 1 typo").unwrap();
|
|
||||||
}
|
|
||||||
for w in two_typos.iter() {
|
|
||||||
let w = ctx.word_interner.get(*w);
|
|
||||||
writeln!(&mut s, "\"{w}\" : 2 typos").unwrap();
|
|
||||||
}
|
|
||||||
if let Some(phrase) = split_words {
|
|
||||||
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
|
|
||||||
writeln!(&mut s, "\"{phrase}\" : split words").unwrap();
|
|
||||||
}
|
|
||||||
for phrase in synonyms.iter() {
|
|
||||||
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
|
|
||||||
writeln!(&mut s, "\"{phrase}\" : synonym").unwrap();
|
|
||||||
}
|
|
||||||
if let Some(w) = use_prefix_db {
|
|
||||||
let w = ctx.word_interner.get(*w);
|
|
||||||
writeln!(&mut s, "\"{w}\" : use prefix db").unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(s)
|
Ok(format!("{original}: {nbr_typos}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
// fn words_used_by_condition<'ctx>(
|
|
||||||
// ctx: &mut SearchContext<'ctx>,
|
|
||||||
// condition: &Self::Condition,
|
|
||||||
// ) -> Result<HashSet<Interned<String>>> {
|
|
||||||
// let TypoCondition { term, .. } = condition;
|
|
||||||
// let term = ctx.term_interner.get(*term);
|
|
||||||
// Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn phrases_used_by_condition<'ctx>(
|
|
||||||
// ctx: &mut SearchContext<'ctx>,
|
|
||||||
// condition: &Self::Condition,
|
|
||||||
// ) -> Result<HashSet<Interned<Phrase>>> {
|
|
||||||
// let TypoCondition { term, .. } = condition;
|
|
||||||
// let term = ctx.term_interner.get(*term);
|
|
||||||
// Ok(HashSet::from_iter(term.all_phrases()))
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user