Update typo ranking rule to use new query term structure

This commit is contained in:
Loïc Lecrenier 2023-03-30 11:32:19 +02:00
parent fa81381865
commit 728710d63a
2 changed files with 63 additions and 188 deletions

View File

@ -68,8 +68,8 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
paths: &[Vec<Interned<ProximityCondition>>], paths: &[Vec<Interned<ProximityCondition>>],
dead_ends_cache: &DeadEndsCache<ProximityCondition>, dead_ends_cache: &DeadEndsCache<ProximityCondition>,
universe: &RoaringBitmap, universe: &RoaringBitmap,
distances: &MappedInterner<QueryNode, Vec<u16>>, distances: &MappedInterner<QueryNode, Vec<u64>>,
cost: u16, cost: u64,
); );
/// Logs the internal state of the typo ranking rule /// Logs the internal state of the typo ranking rule
@ -79,8 +79,8 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
paths: &[Vec<Interned<TypoCondition>>], paths: &[Vec<Interned<TypoCondition>>],
dead_ends_cache: &DeadEndsCache<TypoCondition>, dead_ends_cache: &DeadEndsCache<TypoCondition>,
universe: &RoaringBitmap, universe: &RoaringBitmap,
distances: &MappedInterner<QueryNode, Vec<u16>>, distances: &MappedInterner<QueryNode, Vec<u64>>,
cost: u16, cost: u64,
); );
} }
@ -139,8 +139,8 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_paths_map: &[Vec<Interned<ProximityCondition>>], _paths_map: &[Vec<Interned<ProximityCondition>>],
_dead_ends_cache: &DeadEndsCache<ProximityCondition>, _dead_ends_cache: &DeadEndsCache<ProximityCondition>,
_universe: &RoaringBitmap, _universe: &RoaringBitmap,
_distances: &MappedInterner<QueryNode, Vec<u16>>, _distances: &MappedInterner<QueryNode, Vec<u64>>,
_cost: u16, _cost: u64,
) { ) {
} }
@ -150,8 +150,8 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_paths: &[Vec<Interned<TypoCondition>>], _paths: &[Vec<Interned<TypoCondition>>],
_dead_ends_cache: &DeadEndsCache<TypoCondition>, _dead_ends_cache: &DeadEndsCache<TypoCondition>,
_universe: &RoaringBitmap, _universe: &RoaringBitmap,
_distances: &MappedInterner<QueryNode, Vec<u16>>, _distances: &MappedInterner<QueryNode, Vec<u64>>,
_cost: u16, _cost: u64,
) { ) {
} }
} }

View File

@ -1,21 +1,17 @@
// use std::collections::HashSet;
use std::fmt::Write;
use std::iter::FromIterator;
use fxhash::FxHashSet;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
use crate::search::new::logger::SearchLogger; use crate::search::new::logger::SearchLogger;
use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset};
use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::search::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result; use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
pub struct TypoCondition { pub struct TypoCondition {
term: Interned<QueryTerm>, term: LocatedQueryTermSubset,
nbr_typos: u8,
} }
pub enum TypoGraph {} pub enum TypoGraph {}
@ -23,121 +19,63 @@ pub enum TypoGraph {}
impl RankingRuleGraphTrait for TypoGraph { impl RankingRuleGraphTrait for TypoGraph {
type Condition = TypoCondition; type Condition = TypoCondition;
fn resolve_condition<'db_cache>( fn resolve_condition(
ctx: &mut SearchContext, ctx: &mut SearchContext,
condition: &Self::Condition, condition: &Self::Condition,
universe: &RoaringBitmap, universe: &RoaringBitmap,
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> { ) -> Result<ComputedCondition> {
let SearchContext { let TypoCondition { term, .. } = condition;
index, // maybe compute_query_term_subset_docids should accept a universe as argument
txn, let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
db_cache, docids &= universe;
word_interner,
phrase_interner,
term_interner,
term_docids: query_term_docids,
} = ctx;
let docids = universe Ok(ComputedCondition {
& query_term_docids.get_query_term_docids(
index,
txn,
db_cache,
word_interner,
term_interner,
phrase_interner,
condition.term,
)?;
let term = term_interner.get(condition.term);
Ok((
docids, docids,
FxHashSet::from_iter(term.all_single_words_except_prefix_db()), universe_len: universe.len(),
FxHashSet::from_iter(term.all_phrases()), start_term_subset: None,
)) end_term_subset: term.clone(),
})
} }
fn build_edges( fn build_edges(
ctx: &mut SearchContext, ctx: &mut SearchContext,
conditions_interner: &mut DedupInterner<Self::Condition>, conditions_interner: &mut DedupInterner<Self::Condition>,
_from_node: &QueryNode, _from: Option<&LocatedQueryTermSubset>,
to_node: &QueryNode, to_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u8, Option<Interned<Self::Condition>>)>> { ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
let SearchContext { term_interner, .. } = ctx; let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term;
match &to_node.data { let original_full_term = ctx.term_interner.get(term.term_subset.original);
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
let mut edges = vec![];
// Ngrams have a base typo cost
// 2-gram -> equivalent to 1 typo
// 3-gram -> equivalent to 2 typos
let base_cost = positions.len().min(2) as u8;
for nbr_typos in 0..=2 { let mut edges = vec![];
let term = term_interner.get(*value).clone(); // Ngrams have a base typo cost
let new_term = match nbr_typos { // 2-gram -> equivalent to 1 typo
0 => QueryTerm { // 3-gram -> equivalent to 2 typos
original: term.original, let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
is_prefix: term.is_prefix,
zero_typo: term.zero_typo, for nbr_typos in 0..=original_full_term.max_nbr_typos {
prefix_of: term.prefix_of, let mut term = term.clone();
// TODO: debatable match nbr_typos {
synonyms: term.synonyms, 0 => {
split_words: None, term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
one_typo: Box::new([]), term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
two_typos: Box::new([]),
use_prefix_db: term.use_prefix_db,
is_ngram: term.is_ngram,
phrase: term.phrase,
},
1 => {
// What about split words and synonyms here?
QueryTerm {
original: term.original,
is_prefix: false,
zero_typo: None,
prefix_of: Box::new([]),
synonyms: Box::new([]),
split_words: term.split_words,
one_typo: term.one_typo,
two_typos: Box::new([]),
use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
is_ngram: term.is_ngram,
phrase: None,
}
}
2 => {
// What about split words and synonyms here?
QueryTerm {
original: term.original,
zero_typo: None,
is_prefix: false,
prefix_of: Box::new([]),
synonyms: Box::new([]),
split_words: None,
one_typo: Box::new([]),
two_typos: term.two_typos,
use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
is_ngram: term.is_ngram,
phrase: None,
}
}
_ => panic!(),
};
if !new_term.is_empty() {
edges.push((
nbr_typos as u8 + base_cost,
Some(
conditions_interner
.insert(TypoCondition { term: term_interner.push(new_term) }),
),
))
}
} }
Ok(edges) 1 => {
} term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
QueryNodeData::End => Ok(vec![(0, None)]), term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
QueryNodeData::Deleted | QueryNodeData::Start => panic!(), }
2 => {
term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
}
_ => panic!(),
};
edges.push((
nbr_typos as u32 + base_cost,
conditions_interner.insert(TypoCondition { term, nbr_typos }),
));
} }
Ok(edges)
} }
fn log_state( fn log_state(
@ -145,81 +83,18 @@ impl RankingRuleGraphTrait for TypoGraph {
paths: &[Vec<Interned<TypoCondition>>], paths: &[Vec<Interned<TypoCondition>>],
dead_ends_cache: &DeadEndsCache<TypoCondition>, dead_ends_cache: &DeadEndsCache<TypoCondition>,
universe: &RoaringBitmap, universe: &RoaringBitmap,
distances: &MappedInterner<QueryNode, Vec<u16>>, distances: &MappedInterner<QueryNode, Vec<u64>>,
cost: u16, cost: u64,
logger: &mut dyn SearchLogger<QueryGraph>, logger: &mut dyn SearchLogger<QueryGraph>,
) { ) {
logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost); logger.log_typo_state(graph, paths, dead_ends_cache, universe, distances, cost);
} }
fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> { fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
let TypoCondition { term } = condition; let TypoCondition { term, nbr_typos } = condition;
let term = ctx.term_interner.get(*term); let original_term = ctx.term_interner.get(term.term_subset.original);
let QueryTerm { let original = ctx.word_interner.get(original_term.original);
original: _,
is_ngram: _,
is_prefix: _,
phrase,
zero_typo,
prefix_of,
synonyms,
split_words,
one_typo,
two_typos,
use_prefix_db,
} = term;
let mut s = String::new();
if let Some(phrase) = phrase {
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
writeln!(&mut s, "\"{phrase}\" : phrase").unwrap();
}
if let Some(w) = zero_typo {
let w = ctx.word_interner.get(*w);
writeln!(&mut s, "\"{w}\" : 0 typo").unwrap();
}
for w in prefix_of.iter() {
let w = ctx.word_interner.get(*w);
writeln!(&mut s, "\"{w}\" : prefix").unwrap();
}
for w in one_typo.iter() {
let w = ctx.word_interner.get(*w);
writeln!(&mut s, "\"{w}\" : 1 typo").unwrap();
}
for w in two_typos.iter() {
let w = ctx.word_interner.get(*w);
writeln!(&mut s, "\"{w}\" : 2 typos").unwrap();
}
if let Some(phrase) = split_words {
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
writeln!(&mut s, "\"{phrase}\" : split words").unwrap();
}
for phrase in synonyms.iter() {
let phrase = ctx.phrase_interner.get(*phrase).description(&ctx.word_interner);
writeln!(&mut s, "\"{phrase}\" : synonym").unwrap();
}
if let Some(w) = use_prefix_db {
let w = ctx.word_interner.get(*w);
writeln!(&mut s, "\"{w}\" : use prefix db").unwrap();
}
Ok(s) Ok(format!("{original}: {nbr_typos}"))
} }
// fn words_used_by_condition<'ctx>(
// ctx: &mut SearchContext<'ctx>,
// condition: &Self::Condition,
// ) -> Result<HashSet<Interned<String>>> {
// let TypoCondition { term, .. } = condition;
// let term = ctx.term_interner.get(*term);
// Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
// }
// fn phrases_used_by_condition<'ctx>(
// ctx: &mut SearchContext<'ctx>,
// condition: &Self::Condition,
// ) -> Result<HashSet<Interned<Phrase>>> {
// let TypoCondition { term, .. } = condition;
// let term = ctx.term_interner.get(*term);
// Ok(HashSet::from_iter(term.all_phrases()))
// }
} }