Add typo ranking rule to new search impl

This commit is contained in:
Loïc Lecrenier 2023-02-28 14:19:57 +01:00
parent 71f18e4379
commit caa1e1b923
6 changed files with 193 additions and 17 deletions

View File

@ -70,14 +70,16 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
) -> Result<Option<RankingRuleOutput<QueryGraph>>> { ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
assert!(universe.len() > 1); assert!(universe.len() > 1);
let mut state = self.state.take().unwrap(); let mut state = self.state.take().unwrap();
if state.cheapest_paths_state.is_none() {
let Some(mut cheapest_paths_state) = state.cheapest_paths_state.take() else {
return Ok(None); return Ok(None);
}; }
let mut paths = PathsMap::default(); let mut paths = PathsMap::default();
while paths.is_empty() { while paths.is_empty() {
let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else {
break;
};
if let Some(next_cheapest_paths_state) = cheapest_paths_state if let Some(next_cheapest_paths_state) = cheapest_paths_state
.compute_paths_of_next_lowest_cost( .compute_paths_of_next_lowest_cost(
&mut state.graph, &mut state.graph,
@ -85,13 +87,15 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
&mut paths, &mut paths,
) )
{ {
cheapest_paths_state = next_cheapest_paths_state; state.cheapest_paths_state = Some(next_cheapest_paths_state);
} else { } else {
self.state = None; break;
}
}
if paths.is_empty() && state.cheapest_paths_state.is_none() {
return Ok(None); return Ok(None);
} }
}
state.cheapest_paths_state = Some(cheapest_paths_state);
G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger); G::log_state(&state.graph, &paths, &state.empty_paths_cache, logger);

View File

@ -3,6 +3,7 @@ use roaring::RoaringBitmap;
use std::fs::File; use std::fs::File;
use std::{io::Write, path::PathBuf}; use std::{io::Write, path::PathBuf};
use crate::new::ranking_rule_graph::typo::TypoGraph;
use crate::new::{QueryNode, QueryGraph}; use crate::new::{QueryNode, QueryGraph};
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
@ -38,6 +39,11 @@ pub enum SearchEvents {
paths: PathsMap<u64>, paths: PathsMap<u64>,
empty_paths_cache: EmptyPathsCache, empty_paths_cache: EmptyPathsCache,
}, },
TypoState {
graph: RankingRuleGraph<TypoGraph>,
paths: PathsMap<u64>,
empty_paths_cache: EmptyPathsCache,
},
RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap }, RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap },
} }
@ -132,6 +138,9 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() })
} }
fn log_typo_state(&mut self, query_graph: &RankingRuleGraph<TypoGraph>, paths_map: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache) {
self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() })
}
} }
@ -251,7 +260,20 @@ results.{random} {{
let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let id = format!("{cur_ranking_rule}.{cur_activated_id}");
let new_file_path = self.folder_path.join(format!("{id}.d2")); let new_file_path = self.folder_path.join(format!("{id}.d2"));
let mut new_file = std::fs::File::create(new_file_path).unwrap(); let mut new_file = std::fs::File::create(new_file_path).unwrap();
Self::proximity_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
writeln!(
&mut file,
"{id} {{
link: \"{id}.d2.svg\"
}}").unwrap();
},
SearchEvents::TypoState { graph, paths, empty_paths_cache } => {
let cur_ranking_rule = timestamp.len() - 1;
let cur_activated_id = activated_id(&timestamp);
let id = format!("{cur_ranking_rule}.{cur_activated_id}");
let new_file_path = self.folder_path.join(format!("{id}.d2"));
let mut new_file = std::fs::File::create(new_file_path).unwrap();
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file);
writeln!( writeln!(
&mut file, &mut file,
"{id} {{ "{id} {{
@ -309,7 +331,7 @@ shape: class").unwrap();
} }
} }
} }
fn proximity_graph_d2_description(graph: &RankingRuleGraph<ProximityGraph>, paths: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache, file: &mut File) { fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &PathsMap<u64>, empty_paths_cache: &EmptyPathsCache, file: &mut File) {
writeln!(file,"direction: right").unwrap(); writeln!(file,"direction: right").unwrap();
writeln!(file, "Proximity Graph {{").unwrap(); writeln!(file, "Proximity Graph {{").unwrap();
@ -333,7 +355,7 @@ shape: class").unwrap();
writeln!(file, writeln!(file,
"{from_node} -> {to_node} : \"cost {cost} {edge_label}\"", "{from_node} -> {to_node} : \"cost {cost} {edge_label}\"",
cost = edge.cost, cost = edge.cost,
edge_label = ProximityGraph::graphviz_edge_details_label(details) edge_label = R::graphviz_edge_details_label(details)
).unwrap(); ).unwrap();
} }
} }
@ -354,7 +376,7 @@ shape: class").unwrap();
} }
writeln!(file, "}}").unwrap(); writeln!(file, "}}").unwrap();
} }
fn edge_d2_description(graph: &RankingRuleGraph<ProximityGraph>, paths_idx: &str, edge_idx: u32, file: &mut File) -> String { fn edge_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths_idx: &str, edge_idx: u32, file: &mut File) -> String {
let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ;
let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node = &graph.query_graph.nodes[*from_node as usize];
let from_node_desc = match from_node { let from_node_desc = match from_node {
@ -382,7 +404,7 @@ shape: class").unwrap();
}}").unwrap(); }}").unwrap();
edge_id edge_id
} }
fn paths_d2_description<T>(graph: &RankingRuleGraph<ProximityGraph>, paths_idx: &str, paths: &PathsMap<T>, file: &mut File) { fn paths_d2_description<R: RankingRuleGraphTrait, T>(graph: &RankingRuleGraph<R>, paths_idx: &str, paths: &PathsMap<T>, file: &mut File) {
for (edge_idx, rest) in paths.nodes.iter() { for (edge_idx, rest) in paths.nodes.iter() {
let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file); let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file);
for (dest_edge_idx, _) in rest.nodes.iter() { for (dest_edge_idx, _) in rest.nodes.iter() {

View File

@ -6,7 +6,7 @@ use roaring::RoaringBitmap;
use super::{ use super::{
ranking_rule_graph::{ ranking_rule_graph::{
empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph, empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph,
RankingRuleGraph, typo::TypoGraph, RankingRuleGraph,
}, },
RankingRule, RankingRuleQueryTrait, RankingRule, RankingRuleQueryTrait,
}; };
@ -61,6 +61,14 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_empty_paths_cache: &EmptyPathsCache, _empty_paths_cache: &EmptyPathsCache,
) { ) {
} }
fn log_typo_state(
&mut self,
query_graph: &RankingRuleGraph<TypoGraph>,
paths: &PathsMap<u64>,
empty_paths_cache: &EmptyPathsCache,
) {
}
} }
pub trait SearchLogger<Q: RankingRuleQueryTrait> { pub trait SearchLogger<Q: RankingRuleQueryTrait> {
@ -104,4 +112,11 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
paths: &PathsMap<u64>, paths: &PathsMap<u64>,
empty_paths_cache: &EmptyPathsCache, empty_paths_cache: &EmptyPathsCache,
); );
fn log_typo_state(
&mut self,
query_graph: &RankingRuleGraph<TypoGraph>,
paths: &PathsMap<u64>,
empty_paths_cache: &EmptyPathsCache,
);
} }

View File

@ -5,6 +5,7 @@ pub mod empty_paths_cache;
pub mod paths_map; pub mod paths_map;
pub mod proximity; pub mod proximity;
pub mod resolve_paths; pub mod resolve_paths;
pub mod typo;
use std::ops::ControlFlow; use std::ops::ControlFlow;

View File

@ -0,0 +1,131 @@
use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;
use super::empty_paths_cache::EmptyPathsCache;
use super::paths_map::PathsMap;
use super::{EdgeDetails, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::QueryNode;
use crate::{Index, Result, RoaringBitmapCodec};
/// Details attached to one edge of the typo ranking rule graph.
#[derive(Clone)]
pub enum TypoEdge {
/// Edge leading to a phrase query term; `TypoGraph` labels it with "0 typos".
Phrase,
/// Edge leading to a word query term, restricted to the derivations of that
/// word that have exactly `nbr_typos` typos.
///
/// Only the values 0, 1 and 2 are handled; `TypoGraph::compute_docids`
/// panics on anything else.
Word { derivations: WordDerivations, nbr_typos: u8 },
}
/// Type-level marker for the typo ranking rule graph.
///
/// The enum has no variants, so no value of it can exist; it is only named as
/// a type parameter and as the implementor of `RankingRuleGraphTrait`.
pub enum TypoGraph {}
impl RankingRuleGraphTrait for TypoGraph {
type EdgeDetails = TypoEdge;
type BuildVisitedFromNode = ();
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
match edge {
TypoEdge::Phrase => format!(", 0 typos"),
TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"),
}
}
fn compute_docids<'db_cache, 'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
edge: &Self::EdgeDetails,
) -> Result<roaring::RoaringBitmap> {
match edge {
TypoEdge::Phrase => todo!(),
TypoEdge::Word { derivations, nbr_typos } => {
let words = match nbr_typos {
0 => &derivations.zero_typo,
1 => &derivations.one_typo,
2 => &derivations.two_typos,
_ => panic!(),
};
let mut docids = RoaringBitmap::new();
for word in words.iter() {
let Some(bytes) = db_cache.get_word_docids(index, txn, word)? else { continue };
let bitmap =
RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
docids |= bitmap;
}
if *nbr_typos == 0 {
if let Some(bytes) =
db_cache.get_prefix_docids(index, txn, &derivations.original)?
{
let bitmap =
RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
docids |= bitmap;
}
}
Ok(docids)
}
}
}
fn build_visit_from_node<'transaction>(
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>> {
Ok(Some(()))
}
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
match to_node {
QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
QueryTerm::Phrase(_) => Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase))]),
QueryTerm::Word { derivations } => {
let mut edges = vec![];
if !derivations.zero_typo.is_empty() || derivations.use_prefix_db {
edges.push((
0,
EdgeDetails::Data(TypoEdge::Word {
derivations: derivations.clone(),
nbr_typos: 0,
}),
))
}
if !derivations.one_typo.is_empty() {
edges.push((
1,
EdgeDetails::Data(TypoEdge::Word {
derivations: derivations.clone(),
nbr_typos: 1,
}),
))
}
if !derivations.two_typos.is_empty() {
edges.push((
2,
EdgeDetails::Data(TypoEdge::Word {
derivations: derivations.clone(),
nbr_typos: 2,
}),
))
}
Ok(edges)
}
},
QueryNode::End => Ok(vec![(0, EdgeDetails::Unconditional)]),
QueryNode::Deleted | QueryNode::Start => panic!(),
}
}
fn log_state(
graph: &super::RankingRuleGraph<Self>,
paths: &PathsMap<u64>,
empty_paths_cache: &EmptyPathsCache,
logger: &mut dyn crate::new::logger::SearchLogger<crate::new::QueryGraph>,
) {
logger.log_typo_state(graph, paths, empty_paths_cache);
}
}

View File

@ -7,6 +7,7 @@ use super::logger::SearchLogger;
use super::QueryGraph; use super::QueryGraph;
use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
use crate::new::ranking_rule_graph::proximity::ProximityGraph; use crate::new::ranking_rule_graph::proximity::ProximityGraph;
use crate::new::ranking_rule_graph::typo::TypoGraph;
use crate::new::words::Words; use crate::new::words::Words;
use crate::search::new::sort::Sort; use crate::search::new::sort::Sort;
use crate::{Filter, Index, Result, TermsMatchingStrategy}; use crate::{Filter, Index, Result, TermsMatchingStrategy};
@ -125,9 +126,10 @@ pub fn execute_search<'transaction>(
let words = &mut Words::new(TermsMatchingStrategy::Last); let words = &mut Words::new(TermsMatchingStrategy::Last);
let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned()); let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());
// TODO: ranking rules given as argument // TODO: ranking rules given as argument
let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> = let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
vec![words, proximity, sort]; vec![words, typo, proximity, sort];
logger.ranking_rules(&ranking_rules); logger.ranking_rules(&ranking_rules);
@ -152,7 +154,7 @@ pub fn execute_search<'transaction>(
macro_rules! back { macro_rules! back {
() => { () => {
assert!(candidates[cur_ranking_rule_index].is_empty()); // assert!(candidates[cur_ranking_rule_index].is_empty());
logger.end_iteration_ranking_rule( logger.end_iteration_ranking_rule(
cur_ranking_rule_index, cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index], ranking_rules[cur_ranking_rule_index],
@ -230,6 +232,7 @@ pub fn execute_search<'transaction>(
); );
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else { let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else {
// TODO: add remaining candidates automatically here?
back!(); back!();
continue; continue;
}; };
@ -346,7 +349,7 @@ mod tests {
let mut db_cache = DatabaseCache::default(); let mut db_cache = DatabaseCache::default();
let query_graph = let query_graph =
make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
.unwrap(); .unwrap();
let mut logger = DetailedSearchLogger::new("log"); let mut logger = DetailedSearchLogger::new("log");