From caa1e1b9234932765f6a752269c12d2f6864e050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 28 Feb 2023 14:19:57 +0100 Subject: [PATCH] Add typo ranking rule to new search impl --- .../search/new/graph_based_ranking_rule.rs | 18 ++- milli/src/search/new/logger/detailed.rs | 34 ++++- milli/src/search/new/logger/mod.rs | 17 ++- .../src/search/new/ranking_rule_graph/mod.rs | 1 + .../search/new/ranking_rule_graph/typo/mod.rs | 131 ++++++++++++++++++ milli/src/search/new/ranking_rules.rs | 9 +- 6 files changed, 193 insertions(+), 17 deletions(-) create mode 100644 milli/src/search/new/ranking_rule_graph/typo/mod.rs diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 2bf7885bd..264686b0a 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -70,14 +70,16 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap ) -> Result>> { assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); - - let Some(mut cheapest_paths_state) = state.cheapest_paths_state.take() else { + if state.cheapest_paths_state.is_none() { return Ok(None); - }; + } let mut paths = PathsMap::default(); while paths.is_empty() { + let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else { + break; + }; if let Some(next_cheapest_paths_state) = cheapest_paths_state .compute_paths_of_next_lowest_cost( &mut state.graph, @@ -85,13 +87,15 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap &mut paths, ) { - cheapest_paths_state = next_cheapest_paths_state; + state.cheapest_paths_state = Some(next_cheapest_paths_state); } else { - self.state = None; - return Ok(None); + break; } } - state.cheapest_paths_state = Some(cheapest_paths_state); + + if paths.is_empty() && state.cheapest_paths_state.is_none() { + return Ok(None); + } G::log_state(&state.graph, &paths, 
&state.empty_paths_cache, logger); diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index a85d20ccc..a7a3f8793 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -3,6 +3,7 @@ use roaring::RoaringBitmap; use std::fs::File; use std::{io::Write, path::PathBuf}; +use crate::new::ranking_rule_graph::typo::TypoGraph; use crate::new::{QueryNode, QueryGraph}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; @@ -38,6 +39,11 @@ pub enum SearchEvents { paths: PathsMap, empty_paths_cache: EmptyPathsCache, }, + TypoState { + graph: RankingRuleGraph, + paths: PathsMap, + empty_paths_cache: EmptyPathsCache, + }, RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap }, } @@ -132,7 +138,10 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) } - + fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &PathsMap, empty_paths_cache: &EmptyPathsCache) { + self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.clone(), empty_paths_cache: empty_paths_cache.clone() }) + } + } impl DetailedSearchLogger { @@ -251,7 +260,20 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::proximity_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); + Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); + writeln!( + &mut file, + "{id} {{ + link: \"{id}.d2.svg\" +}}").unwrap(); + }, + SearchEvents::TypoState { graph, paths, empty_paths_cache } => { + let cur_ranking_rule = 
timestamp.len() - 1; + let cur_activated_id = activated_id(&timestamp); + let id = format!("{cur_ranking_rule}.{cur_activated_id}"); + let new_file_path = self.folder_path.join(format!("{id}.d2")); + let mut new_file = std::fs::File::create(new_file_path).unwrap(); + Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, &mut new_file); writeln!( &mut file, "{id} {{ @@ -309,7 +331,7 @@ shape: class").unwrap(); } } } - fn proximity_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, file: &mut File) { + fn ranking_rule_graph_d2_description(graph: &RankingRuleGraph, paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -333,7 +355,7 @@ shape: class").unwrap(); writeln!(file, "{from_node} -> {to_node} : \"cost {cost} {edge_label}\"", cost = edge.cost, - edge_label = ProximityGraph::graphviz_edge_details_label(details) + edge_label = R::graphviz_edge_details_label(details) ).unwrap(); } } @@ -354,7 +376,7 @@ shape: class").unwrap(); } writeln!(file, "}}").unwrap(); } - fn edge_d2_description(graph: &RankingRuleGraph, paths_idx: &str, edge_idx: u32, file: &mut File) -> String { + fn edge_d2_description(graph: &RankingRuleGraph, paths_idx: &str, edge_idx: u32, file: &mut File) -> String { let Edge { from_node, to_node, cost, .. 
} = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { @@ -382,7 +404,7 @@ shape: class").unwrap(); }}").unwrap(); edge_id } - fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { + fn paths_d2_description(graph: &RankingRuleGraph, paths_idx: &str, paths: &PathsMap, file: &mut File) { for (edge_idx, rest) in paths.nodes.iter() { let edge_id = Self::edge_d2_description(graph, paths_idx, *edge_idx, file); for (dest_edge_idx, _) in rest.nodes.iter() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 3b828f7cb..4e119ae42 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use super::{ ranking_rule_graph::{ empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, proximity::ProximityGraph, - RankingRuleGraph, + typo::TypoGraph, RankingRuleGraph, }, RankingRule, RankingRuleQueryTrait, }; @@ -61,6 +61,14 @@ impl SearchLogger for DefaultSearchLogger { _empty_paths_cache: &EmptyPathsCache, ) { } + + fn log_typo_state( + &mut self, + query_graph: &RankingRuleGraph, + paths: &PathsMap, + empty_paths_cache: &EmptyPathsCache, + ) { + } } pub trait SearchLogger { @@ -104,4 +112,11 @@ pub trait SearchLogger { paths: &PathsMap, empty_paths_cache: &EmptyPathsCache, ); + + fn log_typo_state( + &mut self, + query_graph: &RankingRuleGraph, + paths: &PathsMap, + empty_paths_cache: &EmptyPathsCache, + ); } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 3a396f3dc..b1adb80fc 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -5,6 +5,7 @@ pub mod empty_paths_cache; pub mod paths_map; pub mod proximity; pub mod resolve_paths; +pub mod typo; use std::ops::ControlFlow; diff --git 
a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs new file mode 100644 index 000000000..55a45e3c3 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -0,0 +1,131 @@ +use heed::{BytesDecode, RoTxn}; +use roaring::RoaringBitmap; + +use super::empty_paths_cache::EmptyPathsCache; +use super::paths_map::PathsMap; +use super::{EdgeDetails, RankingRuleGraphTrait}; +use crate::new::db_cache::DatabaseCache; +use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use crate::new::QueryNode; +use crate::{Index, Result, RoaringBitmapCodec}; + +#[derive(Clone)] +pub enum TypoEdge { + Phrase, + Word { derivations: WordDerivations, nbr_typos: u8 }, +} + +pub enum TypoGraph {} + +impl RankingRuleGraphTrait for TypoGraph { + type EdgeDetails = TypoEdge; + type BuildVisitedFromNode = (); + + fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String { + match edge { + TypoEdge::Phrase => format!(", 0 typos"), + TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"), + } + } + + fn compute_docids<'db_cache, 'transaction>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + edge: &Self::EdgeDetails, + ) -> Result { + match edge { + TypoEdge::Phrase => todo!(), + TypoEdge::Word { derivations, nbr_typos } => { + let words = match nbr_typos { + 0 => &derivations.zero_typo, + 1 => &derivations.one_typo, + 2 => &derivations.two_typos, + _ => panic!(), + }; + let mut docids = RoaringBitmap::new(); + for word in words.iter() { + let Some(bytes) = db_cache.get_word_docids(index, txn, word)? else { continue }; + let bitmap = + RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; + docids |= bitmap; + } + if *nbr_typos == 0 { + if let Some(bytes) = + db_cache.get_prefix_docids(index, txn, &derivations.original)? 
+ { + let bitmap = + RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?; + docids |= bitmap; + } + } + Ok(docids) + } + } + } + + fn build_visit_from_node<'transaction>( + _index: &Index, + _txn: &'transaction RoTxn, + _db_cache: &mut DatabaseCache<'transaction>, + from_node: &QueryNode, + ) -> Result> { + Ok(Some(())) + } + + fn build_visit_to_node<'from_data, 'transaction: 'from_data>( + index: &Index, + txn: &'transaction RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + to_node: &QueryNode, + from_node_data: &'from_data Self::BuildVisitedFromNode, + ) -> Result)>> { + match to_node { + QueryNode::Term(LocatedQueryTerm { value, .. }) => match value { + QueryTerm::Phrase(_) => Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase))]), + QueryTerm::Word { derivations } => { + let mut edges = vec![]; + if !derivations.zero_typo.is_empty() || derivations.use_prefix_db { + edges.push(( + 0, + EdgeDetails::Data(TypoEdge::Word { + derivations: derivations.clone(), + nbr_typos: 0, + }), + )) + } + if !derivations.one_typo.is_empty() { + edges.push(( + 1, + EdgeDetails::Data(TypoEdge::Word { + derivations: derivations.clone(), + nbr_typos: 1, + }), + )) + } + if !derivations.two_typos.is_empty() { + edges.push(( + 2, + EdgeDetails::Data(TypoEdge::Word { + derivations: derivations.clone(), + nbr_typos: 2, + }), + )) + } + Ok(edges) + } + }, + QueryNode::End => Ok(vec![(0, EdgeDetails::Unconditional)]), + QueryNode::Deleted | QueryNode::Start => panic!(), + } + } + + fn log_state( + graph: &super::RankingRuleGraph, + paths: &PathsMap, + empty_paths_cache: &EmptyPathsCache, + logger: &mut dyn crate::new::logger::SearchLogger, + ) { + logger.log_typo_state(graph, paths, empty_paths_cache); + } +} diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index f3f71ab4b..f023f94d1 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -7,6 +7,7 @@ use super::logger::SearchLogger; use 
super::QueryGraph; use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; use crate::new::ranking_rule_graph::proximity::ProximityGraph; +use crate::new::ranking_rule_graph::typo::TypoGraph; use crate::new::words::Words; use crate::search::new::sort::Sort; use crate::{Filter, Index, Result, TermsMatchingStrategy}; @@ -125,9 +126,10 @@ pub fn execute_search<'transaction>( let words = &mut Words::new(TermsMatchingStrategy::Last); let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned()); + let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned()); // TODO: ranking rules given as argument let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> = - vec![words, proximity, sort]; + vec![words, typo, proximity, sort]; logger.ranking_rules(&ranking_rules); @@ -152,7 +154,7 @@ pub fn execute_search<'transaction>( macro_rules! back { () => { - assert!(candidates[cur_ranking_rule_index].is_empty()); + // assert!(candidates[cur_ranking_rule_index].is_empty()); logger.end_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], @@ -230,6 +232,7 @@ pub fn execute_search<'transaction>( ); let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else { + // TODO: add remaining candidates automatically here? back!(); continue; }; @@ -346,7 +349,7 @@ mod tests { let mut db_cache = DatabaseCache::default(); let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") + make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") .unwrap(); let mut logger = DetailedSearchLogger::new("log");