From e8c76cf7bfe54b7291793b24c72c9b4efdbfe48c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 6 Mar 2023 19:21:55 +0100 Subject: [PATCH] Intern all strings and phrases in the search logic --- milli/src/search/new/db_cache.rs | 122 ++++++------- .../search/new/graph_based_ranking_rule.rs | 47 ++--- milli/src/search/new/interner.rs | 78 ++++++++ milli/src/search/new/logger/detailed.rs | 71 +++++--- milli/src/search/new/mod.rs | 56 +++--- milli/src/search/new/query_graph.rs | 69 ++++---- milli/src/search/new/query_term.rs | 166 +++++++++++------- .../search/new/ranking_rule_graph/build.rs | 21 +-- .../ranking_rule_graph/edge_docids_cache.rs | 19 +- .../src/search/new/ranking_rule_graph/mod.rs | 75 +++++--- .../new/ranking_rule_graph/proximity/build.rs | 95 +++++----- .../proximity/compute_docids.rs | 26 ++- .../new/ranking_rule_graph/proximity/mod.rs | 43 ++--- .../new/ranking_rule_graph/resolve_paths.rs | 23 +-- .../search/new/ranking_rule_graph/typo/mod.rs | 46 ++--- milli/src/search/new/ranking_rules.rs | 162 +++++------------ milli/src/search/new/resolve_query_graph.rs | 90 ++++------ milli/src/search/new/sort.rs | 41 ++--- milli/src/search/new/words.rs | 39 ++-- 19 files changed, 635 insertions(+), 654 deletions(-) create mode 100644 milli/src/search/new/interner.rs diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index f2f8f12c5..cfd69b04f 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -1,51 +1,48 @@ -use std::collections::hash_map::Entry; - +use super::{interner::Interned, SearchContext}; +use crate::Result; use fxhash::FxHashMap; use heed::types::ByteSlice; -use heed::RoTxn; - -use crate::{Index, Result}; +use std::collections::hash_map::Entry; #[derive(Default)] -pub struct DatabaseCache<'transaction> { - pub word_pair_proximity_docids: FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, +pub struct DatabaseCache<'search> { + // TODO: interner for all database cache keys + pub word_pair_proximity_docids: + FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, pub word_prefix_pair_proximity_docids: - FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, + FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, pub prefix_word_pair_proximity_docids: - FxHashMap<(u8, String, String), Option<&'transaction [u8]>>, - pub word_docids: FxHashMap>, - pub exact_word_docids: FxHashMap>, - pub word_prefix_docids: FxHashMap>, + FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, + pub word_docids: FxHashMap, Option<&'search [u8]>>, + pub exact_word_docids: FxHashMap, Option<&'search [u8]>>, + pub word_prefix_docids: FxHashMap, Option<&'search [u8]>>, } -impl<'transaction> DatabaseCache<'transaction> { - pub fn get_word_docids( - &mut self, - index: &Index, - txn: &'transaction RoTxn, - word: &str, - ) -> Result> { - let bitmap_ptr = match self.word_docids.entry(word.to_owned()) { +impl<'search> SearchContext<'search> { + pub fn get_word_docids(&mut self, word: Interned) -> Result> { + let bitmap_ptr = match self.db_cache.word_docids.entry(word) { Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(), Entry::Vacant(entry) => { - let bitmap_ptr = index.word_docids.remap_data_type::().get(txn, word)?; + let bitmap_ptr = self + .index + .word_docids + .remap_data_type::() + .get(self.txn, self.word_interner.get(word))?; entry.insert(bitmap_ptr); bitmap_ptr } }; Ok(bitmap_ptr) } - pub fn get_prefix_docids( - &mut self, - index: &Index, - txn: &'transaction RoTxn, - prefix: 
&str, - ) -> Result> { + pub fn get_prefix_docids(&mut self, prefix: Interned) -> Result> { // In the future, this will be a frozen roaring bitmap - let bitmap_ptr = match self.word_prefix_docids.entry(prefix.to_owned()) { + let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) { Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(), Entry::Vacant(entry) => { - let bitmap_ptr = - index.word_prefix_docids.remap_data_type::().get(txn, prefix)?; + let bitmap_ptr = self + .index + .word_prefix_docids + .remap_data_type::() + .get(self.txn, self.word_interner.get(prefix))?; entry.insert(bitmap_ptr); bitmap_ptr } @@ -55,14 +52,12 @@ impl<'transaction> DatabaseCache<'transaction> { pub fn get_word_pair_proximity_docids( &mut self, - index: &Index, - txn: &'transaction RoTxn, - word1: &str, - word2: &str, + word1: Interned, + word2: Interned, proximity: u8, - ) -> Result> { - let key = (proximity, word1.to_owned(), word2.to_owned()); - match self.word_pair_proximity_docids.entry(key.clone()) { + ) -> Result> { + let key = (proximity, word1, word2); + match self.db_cache.word_pair_proximity_docids.entry(key) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { // We shouldn't greedily access this DB at all @@ -86,10 +81,11 @@ impl<'transaction> DatabaseCache<'transaction> { // output.push(word1, word2, proximities); // } // } - let bitmap_ptr = index - .word_pair_proximity_docids - .remap_data_type::() - .get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?; + let bitmap_ptr = + self.index.word_pair_proximity_docids.remap_data_type::().get( + self.txn, + &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)), + )?; entry.insert(bitmap_ptr); Ok(bitmap_ptr) } @@ -98,20 +94,22 @@ impl<'transaction> DatabaseCache<'transaction> { pub fn get_word_prefix_pair_proximity_docids( &mut self, - index: &Index, - txn: &'transaction RoTxn, - word1: &str, - prefix2: &str, + word1: Interned, + prefix2: Interned, proximity: u8, - ) -> Result> { - let key = (proximity, word1.to_owned(), prefix2.to_owned()); - match self.word_prefix_pair_proximity_docids.entry(key.clone()) { + ) -> Result> { + let key = (proximity, word1, prefix2); + match self.db_cache.word_prefix_pair_proximity_docids.entry(key) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { - let bitmap_ptr = index + let bitmap_ptr = self + .index .word_prefix_pair_proximity_docids .remap_data_type::() - .get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?; + .get( + self.txn, + &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)), + )?; entry.insert(bitmap_ptr); Ok(bitmap_ptr) } @@ -119,20 +117,26 @@ impl<'transaction> DatabaseCache<'transaction> { } pub fn get_prefix_word_pair_proximity_docids( &mut self, - index: &Index, - txn: &'transaction RoTxn, - left_prefix: &str, - right: &str, + left_prefix: Interned, + right: Interned, proximity: u8, - ) -> Result> { - let key = (proximity, left_prefix.to_owned(), right.to_owned()); - match self.prefix_word_pair_proximity_docids.entry(key) { + ) -> Result> { + let key = (proximity, left_prefix, right); + match self.db_cache.prefix_word_pair_proximity_docids.entry(key) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { - let bitmap_ptr = index + let bitmap_ptr = self + .index .prefix_word_pair_proximity_docids .remap_data_type::() - .get(txn, &(proximity, left_prefix, right))?; + .get( + self.txn, + &( + proximity, + self.word_interner.get(left_prefix), + self.word_interner.get(right), 
+ ), + )?; entry.insert(bitmap_ptr); Ok(bitmap_ptr) } diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index d1f5864aa..d51fb6920 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -1,15 +1,11 @@ -use heed::RoTxn; -use roaring::RoaringBitmap; - -use super::db_cache::DatabaseCache; use super::logger::SearchLogger; use super::ranking_rule_graph::EdgeDocidsCache; use super::ranking_rule_graph::EmptyPathsCache; - use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::SearchContext; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; - -use crate::{Index, Result}; +use crate::Result; +use roaring::RoaringBitmap; pub struct GraphBasedRankingRule { id: String, @@ -29,12 +25,10 @@ pub struct GraphBasedRankingRuleState { cur_distance_idx: usize, } -fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>( +fn remove_empty_edges<'search, G: RankingRuleGraphTrait>( + ctx: &mut SearchContext<'search>, graph: &mut RankingRuleGraph, edge_docids_cache: &mut EdgeDocidsCache, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, universe: &RoaringBitmap, empty_paths_cache: &mut EmptyPathsCache, ) -> Result<()> { @@ -42,8 +36,7 @@ fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>( if graph.all_edges[edge_index as usize].is_none() { continue; } - let docids = edge_docids_cache - .get_edge_docids(index, txn, db_cache, edge_index, &*graph, universe)?; + let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?; match docids { BitmapOrAllRef::Bitmap(bitmap) => { if bitmap.is_disjoint(universe) { @@ -59,7 +52,7 @@ fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>( Ok(()) } -impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph> +impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> for GraphBasedRankingRule { fn id(&self) -> String { @@ -67,24 +60,20 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap } fn start_iteration( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, _logger: &mut dyn SearchLogger, universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { // TODO: update old state instead of starting from scratch - let mut graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; + let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let mut edge_docids_cache = EdgeDocidsCache::default(); let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len()); remove_empty_edges( + ctx, &mut graph, &mut edge_docids_cache, - index, - txn, - db_cache, universe, &mut empty_paths_cache, )?; @@ -105,20 +94,16 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap fn next_bucket( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); remove_empty_edges( + ctx, &mut state.graph, &mut state.edge_docids_cache, - index, - txn, - db_cache, universe, &mut state.empty_paths_cache, )?; @@ -151,9 +136,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap ); 
let bucket = state.graph.resolve_paths( - index, - txn, - db_cache, + ctx, &mut state.edge_docids_cache, &mut state.empty_paths_cache, universe, @@ -169,9 +152,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap fn end_iteration( &mut self, - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + _ctx: &mut SearchContext<'search>, _logger: &mut dyn SearchLogger, ) { self.state = None; diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs new file mode 100644 index 000000000..ae0a4e9cb --- /dev/null +++ b/milli/src/search/new/interner.rs @@ -0,0 +1,78 @@ +use fxhash::FxHashMap; +use std::hash::Hash; +use std::marker::PhantomData; + +pub struct Interned { + idx: u32, + _phantom: PhantomData, +} + +impl Interned { + fn new(idx: u32) -> Self { + Self { idx, _phantom: PhantomData } + } +} + +pub struct Interner { + stable_store: Vec, + lookup: FxHashMap>, +} +impl Default for Interner { + fn default() -> Self { + Self { stable_store: Default::default(), lookup: Default::default() } + } +} + +impl Interner +where + T: Clone + Eq + Hash, +{ + pub fn insert(&mut self, s: T) -> Interned { + if let Some(interned) = self.lookup.get(&s) { + *interned + } else { + self.stable_store.push(s.clone()); + let interned = Interned::new(self.stable_store.len() as u32 - 1); + self.lookup.insert(s, interned); + interned + } + } + pub fn get(&self, interned: Interned) -> &T { + &self.stable_store[interned.idx as usize] + } +} + +// Interned boilerplate implementations + +impl Hash for Interned { + fn hash(&self, state: &mut H) { + self.idx.hash(state); + } +} + +impl Ord for Interned { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.idx.cmp(&other.idx) + } +} + +impl PartialOrd for Interned { + fn partial_cmp(&self, other: &Self) -> Option { + self.idx.partial_cmp(&other.idx) + } +} + +impl Eq for Interned {} + +impl PartialEq for Interned { + fn eq(&self, other: &Self) -> bool { + self.idx == other.idx + } +} +impl Clone for Interned { + fn clone(&self) -> Self { + Self { idx: self.idx, _phantom: PhantomData } + } +} + +impl Copy for Interned {} diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 4282db27f..76c3f8977 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,7 +6,7 @@ use std::time::Instant; use std::{io::Write, path::PathBuf}; use crate::new::ranking_rule_graph::TypoGraph; -use crate::new::{QueryNode, QueryGraph}; +use crate::new::{QueryNode, QueryGraph, SearchContext}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; @@ -176,7 +176,7 @@ impl SearchLogger for DetailedSearchLogger { } impl DetailedSearchLogger { - pub fn write_d2_description(&self) { + pub fn write_d2_description(&self,ctx: &mut SearchContext,) { let mut prev_time = self.initial_query_time.unwrap(); let mut timestamp = vec![]; fn activated_id(timestamp: &[usize]) -> String { @@ -193,12 +193,12 @@ impl DetailedSearchLogger { writeln!(&mut file, "direction: right").unwrap(); writeln!(&mut file, "Initial Query Graph: {{").unwrap(); let initial_query_graph = self.initial_query.as_ref().unwrap(); - Self::query_graph_d2_description(initial_query_graph, &mut file); + Self::query_graph_d2_description(ctx, initial_query_graph, &mut file); writeln!(&mut file, "}}").unwrap(); 
writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap(); let query_graph_for_universe = self.query_for_universe.as_ref().unwrap(); - Self::query_graph_d2_description(query_graph_for_universe, &mut file); + Self::query_graph_d2_description(ctx, query_graph_for_universe, &mut file); writeln!(&mut file, "}}").unwrap(); let initial_universe = self.initial_universe.as_ref().unwrap(); @@ -308,7 +308,7 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::query_graph_d2_description(query_graph, &mut new_file); + Self::query_graph_d2_description(ctx, query_graph, &mut new_file); writeln!( &mut file, "{id} {{ @@ -323,7 +323,7 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file); + Self::ranking_rule_graph_d2_description(ctx, graph, paths, empty_paths_cache, distances.clone(), &mut new_file); writeln!( &mut file, "{id} {{ @@ -339,7 +339,7 @@ results.{random} {{ let id = format!("{cur_ranking_rule}.{cur_activated_id}"); let new_file_path = self.folder_path.join(format!("{id}.d2")); let mut new_file = std::fs::File::create(new_file_path).unwrap(); - Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file); + Self::ranking_rule_graph_d2_description(ctx,graph, paths, empty_paths_cache, distances.clone(), &mut new_file); writeln!( &mut file, "{id} {{ @@ -352,31 +352,40 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) { + fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. 
}) => { match value { QueryTerm::Phrase { phrase } => { - let phrase_str = phrase.description(); + let phrase = ctx.phrase_interner.get(*phrase); + let phrase_str = phrase.description(&ctx.word_interner); writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap(); }, QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => { + let original = ctx.word_interner.get(*original); writeln!(file,"{node_idx} : \"{original}\" {{ shape: class").unwrap(); - for w in zero_typo { + for w in zero_typo.iter().copied() { + let w = ctx.word_interner.get(w); writeln!(file, "\"{w}\" : 0").unwrap(); } - for w in one_typo { + for w in one_typo.iter().copied() { + let w = ctx.word_interner.get(w); writeln!(file, "\"{w}\" : 1").unwrap(); } - for w in two_typos { + for w in two_typos.iter().copied() { + let w = ctx.word_interner.get(w); writeln!(file, "\"{w}\" : 2").unwrap(); } - if let Some((left, right)) = split_words { - writeln!(file, "\"{left} {right}\" : split_words").unwrap(); + if let Some(split_words) = split_words { + let phrase = ctx.phrase_interner.get(*split_words); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); } - for synonym in synonyms { - writeln!(file, "\"{}\" : synonym", synonym.description()).unwrap(); + for synonym in synonyms.iter().copied() { + let phrase = ctx.phrase_interner.get(synonym); + let phrase_str = phrase.description(&ctx.word_interner); + writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); } if *use_prefix_db { writeln!(file, "use prefix DB : true").unwrap(); @@ -398,20 +407,20 @@ shape: class").unwrap(); }, } } - fn query_graph_d2_description(query_graph: &QueryGraph, file: &mut File) { + fn query_graph_d2_description(ctx: &mut SearchContext, query_graph: &QueryGraph, file: &mut File) { writeln!(file,"direction: right").unwrap(); for node in 0..query_graph.nodes.len() { if matches!(query_graph.nodes[node], QueryNode::Deleted) { continue; } - Self::query_node_d2_desc(node, &query_graph.nodes[node], &[], file); + Self::query_node_d2_desc(ctx, node, &query_graph.nodes[node], &[], file); for edge in query_graph.edges[node].successors.iter() { writeln!(file, "{node} -> {edge};\n").unwrap(); } } } - fn ranking_rule_graph_d2_description(graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { + fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -420,7 +429,7 @@ shape: class").unwrap(); continue; } let distances = &distances[node_idx]; - Self::query_node_d2_desc(node_idx, node, distances.as_slice(), file); + Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file); } for edge in graph.all_edges.iter().flatten() { let Edge { from_node, to_node, details, .. 
} = edge; @@ -449,7 +458,7 @@ shape: class").unwrap(); writeln!(file, "Shortest Paths {{").unwrap(); - Self::paths_d2_description(graph, paths, file); + Self::paths_d2_description(ctx, graph, paths, file); writeln!(file, "}}").unwrap(); // writeln!(file, "Empty Edge Couples {{").unwrap(); @@ -468,15 +477,18 @@ shape: class").unwrap(); // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(graph: &RankingRuleGraph, edge_idx: u32, file: &mut File) { + fn edge_d2_description(ctx: &mut SearchContext,graph: &RankingRuleGraph, edge_idx: u32, file: &mut File) { let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { QueryNode::Term(term) => match &term.value { QueryTerm::Phrase { phrase } => { - phrase.description() + let phrase = ctx.phrase_interner.get(*phrase); + phrase.description(&ctx.word_interner) + }, + QueryTerm::Word { derivations } => { + ctx.word_interner.get(derivations.original).to_owned() }, - QueryTerm::Word { derivations } => derivations.original.clone(), }, QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), @@ -485,8 +497,11 @@ shape: class").unwrap(); let to_node = &graph.query_graph.nodes[*to_node as usize]; let to_node_desc = match to_node { QueryNode::Term(term) => match &term.value { - QueryTerm::Phrase { phrase } => phrase.description(), - QueryTerm::Word { derivations } => derivations.original.clone(), + QueryTerm::Phrase { phrase } => { + let phrase = ctx.phrase_interner.get(*phrase); + phrase.description(&ctx.word_interner) + }, + QueryTerm::Word { derivations } => ctx.word_interner.get(derivations.original).to_owned(), }, QueryNode::Deleted => panic!(), QueryNode::Start => "START".to_owned(), @@ -496,11 +511,11 @@ shape: class").unwrap(); shape: class }}").unwrap(); } - fn paths_d2_description(graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { + fn paths_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], file: &mut File) { for (path_idx, edge_indexes) in paths.iter().enumerate() { writeln!(file, "{path_idx} {{").unwrap(); for edge_idx in edge_indexes.iter() { - Self::edge_d2_description(graph, *edge_idx, file); + Self::edge_d2_description(ctx, graph, *edge_idx, file); } for couple_edges in edge_indexes.windows(2) { let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 3e9b43f1b..0feef1f60 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,5 +1,6 @@ mod db_cache; mod graph_based_ranking_rule; +mod interner; mod logger; mod query_graph; mod query_term; @@ -26,7 +27,9 @@ use query_graph::{QueryGraph, QueryNode}; use roaring::RoaringBitmap; use self::{ + interner::Interner, logger::SearchLogger, + query_term::Phrase, resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}, }; @@ -35,14 +38,32 @@ pub enum BitmapOrAllRef<'s> { All, } +pub struct SearchContext<'search> { + pub index: &'search Index, + pub txn: &'search RoTxn<'search>, + pub db_cache: DatabaseCache<'search>, + pub word_interner: Interner, + pub phrase_interner: Interner, + pub node_docids_cache: NodeDocIdsCache, +} +impl<'search> SearchContext<'search> { + pub fn new(index: &'search Index, txn: &'search RoTxn<'search>) -> Self { + Self { + index, + txn, + db_cache: <_>::default(), + word_interner: <_>::default(), + phrase_interner: <_>::default(), + node_docids_cache: 
<_>::default(), + } + } +} + #[allow(clippy::too_many_arguments)] -pub fn resolve_maximally_reduced_query_graph<'transaction>( - index: &Index, - txn: &'transaction heed::RoTxn, - db_cache: &mut DatabaseCache<'transaction>, +pub fn resolve_maximally_reduced_query_graph<'search>( + ctx: &mut SearchContext<'search>, universe: &RoaringBitmap, query_graph: &QueryGraph, - node_docids_cache: &mut NodeDocIdsCache, matching_strategy: TermsMatchingStrategy, logger: &mut dyn SearchLogger, ) -> Result { @@ -73,16 +94,14 @@ pub fn resolve_maximally_reduced_query_graph<'transaction>( } } logger.query_for_universe(&graph); - let docids = resolve_query_graph(index, txn, db_cache, node_docids_cache, &graph, universe)?; + let docids = resolve_query_graph(ctx, &graph, universe)?; Ok(docids) } #[allow(clippy::too_many_arguments)] -pub fn execute_search<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, +pub fn execute_search<'search>( + ctx: &mut SearchContext<'search>, query: &str, filters: Option, from: usize, @@ -90,26 +109,21 @@ pub fn execute_search<'transaction>( logger: &mut dyn SearchLogger, ) -> Result> { assert!(!query.is_empty()); - let query_terms = located_query_terms_from_string(index, txn, query.tokenize(), None).unwrap(); - let graph = QueryGraph::from_query(index, txn, db_cache, query_terms)?; + let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None).unwrap(); + let graph = QueryGraph::from_query(ctx, query_terms)?; logger.initial_query(&graph); let universe = if let Some(filters) = filters { - filters.evaluate(txn, index)? + filters.evaluate(ctx.txn, ctx.index)? } else { - index.documents_ids(txn)? + ctx.index.documents_ids(ctx.txn)? }; - let mut node_docids_cache = NodeDocIdsCache::default(); - let universe = resolve_maximally_reduced_query_graph( - index, - txn, - db_cache, + ctx, &universe, &graph, - &mut node_docids_cache, TermsMatchingStrategy::Last, logger, )?; @@ -117,5 +131,5 @@ pub fn execute_search<'transaction>( logger.initial_universe(&universe); - apply_ranking_rules(index, txn, db_cache, &graph, &universe, from, length, logger) + apply_ranking_rules(ctx, &graph, &universe, from, length, logger) } diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index e86c175af..449b6536c 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,13 +1,10 @@ +use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; +use super::SearchContext; +use crate::Result; +use roaring::RoaringBitmap; use std::fmt::Debug; -use heed::RoTxn; -use roaring::RoaringBitmap; - -use super::db_cache::DatabaseCache; -use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::{Index, Result}; - -#[derive(Debug, Clone)] +#[derive(Clone)] pub enum QueryNode { Term(LocatedQueryTerm), Deleted, @@ -22,7 +19,7 @@ pub struct Edges { pub successors: RoaringBitmap, } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct QueryGraph { pub root_node: u32, pub end_node: u32, @@ -31,8 +28,8 @@ pub struct QueryGraph { } fn _assert_sizes() { - // TODO: QueryNodes are too big now, 184B is an unreasonable size - let _: [u8; 184] = [0; std::mem::size_of::()]; + // TODO: QueryNodes are too big now, 88B is a bit too big + let _: [u8; 88] = [0; std::mem::size_of::()]; let _: [u8; 48] = [0; std::mem::size_of::()]; } @@ -72,19 +69,14 @@ impl QueryGraph { impl QueryGraph { // TODO: return the list of all matching words here as well - pub fn 
from_query<'transaction>( - index: &Index, - txn: &RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, - terms: Vec, - ) -> Result { + pub fn from_query(ctx: &mut SearchContext, terms: Vec) -> Result { // TODO: maybe empty nodes should not be removed here, to compute // the score of the `words` ranking rule correctly // it is very easy to traverse the graph and remove afterwards anyway // Still, I'm keeping this here as a demo let mut empty_nodes = vec![]; - let word_set = index.words_fst(txn)?; + let word_set = ctx.index.words_fst(ctx.txn)?; let mut graph = QueryGraph::default(); let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = @@ -105,20 +97,20 @@ impl QueryGraph { if !prev1.is_empty() { if let Some((ngram2_str, ngram2_pos)) = - query_term::ngram2(&query[length - 2], &query[length - 1]) + query_term::ngram2(ctx, &query[length - 2], &query[length - 1]) { - if word_set.contains(ngram2_str.as_bytes()) { + if word_set.contains(ctx.word_interner.get(ngram2_str)) { let ngram2 = LocatedQueryTerm { value: QueryTerm::Word { derivations: WordDerivations { - original: ngram2_str.clone(), + original: ngram2_str, // TODO: could add a typo if it's an ngram? - zero_typo: vec![ngram2_str], - one_typo: vec![], - two_typos: vec![], + zero_typo: Box::new([ngram2_str]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], // TODO: ngram synonyms - split_words: None, // TODO: maybe ngram split words? + synonyms: Box::new([]), // TODO: ngram synonyms + split_words: None, // TODO: maybe ngram split words? }, }, positions: ngram2_pos, @@ -129,22 +121,25 @@ impl QueryGraph { } } if !prev2.is_empty() { - if let Some((ngram3_str, ngram3_pos)) = - query_term::ngram3(&query[length - 3], &query[length - 2], &query[length - 1]) - { - if word_set.contains(ngram3_str.as_bytes()) { + if let Some((ngram3_str, ngram3_pos)) = query_term::ngram3( + ctx, + &query[length - 3], + &query[length - 2], + &query[length - 1], + ) { + if word_set.contains(ctx.word_interner.get(ngram3_str)) { let ngram3 = LocatedQueryTerm { value: QueryTerm::Word { derivations: WordDerivations { - original: ngram3_str.clone(), + original: ngram3_str, // TODO: could add a typo if it's an ngram? - zero_typo: vec![ngram3_str], - one_typo: vec![], - two_typos: vec![], + zero_typo: Box::new([ngram3_str]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], // TODO: ngram synonyms - split_words: None, // TODO: maybe ngram split words? - // would be nice for typos like su nflower + synonyms: Box::new([]), // TODO: ngram synonyms + split_words: None, // TODO: maybe ngram split words? 
+ // would be nice for typos like su nflower }, }, positions: ngram3_pos, diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 3820b8ed0..b5e29bffc 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -16,30 +16,35 @@ use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; use crate::{CboRoaringBitmapLenCodec, Index, Result}; -#[derive(Debug, Default, Clone)] +use super::interner::{Interned, Interner}; +use super::SearchContext; + +#[derive(Default, Clone, PartialEq, Eq, Hash)] pub struct Phrase { - pub words: Vec>, + pub words: Vec>>, } impl Phrase { - pub fn description(&self) -> String { - self.words.iter().flatten().join(" ") + pub fn description(&self, interner: &Interner) -> String { + self.words.iter().flatten().map(|w| interner.get(*w)).join(" ") } } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct WordDerivations { - pub original: String, + pub original: Interned, // TODO: pub prefix_of: Vec, - pub synonyms: Vec, - pub split_words: Option<(String, String)>, - pub zero_typo: Vec, - pub one_typo: Vec, - pub two_typos: Vec, + pub synonyms: Box<[Interned]>, + pub split_words: Option>, + pub zero_typo: Box<[Interned]>, + pub one_typo: Box<[Interned]>, + pub two_typos: Box<[Interned]>, pub use_prefix_db: bool, } impl WordDerivations { - pub fn all_derivations_except_prefix_db(&self) -> impl Iterator + Clone { - self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()) + pub fn all_derivations_except_prefix_db( + &'_ self, + ) -> impl Iterator> + Clone + '_ { + self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()).copied() } fn is_empty(&self) -> bool { self.zero_typo.is_empty() @@ -50,15 +55,21 @@ impl WordDerivations { } pub fn word_derivations( - index: &Index, - txn: &RoTxn, + ctx: &mut SearchContext, word: &str, max_typo: u8, is_prefix: bool, fst: &fst::Set>, ) -> Result { + let word_interned = ctx.word_interner.insert(word.to_owned()); + let use_prefix_db = is_prefix - && index.word_prefix_docids.remap_data_type::().get(txn, word)?.is_some(); + && ctx + .index + .word_prefix_docids + .remap_data_type::() + .get(ctx.txn, word)? 
+ .is_some(); let mut zero_typo = vec![]; let mut one_typo = vec![]; @@ -70,11 +81,12 @@ pub fn word_derivations( let mut stream = fst.search(prefix).into_stream(); while let Some(word) = stream.next() { - let word = std::str::from_utf8(word)?; - zero_typo.push(word.to_string()); + let word = std::str::from_utf8(word)?.to_owned(); + let word_interned = ctx.word_interner.insert(word); + zero_typo.push(word_interned); } } else if fst.contains(word) { - zero_typo.push(word.to_string()); + zero_typo.push(word_interned); } } else if max_typo == 1 { let dfa = build_dfa(word, 1, is_prefix); @@ -83,13 +95,14 @@ pub fn word_derivations( while let Some((word, state)) = stream.next() { let word = std::str::from_utf8(word)?; + let word_interned = ctx.word_interner.insert(word.to_owned()); let d = dfa.distance(state.1); match d.to_u8() { 0 => { - zero_typo.push(word.to_string()); + zero_typo.push(word_interned); } 1 => { - one_typo.push(word.to_string()); + one_typo.push(word_interned); } _ => panic!(), } @@ -105,47 +118,56 @@ pub fn word_derivations( while let Some((found_word, state)) = stream.next() { let found_word = std::str::from_utf8(found_word)?; + let found_word_interned = ctx.word_interner.insert(found_word.to_owned()); // in the case the typo is on the first letter, we know the number of typo // is two if get_first(found_word) != get_first(word) { - two_typos.push(found_word.to_string()); + two_typos.push(found_word_interned); } else { // Else, we know that it is the second dfa that matched and compute the // correct distance let d = second_dfa.distance((state.1).0); match d.to_u8() { 0 => { - zero_typo.push(found_word.to_string()); + zero_typo.push(found_word_interned); } 1 => { - one_typo.push(found_word.to_string()); + one_typo.push(found_word_interned); } 2 => { - two_typos.push(found_word.to_string()); + two_typos.push(found_word_interned); } _ => panic!(), } } } } - let split_words = split_best_frequency(index, txn, word)?; + let split_words = split_best_frequency(ctx.index, ctx.txn, word)?.map(|(l, r)| { + ctx.phrase_interner.insert(Phrase { + words: vec![Some(ctx.word_interner.insert(l)), Some(ctx.word_interner.insert(r))], + }) + }); + + let synonyms = ctx.index.synonyms(ctx.txn)?; - let synonyms = index.synonyms(txn)?; let synonyms = synonyms .get(&vec![word.to_owned()]) .cloned() .unwrap_or_default() .into_iter() - .map(|words| Phrase { words: words.into_iter().map(Some).collect() }) + .map(|words| { + let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); + ctx.phrase_interner.insert(Phrase { words }) + }) .collect(); Ok(WordDerivations { - original: word.to_owned(), + original: ctx.word_interner.insert(word.to_owned()), synonyms, split_words, - zero_typo, - one_typo, - two_typos, + zero_typo: zero_typo.into_boxed_slice(), + one_typo: one_typo.into_boxed_slice(), + two_typos: two_typos.into_boxed_slice(), use_prefix_db, }) } @@ -176,33 +198,36 @@ fn split_best_frequency( Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) } -#[derive(Debug, Clone)] +#[derive(Clone)] pub enum QueryTerm { // TODO: should there be SplitWord, NGram2, and NGram3 variants? // NGram2 can have 1 typo and synonyms // NGram3 cannot have typos but can have synonyms // SplitWords are a phrase // Can NGrams be prefixes? 
- Phrase { phrase: Phrase }, + Phrase { phrase: Interned }, Word { derivations: WordDerivations }, } impl QueryTerm { - pub fn original_single_word(&self) -> Option<&str> { + pub fn original_single_word<'interner>( + &self, + word_interner: &'interner Interner, + ) -> Option<&'interner str> { match self { QueryTerm::Phrase { phrase: _ } => None, QueryTerm::Word { derivations } => { if derivations.is_empty() { None } else { - Some(derivations.original.as_str()) + Some(word_interner.get(derivations.original)) } } } } } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct LocatedQueryTerm { pub value: QueryTerm, pub positions: RangeInclusive, @@ -217,18 +242,17 @@ impl LocatedQueryTerm { } } -pub fn located_query_terms_from_string<'transaction>( - index: &Index, - txn: &'transaction RoTxn, +pub fn located_query_terms_from_string<'search>( + ctx: &mut SearchContext<'search>, query: NormalizedTokenIter>, words_limit: Option, ) -> Result> { - let authorize_typos = index.authorize_typos(txn)?; - let min_len_one_typo = index.min_word_len_one_typo(txn)?; - let min_len_two_typos = index.min_word_len_two_typos(txn)?; + let authorize_typos = ctx.index.authorize_typos(ctx.txn)?; + let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; + let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; - let exact_words = index.exact_words(txn)?; - let fst = index.words_fst(txn)?; + let exact_words = ctx.index.exact_words(ctx.txn)?; + let fst = ctx.index.words_fst(ctx.txn)?; let nbr_typos = |word: &str| { if !authorize_typos @@ -243,10 +267,6 @@ pub fn located_query_terms_from_string<'transaction>( } }; - let derivations = |word: &str, is_prefix: bool| { - word_derivations(index, txn, word, nbr_typos(word), is_prefix, &fst) - }; - let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); @@ -279,14 +299,17 @@ pub fn located_query_terms_from_string<'transaction>( if let TokenKind::StopWord = token.kind { phrase.push(None); } else { + let word = ctx.word_interner.insert(token.lemma().to_string()); // TODO: in a phrase, check that every word exists // otherwise return WordDerivations::Empty - phrase.push(Some(token.lemma().to_string())); + phrase.push(Some(word)); } } else if peekable.peek().is_some() { match token.kind { TokenKind::Word => { - let derivations = derivations(token.lemma(), false)?; + let word = token.lemma(); + let derivations = + word_derivations(ctx, word, nbr_typos(word), false, &fst)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, positions: position..=position, @@ -296,7 +319,8 @@ pub fn located_query_terms_from_string<'transaction>( TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} } } else { - let derivations = derivations(token.lemma(), true)?; + let word = token.lemma(); + let derivations = word_derivations(ctx, word, nbr_typos(word), true, &fst)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, positions: position..=position, @@ -323,7 +347,9 @@ pub fn located_query_terms_from_string<'transaction>( { let located_query_term = LocatedQueryTerm { value: QueryTerm::Phrase { - phrase: Phrase { words: mem::take(&mut phrase) }, + phrase: ctx + .phrase_interner + .insert(Phrase { words: mem::take(&mut phrase) }), }, positions: phrase_start..=phrase_end, }; @@ -337,7 +363,9 @@ pub fn located_query_terms_from_string<'transaction>( // If a quote is never closed, we consider all of the end of the query as a phrase. 
if !phrase.is_empty() { let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, + value: QueryTerm::Phrase { + phrase: ctx.phrase_interner.insert(Phrase { words: mem::take(&mut phrase) }), + }, positions: phrase_start..=phrase_end, }; primitive_query.push(located_query_term); @@ -347,35 +375,49 @@ pub fn located_query_terms_from_string<'transaction>( } // TODO: return a word derivations instead? -pub fn ngram2(x: &LocatedQueryTerm, y: &LocatedQueryTerm) -> Option<(String, RangeInclusive)> { +pub fn ngram2( + ctx: &mut SearchContext, + x: &LocatedQueryTerm, + y: &LocatedQueryTerm, +) -> Option<(Interned, RangeInclusive)> { if *x.positions.end() != y.positions.start() - 1 { return None; } - match (&x.value.original_single_word(), &y.value.original_single_word()) { + match ( + &x.value.original_single_word(&ctx.word_interner), + &y.value.original_single_word(&ctx.word_interner), + ) { (Some(w1), Some(w2)) => { - let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); + let term = ( + ctx.word_interner.insert(format!("{w1}{w2}")), + *x.positions.start()..=*y.positions.end(), + ); Some(term) } _ => None, } } pub fn ngram3( + ctx: &mut SearchContext, x: &LocatedQueryTerm, y: &LocatedQueryTerm, z: &LocatedQueryTerm, -) -> Option<(String, RangeInclusive)> { +) -> Option<(Interned, RangeInclusive)> { if *x.positions.end() != y.positions.start() - 1 || *y.positions.end() != z.positions.start() - 1 { return None; } match ( - &x.value.original_single_word(), - &y.value.original_single_word(), - &z.value.original_single_word(), + &x.value.original_single_word(&ctx.word_interner), + &y.value.original_single_word(&ctx.word_interner), + &z.value.original_single_word(&ctx.word_interner), ) { (Some(w1), Some(w2), Some(w3)) => { - let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); + let term = ( + ctx.word_interner.insert(format!("{w1}{w2}{w3}")), + *x.positions.start()..=*z.positions.end(), + ); Some(term) } _ => None, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index a0fdd79c6..03a7f6c9d 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,18 +1,10 @@ -use heed::RoTxn; +use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::new::{QueryGraph, SearchContext}; +use crate::Result; use roaring::RoaringBitmap; -use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; -use crate::new::QueryGraph; -use crate::{Index, Result}; - impl RankingRuleGraph { - pub fn build<'db_cache, 'transaction: 'db_cache>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, - query_graph: QueryGraph, - ) -> Result { + pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { let mut ranking_rule_graph = Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] }; @@ -22,12 +14,11 @@ impl RankingRuleGraph { let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap(); let new_successors = ranking_rule_graph.successors.last_mut().unwrap(); - let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue }; + let Some(from_node_data) = G::build_visit_from_node(ctx, node)? 
else { continue }; for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() { let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize]; - let mut edges = - G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)?; + let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?; if edges.is_empty() { continue; } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 3d48fd69c..21c186f3c 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -1,13 +1,10 @@ use std::marker::PhantomData; -use fxhash::FxHashMap; -use heed::RoTxn; -use roaring::RoaringBitmap; - use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; -use crate::new::BitmapOrAllRef; -use crate::{Index, Result}; +use crate::new::{BitmapOrAllRef, SearchContext}; +use crate::Result; +use fxhash::FxHashMap; +use roaring::RoaringBitmap; // TODO: the cache should have a G::EdgeDetails as key // but then it means that we should have a quick way of @@ -25,11 +22,9 @@ impl Default for EdgeDocidsCache { } } impl EdgeDocidsCache { - pub fn get_edge_docids<'s, 'transaction>( + pub fn get_edge_docids<'s, 'search>( &'s mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, edge_index: u32, graph: &RankingRuleGraph, // TODO: maybe universe doesn't belong here @@ -46,7 +41,7 @@ impl EdgeDocidsCache { return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here - let docids = universe & G::compute_docids(index, txn, db_cache, details)?; + let docids = universe & G::compute_docids(ctx, details)?; let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index e65d5f70b..446c4e248 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -7,20 +7,15 @@ mod proximity; mod resolve_paths; mod typo; +use super::logger::SearchLogger; +use super::{QueryGraph, QueryNode, SearchContext}; +use crate::Result; pub use edge_docids_cache::EdgeDocidsCache; pub use empty_paths_cache::EmptyPathsCache; pub use proximity::ProximityGraph; -pub use typo::TypoGraph; - -use std::ops::ControlFlow; - -use heed::RoTxn; use roaring::RoaringBitmap; - -use super::db_cache::DatabaseCache; -use super::logger::SearchLogger; -use super::{QueryGraph, QueryNode}; -use crate::{Index, Result}; +use std::ops::ControlFlow; +pub use typo::TypoGraph; #[derive(Debug, Clone)] pub enum EdgeDetails { @@ -42,6 +37,48 @@ pub struct EdgePointer<'graph, E> { pub edge: &'graph Edge, } +// pub struct SubWordDerivations { +// words: FxHashSet>, +// synonyms: FxHashSet>, // NO! they're phrases, not strings +// split_words: bool, +// use_prefix_db: bool, +// } + +// pub struct EdgeWordDerivations { +// // TODO: not Option, instead: Any | All | Subset(SubWordDerivations) +// from_words: Option, // ??? +// to_words: Option, // + use prefix db? 
+// } + +// fn aggregate_edge_word_derivations( +// graph: (), +// edges: Vec, +// ) -> BTreeMap { +// todo!() +// } + +// fn reduce_word_term_to_sub_word_derivations( +// term: &mut WordDerivations, +// derivations: &SubWordDerivations, +// ) { +// let mut new_one_typo = vec![]; +// for w in term.one_typo { +// if derivations.words.contains(w) { +// new_one_typo.push(w); +// } +// } +// if term.use_prefix_db && !derivations.use_prefix_db { +// term.use_prefix_db = false; +// } +// // etc. +// } + +// fn word_derivations_used_by_edge( +// edge: G::EdgeDetails, +// ) -> SubWordDerivations { +// todo!() +// } + pub trait RankingRuleGraphTrait: Sized { /// The details of an edge connecting two query nodes. These details /// should be sufficient to compute the edge's cost and associated document ids @@ -55,10 +92,8 @@ pub trait RankingRuleGraphTrait: Sized { fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String; /// Compute the document ids associated with the given edge. - fn compute_docids<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn compute_docids<'search>( + ctx: &mut SearchContext<'search>, edge_details: &Self::EdgeDetails, ) -> Result; @@ -66,19 +101,15 @@ pub trait RankingRuleGraphTrait: Sized { /// /// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node), /// which builds the actual edges. - fn build_visit_from_node<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_from_node<'search>( + ctx: &mut SearchContext<'search>, from_node: &QueryNode, ) -> Result>; /// Return the cost and details of the edges going from the previously visited node /// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`. 
- fn build_visit_to_node<'from_data, 'transaction: 'from_data>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_to_node<'from_data, 'search: 'from_data>( + ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>>; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 06c860d7e..4603c7ea0 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,30 +1,30 @@ -use std::collections::BTreeMap; - -use heed::RoTxn; -use itertools::Itertools; - use super::ProximityEdge; -use crate::new::db_cache::DatabaseCache; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::proximity::WordPair; use crate::new::ranking_rule_graph::EdgeDetails; -use crate::new::QueryNode; -use crate::{Index, Result}; +use crate::new::{QueryNode, SearchContext}; +use crate::Result; +use itertools::Itertools; +use std::collections::BTreeMap; -pub fn visit_from_node(from_node: &QueryNode) -> Result> { +pub fn visit_from_node( + ctx: &mut SearchContext, + from_node: &QueryNode, +) -> Result> { Ok(Some(match from_node { QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 { QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), QueryTerm::Phrase { phrase: phrase1 } => { - if let Some(original) = phrase1.words.last().unwrap().as_ref() { + let phrase1 = ctx.phrase_interner.get(*phrase1); + if let Some(original) = *phrase1.words.last().unwrap() { ( WordDerivations { - original: original.clone(), - zero_typo: vec![original.to_owned()], - one_typo: vec![], - two_typos: vec![], + original, + zero_typo: Box::new([original]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], + synonyms: Box::new([]), split_words: None, }, *pos1.end(), @@ -37,12 +37,12 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result ( WordDerivations { - original: String::new(), - zero_typo: vec![], - one_typo: vec![], - two_typos: vec![], + original: ctx.word_interner.insert(String::new()), + zero_typo: Box::new([]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], + synonyms: Box::new([]), split_words: None, }, -100, @@ -51,10 +51,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, +pub fn visit_to_node<'search, 'from_data>( + ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data (WordDerivations, i8), ) -> Result)>> { @@ -69,15 +67,16 @@ pub fn visit_to_node<'transaction, 'from_data>( let (derivations2, pos2, ngram_len2) = match value2 { QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()), QueryTerm::Phrase { phrase: phrase2 } => { - if let Some(original) = phrase2.words.first().unwrap().as_ref() { + let phrase2 = ctx.phrase_interner.get(*phrase2); + if let Some(original) = *phrase2.words.first().unwrap() { ( WordDerivations { - original: original.clone(), - zero_typo: vec![original.to_owned()], - one_typo: vec![], - two_typos: vec![], + original, + zero_typo: Box::new([original]), + one_typo: Box::new([]), + two_typos: Box::new([]), use_prefix_db: false, - synonyms: vec![], + synonyms: Box::new([]), split_words: 
None, }, *pos2.start(), @@ -106,19 +105,16 @@ pub fn visit_to_node<'transaction, 'from_data>( let derivations1 = derivations1.all_derivations_except_prefix_db(); // TODO: eventually, we want to get rid of the uses from `orginal` - let original_word_2 = derivations2.original.clone(); let mut cost_proximity_word_pairs = BTreeMap::>>::new(); if updb2 { for word1 in derivations1.clone() { for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; - if db_cache + if ctx .get_word_prefix_pair_proximity_docids( - index, - txn, word1, - original_word_2.as_str(), + derivations2.original, proximity as u8, )? .is_some() @@ -129,16 +125,14 @@ pub fn visit_to_node<'transaction, 'from_data>( .entry(proximity as u8) .or_default() .push(WordPair::WordPrefix { - left: word1.to_owned(), - right_prefix: original_word_2.to_owned(), + left: word1, + right_prefix: derivations2.original, }); } - if db_cache + if ctx .get_prefix_word_pair_proximity_docids( - index, - txn, - original_word_2.as_str(), - word1.as_str(), + derivations2.original, + word1, proximity as u8 - 1, )? .is_some() @@ -149,8 +143,8 @@ pub fn visit_to_node<'transaction, 'from_data>( .entry(proximity as u8) .or_default() .push(WordPair::WordPrefixSwapped { - left_prefix: original_word_2.to_owned(), - right: word1.to_owned(), + left_prefix: derivations2.original, + right: word1, }); } } @@ -164,28 +158,23 @@ pub fn visit_to_node<'transaction, 'from_data>( for (word1, word2) in product_derivations { for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; - if db_cache - .get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)? - .is_some() - { + if ctx.get_word_pair_proximity_docids(word1, word2, proximity as u8)?.is_some() { cost_proximity_word_pairs .entry(cost) .or_default() .entry(proximity as u8) .or_default() - .push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() }); + .push(WordPair::Words { left: word1, right: word2 }); } if proximity > 1 - && db_cache - .get_word_pair_proximity_docids(index, txn, word2, word1, proximity as u8 - 1)? 
- .is_some() + && ctx.get_word_pair_proximity_docids(word2, word1, proximity as u8 - 1)?.is_some() { cost_proximity_word_pairs .entry(cost) .or_default() .entry(proximity as u8 - 1) .or_default() - .push(WordPair::Words { left: word2.to_owned(), right: word1.to_owned() }); + .push(WordPair::Words { left: word2, right: word1 }); } } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 34f7deea1..df289fb2c 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,10 @@ -use heed::RoTxn; +use super::{ProximityEdge, WordPair}; +use crate::new::SearchContext; +use crate::{CboRoaringBitmapCodec, Result}; use roaring::{MultiOps, RoaringBitmap}; -use super::{ProximityEdge, WordPair}; -use crate::new::db_cache::DatabaseCache; -use crate::{CboRoaringBitmapCodec, Result}; - -pub fn compute_docids<'transaction>( - index: &crate::Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, +pub fn compute_docids<'search>( + ctx: &mut SearchContext<'search>, edge: &ProximityEdge, ) -> Result { let ProximityEdge { pairs, proximity } = edge; @@ -16,12 +12,14 @@ pub fn compute_docids<'transaction>( for pair in pairs.iter() { let bytes = match pair { WordPair::Words { left, right } => { - db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity) + ctx.get_word_pair_proximity_docids(*left, *right, *proximity) + } + WordPair::WordPrefix { left, right_prefix } => { + ctx.get_word_prefix_pair_proximity_docids(*left, *right_prefix, *proximity) + } + WordPair::WordPrefixSwapped { left_prefix, right } => { + ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity) } - WordPair::WordPrefix { left, right_prefix } => db_cache - .get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity), - WordPair::WordPrefixSwapped { left_prefix, right } => db_cache - .get_prefix_word_pair_proximity_docids(index, txn, left_prefix, right, *proximity), }?; let bitmap = bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index c0dbbefa9..ec1a7b5fa 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,25 +1,22 @@ pub mod build; pub mod compute_docids; -use heed::RoTxn; -use roaring::RoaringBitmap; - use super::empty_paths_cache::EmptyPathsCache; - use super::{EdgeDetails, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; +use crate::new::interner::Interned; use crate::new::logger::SearchLogger; use crate::new::query_term::WordDerivations; -use crate::new::{QueryGraph, QueryNode}; -use crate::{Index, Result}; +use crate::new::{QueryGraph, QueryNode, SearchContext}; +use crate::Result; +use roaring::RoaringBitmap; // TODO: intern the strings, refer to them by their pointer? 
-#[derive(Debug, Clone)] +#[derive(Clone)] pub enum WordPair { - Words { left: String, right: String }, - WordPrefix { left: String, right_prefix: String }, - WordPrefixSwapped { left_prefix: String, right: String }, + Words { left: Interned, right: Interned }, + WordPrefix { left: Interned, right_prefix: Interned }, + WordPrefixSwapped { left_prefix: Interned, right: Interned }, } #[derive(Clone)] @@ -40,32 +37,26 @@ impl RankingRuleGraphTrait for ProximityGraph { format!(", prox {proximity}, {} pairs", pairs.len()) } - fn compute_docids<'db_cache, 'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn compute_docids<'search>( + ctx: &mut SearchContext<'search>, edge: &Self::EdgeDetails, ) -> Result { - compute_docids::compute_docids(index, txn, db_cache, edge) + compute_docids::compute_docids(ctx, edge) } - fn build_visit_from_node<'transaction>( - _index: &Index, - _txn: &'transaction RoTxn, - _db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_from_node<'search>( + ctx: &mut SearchContext<'search>, from_node: &QueryNode, ) -> Result> { - build::visit_from_node(from_node) + build::visit_from_node(ctx, from_node) } - fn build_visit_to_node<'from_data, 'transaction: 'from_data>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + fn build_visit_to_node<'from_data, 'search: 'from_data>( + ctx: &mut SearchContext<'search>, to_node: &QueryNode, from_node_data: &'from_data Self::BuildVisitedFromNode, ) -> Result)>> { - build::visit_to_node(index, txn, db_cache, to_node, from_node_data) + build::visit_to_node(ctx, to_node, from_node_data) } fn log_state( diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index 94a51756e..b3e03d555 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -1,23 +1,18 @@ #![allow(clippy::too_many_arguments)] -use heed::RoTxn; -use roaring::{MultiOps, RoaringBitmap}; - use super::edge_docids_cache::EdgeDocidsCache; use super::empty_paths_cache::EmptyPathsCache; - use super::{RankingRuleGraph, RankingRuleGraphTrait}; -use crate::new::db_cache::DatabaseCache; - -use crate::new::BitmapOrAllRef; -use crate::{Index, Result}; +use crate::new::{BitmapOrAllRef, SearchContext}; +use crate::Result; +use roaring::{MultiOps, RoaringBitmap}; impl RankingRuleGraph { - pub fn resolve_paths<'transaction>( + // TODO: reduce the universe after computing each path + // TODO: deserialize roaring bitmap within a universe + pub fn resolve_paths<'search>( &mut self, - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, + ctx: &mut SearchContext<'search>, edge_docids_cache: &mut EdgeDocidsCache, empty_paths_cache: &mut EmptyPathsCache, universe: &RoaringBitmap, @@ -52,8 +47,8 @@ impl RankingRuleGraph { let mut cached_edge_docids = vec![]; 'edge_loop: for edge_index in edge_indexes { visited_edges.push(edge_index); - let edge_docids = edge_docids_cache - .get_edge_docids(index, txn, db_cache, edge_index, self, universe)?; + let edge_docids = + edge_docids_cache.get_edge_docids(ctx, edge_index, self, universe)?; match edge_docids { BitmapOrAllRef::Bitmap(edge_docids) => { cached_edge_docids.push((edge_index, edge_docids.clone())); diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index c9ca7c229..d3771221f 100644 --- 
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -1,19 +1,17 @@
-use heed::{BytesDecode, RoTxn};
-use roaring::RoaringBitmap;
-
 use super::empty_paths_cache::EmptyPathsCache;
-
 use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
-use crate::new::db_cache::DatabaseCache;
+use crate::new::interner::Interned;
 use crate::new::logger::SearchLogger;
 use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
 use crate::new::resolve_query_graph::resolve_phrase;
-use crate::new::{QueryGraph, QueryNode};
-use crate::{Index, Result, RoaringBitmapCodec};
+use crate::new::{QueryGraph, QueryNode, SearchContext};
+use crate::{Result, RoaringBitmapCodec};
+use heed::BytesDecode;
+use roaring::RoaringBitmap;
 
 #[derive(Clone)]
 pub enum TypoEdge {
-    Phrase { phrase: Phrase },
+    Phrase { phrase: Interned<Phrase> },
     Word { derivations: WordDerivations, nbr_typos: u8 },
 }
 
@@ -30,14 +28,12 @@ impl RankingRuleGraphTrait for TypoGraph {
         }
     }
 
-    fn compute_docids<'db_cache, 'transaction>(
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+    fn compute_docids<'db_cache, 'search>(
+        ctx: &mut SearchContext<'search>,
         edge: &Self::EdgeDetails,
     ) -> Result<RoaringBitmap> {
         match edge {
-            TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase),
+            TypoEdge::Phrase { phrase } => resolve_phrase(ctx, *phrase),
             TypoEdge::Word { derivations, nbr_typos } => {
                 let words = match nbr_typos {
                     0 => &derivations.zero_typo,
@@ -46,16 +42,14 @@ impl RankingRuleGraphTrait for TypoGraph {
                     _ => panic!(),
                 };
                 let mut docids = RoaringBitmap::new();
-                for word in words.iter() {
-                    let Some(bytes) = db_cache.get_word_docids(index, txn, word)? else { continue };
+                for word in words.iter().copied() {
+                    let Some(bytes) = ctx.get_word_docids(word)? else { continue };
                     let bitmap =
                         RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
                     docids |= bitmap;
                 }
                 if *nbr_typos == 0 {
-                    if let Some(bytes) =
-                        db_cache.get_prefix_docids(index, txn, &derivations.original)?
-                    {
+                    if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? {
                         let bitmap =
                             RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
                         docids |= bitmap;
@@ -66,26 +60,22 @@ impl RankingRuleGraphTrait for TypoGraph {
         }
     }
 
-    fn build_visit_from_node<'transaction>(
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+    fn build_visit_from_node<'search>(
+        _ctx: &mut SearchContext<'search>,
         _from_node: &QueryNode,
     ) -> Result<Option<Self::BuildVisitedFromNode>> {
         Ok(Some(()))
     }
 
-    fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+    fn build_visit_to_node<'from_data, 'search: 'from_data>(
+        _ctx: &mut SearchContext<'search>,
         to_node: &QueryNode,
         _from_node_data: &'from_data Self::BuildVisitedFromNode,
     ) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
         match to_node {
             QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
-                QueryTerm::Phrase { phrase } => {
-                    Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))])
+                &QueryTerm::Phrase { phrase } => {
+                    Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase }))])
                 }
                 QueryTerm::Word { derivations } => {
                     let mut edges = vec![];
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index cfa43c006..bfb9b5492 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -1,33 +1,28 @@
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
-
 use super::QueryGraph;
+use super::SearchContext;
 use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
 use crate::new::ranking_rule_graph::ProximityGraph;
 use crate::new::ranking_rule_graph::TypoGraph;
 use crate::new::words::Words;
+use roaring::RoaringBitmap;
 // use crate::search::new::sort::Sort;
-use crate::{Index, Result, TermsMatchingStrategy};
+use crate::{Result, TermsMatchingStrategy};
 
-pub trait RankingRuleOutputIter<'transaction, Query> {
+pub trait RankingRuleOutputIter<'search, Query> {
     fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
 }
 
-pub struct RankingRuleOutputIterWrapper<'transaction, Query> {
-    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
+pub struct RankingRuleOutputIterWrapper<'search, Query> {
+    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'search>,
 }
-impl<'transaction, Query> RankingRuleOutputIterWrapper<'transaction, Query> {
-    pub fn new(
-        iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
-    ) -> Self {
+impl<'search, Query> RankingRuleOutputIterWrapper<'search, Query> {
+    pub fn new(iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'search>) -> Self {
         Self { iter }
     }
 }
-impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query>
-    for RankingRuleOutputIterWrapper<'transaction, Query>
+impl<'search, Query> RankingRuleOutputIter<'search, Query>
+    for RankingRuleOutputIterWrapper<'search, Query>
 {
     fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
         match self.iter.next() {
@@ -44,7 +39,7 @@ pub struct PlaceholderQuery;
 impl RankingRuleQueryTrait for PlaceholderQuery {}
 impl RankingRuleQueryTrait for QueryGraph {}
 
-pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
+pub trait RankingRule<'search, Query: RankingRuleQueryTrait> {
     fn id(&self) -> String;
 
     /// Prepare the ranking rule such that it can start iterating over its
@@ -53,9 +48,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
     /// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket).
     fn start_iteration(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<Query>,
         universe: &RoaringBitmap,
         query: &Query,
@@ -70,9 +63,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
     /// - the universe given to [`start_iteration`](RankingRule::start_iteration)
     fn next_bucket(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<Query>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<Query>>>;
@@ -81,9 +72,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
     /// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration).
    fn end_iteration(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<Query>,
     );
 }
@@ -98,11 +87,9 @@ pub struct RankingRuleOutput<Q> {
 
 // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily
 #[allow(clippy::too_many_arguments)]
-pub fn apply_ranking_rules<'transaction>(
-    index: &Index,
-    txn: &'transaction heed::RoTxn,
+pub fn apply_ranking_rules<'search>(
+    ctx: &mut SearchContext<'search>,
     // TODO: ranking rules parameter
-    db_cache: &mut DatabaseCache<'transaction>,
     query_graph: &QueryGraph,
     universe: &RoaringBitmap,
     from: usize,
@@ -115,7 +102,7 @@ pub fn apply_ranking_rules<'transaction>(
     let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
     let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());
     // TODO: ranking rules given as argument
-    let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
+    let mut ranking_rules: Vec<&mut dyn RankingRule<'search, QueryGraph>> =
         vec![words, typo, proximity /*sort*/];
 
     logger.ranking_rules(&ranking_rules);
@@ -126,7 +113,7 @@ pub fn apply_ranking_rules<'transaction>(
     let ranking_rules_len = ranking_rules.len();
     logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
-    ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?;
+    ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;
 
     let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
     candidates[0] = universe.clone();
@@ -142,7 +129,7 @@ pub fn apply_ranking_rules<'transaction>(
                 &candidates[cur_ranking_rule_index],
             );
             candidates[cur_ranking_rule_index].clear();
-            ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger);
+            ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
             if cur_ranking_rule_index == 0 {
                 break;
             } else {
@@ -206,7 +193,7 @@ pub fn apply_ranking_rules<'transaction>(
             continue;
         }
 
-        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else {
+        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else {
             // TODO: add remaining candidates automatically here?
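Every `(index, txn, db_cache)` parameter triple in the old signatures above collapses into a single `&mut SearchContext<'search>`. The real definition lives in `mod.rs`, which this excerpt does not show; the sketch below assumes only the fields that the diff actually touches (`ctx.index`, `ctx.txn`, `db_cache`, `word_interner`, `phrase_interner`, `node_docids_cache`), with `Interner` as sketched earlier:

    use heed::RoTxn;

    pub struct SearchContext<'search> {
        pub index: &'search Index,
        pub txn: &'search RoTxn<'search>,
        pub db_cache: DatabaseCache<'search>,
        pub word_interner: Interner<String>,
        pub phrase_interner: Interner<Phrase>,
        pub node_docids_cache: NodeDocIdsCache,
    }

    impl<'search> SearchContext<'search> {
        pub fn new(index: &'search Index, txn: &'search RoTxn<'search>) -> Self {
            Self {
                index,
                txn,
                db_cache: DatabaseCache::default(),
                word_interner: Interner::default(),
                phrase_interner: Interner::default(),
                node_docids_cache: NodeDocIdsCache::default(),
            }
        }
    }

Bundling all per-search state behind one `&mut` is also why lookups like `get_word_docids` and `get_node_docids` move onto `SearchContext` itself in this patch: a method taking `&mut self` can consult the interners and fill the caches without the caller juggling three separate borrows.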
back!(); continue; @@ -239,9 +226,7 @@ pub fn apply_ranking_rules<'transaction>( &candidates[cur_ranking_rule_index], ); ranking_rules[cur_ranking_rule_index].start_iteration( - index, - txn, - db_cache, + ctx, logger, &next_bucket.candidates, &next_bucket.query, @@ -255,9 +240,7 @@ pub fn apply_ranking_rules<'transaction>( mod tests { // use crate::allocator::ALLOC; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::new::db_cache::DatabaseCache; - use crate::new::execute_search; + use crate::new::{execute_search, SearchContext}; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; @@ -269,55 +252,6 @@ mod tests { use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; - #[test] - fn execute_new_search() { - let index = TempIndex::new(); - index - .add_documents(documents!([ - { - "id": 7, - "text": "the super quick super brown fox jumps over", - }, - { - "id": 8, - "text": "the super quick brown fox jumps over", - }, - { - "id": 9, - "text": "the quick super brown fox jumps over", - }, - { - "id": 10, - "text": "the quick brown fox jumps over", - }, - { - "id": 11, - "text": "the quick brown fox jumps over the lazy dog", - }, - { - "id": 12, - "text": "the quick brown cat jumps over the lazy dog", - }, - ])) - .unwrap(); - let txn = index.read_txn().unwrap(); - let mut db_cache = DatabaseCache::default(); - - let results = execute_search( - &index, - &txn, - &mut db_cache, - "releases from poison by the government", - None, - 0, - 50, - &mut DefaultSearchLogger, - ) - .unwrap(); - - println!("{results:?}") - } - #[test] fn search_wiki_new() { let mut options = EnvOpenOptions::new(); @@ -331,24 +265,20 @@ mod tests { // loop { let start = Instant::now(); - let mut db_cache = DatabaseCache::default(); - - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( - &index, - &txn, - &mut db_cache, + &mut SearchContext::new(&index, &txn), "releases from poison by the government", None, 0, 20, - // &mut DefaultSearchLogger, - &mut logger, + &mut DefaultSearchLogger, + // &mut logger, ) .unwrap(); - logger.write_d2_description(); + // logger.write_d2_description(); let elapsed = start.elapsed(); @@ -425,19 +355,15 @@ mod tests { let index = Index::new(options, "data_movies").unwrap(); let txn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&txn).unwrap().unwrap(); - let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); + // let primary_key = index.primary_key(&txn).unwrap().unwrap(); + // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); // loop { let start = Instant::now(); - let mut db_cache = DatabaseCache::default(); - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); - + let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( - &index, - &txn, - &mut db_cache, + &mut ctx, "releases from poison by the government", None, 0, @@ -447,24 +373,24 @@ mod tests { ) .unwrap(); - logger.write_d2_description(); + logger.write_d2_description(&mut ctx); let elapsed = start.elapsed(); - let ids = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|x| { - let obkv = &x.1; - let id = obkv.get(primary_key).unwrap(); - let id: 
serde_json::Value = serde_json::from_slice(id).unwrap();
-                id.as_str().unwrap().to_owned()
-            })
-            .collect::<Vec<_>>();
+        // let ids = index
+        //     .documents(&txn, results.iter().copied())
+        //     .unwrap()
+        //     .into_iter()
+        //     .map(|x| {
+        //         let obkv = &x.1;
+        //         let id = obkv.get(primary_key).unwrap();
+        //         let id: serde_json::Value = serde_json::from_slice(id).unwrap();
+        //         id.as_str().unwrap().to_owned()
+        //     })
+        //     .collect::<Vec<_>>();
 
         println!("{}us: {results:?}", elapsed.as_micros());
-        println!("external ids: {ids:?}");
+        // println!("external ids: {ids:?}");
         // }
     }
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index 93ebcf989..de5cf02ab 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -1,34 +1,28 @@
-use std::collections::VecDeque;
-
-use fxhash::FxHashMap;
-use heed::{BytesDecode, RoTxn};
-use roaring::{MultiOps, RoaringBitmap};
-
-use super::db_cache::DatabaseCache;
+use super::interner::Interned;
 use super::query_term::{Phrase, QueryTerm, WordDerivations};
-use super::{QueryGraph, QueryNode};
-
-use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
+use super::{QueryGraph, QueryNode, SearchContext};
+use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
+use fxhash::FxHashMap;
+use heed::BytesDecode;
+use roaring::{MultiOps, RoaringBitmap};
+use std::collections::VecDeque;
 
 // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
 #[derive(Default)]
 pub struct NodeDocIdsCache {
     pub cache: FxHashMap<u32, RoaringBitmap>,
 }
-impl NodeDocIdsCache {
-    fn get_docids<'cache, 'transaction>(
+impl<'search> SearchContext<'search> {
+    fn get_node_docids<'cache>(
         &'cache mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
         term: &QueryTerm,
         node_idx: u32,
     ) -> Result<&'cache RoaringBitmap> {
-        if self.cache.contains_key(&node_idx) {
-            return Ok(&self.cache[&node_idx]);
+        if self.node_docids_cache.cache.contains_key(&node_idx) {
+            return Ok(&self.node_docids_cache.cache[&node_idx]);
         };
         let docids = match term {
-            QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?,
+            QueryTerm::Phrase { phrase } => resolve_phrase(self, *phrase)?,
             QueryTerm::Word {
                 derivations:
                     WordDerivations {
@@ -42,15 +36,14 @@ impl NodeDocIdsCache {
                     },
             } => {
                 let mut or_docids = vec![];
-                for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {
-                    if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {
+                for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()).copied()
+                {
+                    if let Some(word_docids) = self.get_word_docids(word)? {
                         or_docids.push(word_docids);
                     }
                 }
                 if *use_prefix_db {
-                    if let Some(prefix_docids) =
-                        db_cache.get_prefix_docids(index, txn, original.as_str())?
-                    {
+                    if let Some(prefix_docids) = self.get_prefix_docids(*original)? {
                         or_docids.push(prefix_docids);
                     }
                 }
@@ -58,32 +51,25 @@ impl NodeDocIdsCache {
                     .into_iter()
                     .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap())
                     .collect::<Vec<_>>();
-                for synonym in synonyms {
+                for synonym in synonyms.iter().copied() {
                     // TODO: cache resolve_phrase?
-                    docids.push(resolve_phrase(index, txn, db_cache, synonym)?);
+                    docids.push(resolve_phrase(self, synonym)?);
                 }
-                if let Some((left, right)) = split_words {
-                    if let Some(split_word_docids) =
-                        db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)?
-                    {
-                        docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?);
-                    }
+                if let Some(split_words) = split_words {
+                    docids.push(resolve_phrase(self, *split_words)?);
                 }
                 MultiOps::union(docids)
             }
         };
-        let _ = self.cache.insert(node_idx, docids);
-        let docids = &self.cache[&node_idx];
+        let _ = self.node_docids_cache.cache.insert(node_idx, docids);
+        let docids = &self.node_docids_cache.cache[&node_idx];
         Ok(docids)
     }
 }
 
-pub fn resolve_query_graph<'transaction>(
-    index: &Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
-    node_docids_cache: &mut NodeDocIdsCache,
+pub fn resolve_query_graph<'search>(
+    ctx: &mut SearchContext<'search>,
     q: &QueryGraph,
     universe: &RoaringBitmap,
 ) -> Result<RoaringBitmap> {
@@ -111,8 +97,7 @@ pub fn resolve_query_graph<'transaction>(
         let node_docids = match n {
             QueryNode::Term(located_term) => {
                 let term = &located_term.value;
-                let derivations_docids =
-                    node_docids_cache.get_docids(index, txn, db_cache, term, node)?;
+                let derivations_docids = ctx.get_node_docids(term, node)?;
                 predecessors_docids & derivations_docids
             }
             QueryNode::Deleted => {
@@ -143,13 +128,8 @@ pub fn resolve_query_graph<'transaction>(
     panic!()
 }
 
-pub fn resolve_phrase<'transaction>(
-    index: &Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
-    phrase: &Phrase,
-) -> Result<RoaringBitmap> {
-    let Phrase { words } = phrase;
+pub fn resolve_phrase(ctx: &mut SearchContext, phrase: Interned<Phrase>) -> Result<RoaringBitmap> {
+    let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
     let mut candidates = RoaringBitmap::new();
     let mut first_iter = true;
     let winsize = words.len().min(3);
@@ -161,19 +141,19 @@ pub fn resolve_phrase<'transaction>(
     for win in words.windows(winsize) {
         // Get all the documents with the matching distance for each word pairs.
         let mut bitmaps = Vec::with_capacity(winsize.pow(2));
-        for (offset, s1) in win
+        for (offset, &s1) in win
             .iter()
             .enumerate()
             .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
         {
-            for (dist, s2) in win
+            for (dist, &s2) in win
                 .iter()
                 .skip(offset + 1)
                 .enumerate()
                 .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
             {
                 if dist == 0 {
-                    match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? {
+                    match ctx.get_word_pair_proximity_docids(s1, s2, 1)? {
                         Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
                         // If there are no documents for this pair, there will be no
                         // results for the phrase query.
@@ -182,13 +162,9 @@ pub fn resolve_phrase<'transaction>(
                 } else {
                     let mut bitmap = RoaringBitmap::new();
                     for dist in 0..=dist {
-                        if let Some(m) = db_cache.get_word_pair_proximity_docids(
-                            index,
-                            txn,
-                            s1,
-                            s2,
-                            dist as u8 + 1,
-                        )? {
+                        if let Some(m) =
+                            ctx.get_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
+                        {
                             bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
                         }
                     }
diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs
index 9ef01bd95..f0967843b 100644
--- a/milli/src/search/new/sort.rs
+++ b/milli/src/search/new/sort.rs
@@ -1,11 +1,7 @@
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
 use super::{
     RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper,
-    RankingRuleQueryTrait,
+    RankingRuleQueryTrait, SearchContext,
 };
 use crate::{
     // facet::FacetType,
     heed_codec::facet::FacetGroupKeyCodec,
@@ -15,18 +11,19 @@ use crate::{
     Index, Result,
 };
+use roaring::RoaringBitmap;
 
-pub struct Sort<'transaction, Query> {
+pub struct Sort<'search, Query> {
     field_name: String,
     field_id: Option<FieldId>,
     is_ascending: bool,
     original_query: Option<Query>,
-    iter: Option<RankingRuleOutputIterWrapper<'transaction, Query>>,
+    iter: Option<RankingRuleOutputIterWrapper<'search, Query>>,
 }
-impl<'transaction, Query> Sort<'transaction, Query> {
-    pub fn new(
+impl<'search, Query> Sort<'search, Query> {
+    pub fn _new(
         index: &Index,
-        rtxn: &'transaction heed::RoTxn,
+        rtxn: &'search heed::RoTxn,
         field_name: String,
         is_ascending: bool,
     ) -> Result<Self> {
@@ -37,18 +34,14 @@ impl<'transaction, Query> Sort<'transaction, Query> {
     }
 }
 
-impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query>
-    for Sort<'transaction, Query>
-{
+impl<'search, Query: RankingRuleQueryTrait> RankingRule<'search, Query> for Sort<'search, Query> {
     fn id(&self) -> String {
         let Self { field_name, is_ascending, .. } = self;
         format!("{field_name}:{}", if *is_ascending { "asc" } else { "desc " })
     }
     fn start_iteration(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<Query>,
         parent_candidates: &RoaringBitmap,
         parent_query_graph: &Query,
@@ -59,8 +52,8 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
             if self.is_ascending { ascending_facet_sort } else { descending_facet_sort };
 
         let number_iter = make_iter(
-            txn,
-            index
+            ctx.txn,
+            ctx.index
                 .facet_id_f64_docids
                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
             field_id,
@@ -68,8 +61,8 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
         )?;
 
         let string_iter = make_iter(
-            txn,
-            index
+            ctx.txn,
+            ctx.index
                 .facet_id_string_docids
                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
             field_id,
@@ -91,9 +84,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
 
     fn next_bucket(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<Query>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<Query>>> {
@@ -110,9 +101,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
 
     fn end_iteration(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<Query>,
     ) {
         self.original_query = None;
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
index 10c0800ba..9ad8b33ba 100644
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@@ -1,13 +1,9 @@
-use std::collections::BTreeSet;
-
-use heed::RoTxn;
-use roaring::RoaringBitmap;
-
-use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
-use super::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache};
-use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput};
-use crate::{Index, Result, TermsMatchingStrategy};
+use super::resolve_query_graph::resolve_query_graph;
+use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput, SearchContext};
+use crate::{Result, TermsMatchingStrategy};
+use roaring::RoaringBitmap;
+use std::collections::BTreeSet;
 
 pub struct Words {
     exhausted: bool,
     query_graph: Option<QueryGraph>,
     iterating: bool,
     positions_to_remove: Vec<i8>,
     terms_matching_strategy: TermsMatchingStrategy,
-    node_docids_cache: NodeDocIdsCache,
 }
 impl Words {
     pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
@@ -25,20 +20,17 @@ impl Words {
             iterating: false,
             positions_to_remove: vec![],
             terms_matching_strategy,
-            node_docids_cache: <_>::default(),
         }
     }
 }
 
-impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
+impl<'search> RankingRule<'search, QueryGraph> for Words {
     fn id(&self) -> String {
         "words".to_owned()
     }
     fn start_iteration(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
         _parent_candidates: &RoaringBitmap,
         parent_query_graph: &QueryGraph,
@@ -71,9 +63,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
 
     fn next_bucket(
         &mut self,
-        index: &Index,
-        txn: &'transaction RoTxn,
-        db_cache: &mut DatabaseCache<'transaction>,
+        ctx: &mut SearchContext<'search>,
         logger: &mut dyn SearchLogger<QueryGraph>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
@@ -87,14 +77,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
 
         logger.log_words_state(query_graph);
 
-        let this_bucket = resolve_query_graph(
-            index,
-            txn,
-            db_cache,
-            &mut self.node_docids_cache,
-            query_graph,
-            universe,
-        )?;
+        let this_bucket = resolve_query_graph(ctx, query_graph, universe)?;
 
         let child_query_graph = query_graph.clone();
         loop {
@@ -115,9 +98,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
 
     fn end_iteration(
         &mut self,
-        _index: &Index,
-        _txn: &'transaction RoTxn,
-        _db_cache: &mut DatabaseCache<'transaction>,
+        _ctx: &mut SearchContext<'search>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
     ) {
         self.iterating = false;
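Putting it together, a caller-side sketch that mirrors the updated `search_movies_new` test above. The `execute_search` return type and the `DefaultSearchLogger` path are assumptions inferred from how the tests construct and consume the results:

    use crate::new::logger::DefaultSearchLogger;
    use crate::new::{execute_search, SearchContext};
    use crate::{Index, Result};

    /// Run a query against an open index with the new search pipeline.
    fn run_new_search(index: &Index, query: &str) -> Result<Vec<u32>> {
        let txn = index.read_txn()?;
        // One context per search: it owns the per-search caches and
        // interners, and borrows the index and transaction for 'search.
        let mut ctx = SearchContext::new(index, &txn);
        execute_search(
            &mut ctx,
            query,
            None, // no filter
            0,    // from
            20,   // length
            &mut DefaultSearchLogger,
        )
    }

Compared to the deleted call sites, the only state a caller manages is the transaction and the context built from it; the `DatabaseCache`, both interners, and the node-docids cache are created and dropped together with the `SearchContext`.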