From 223e82a10d5f99c89b8e948999d7d97be7052cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 30 Mar 2023 11:05:51 +0200 Subject: [PATCH] Update QueryGraph to use new lazy query terms + build from paths --- milli/src/search/new/query_graph.rs | 383 +++++++++++++++++----------- 1 file changed, 241 insertions(+), 142 deletions(-) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 0f06b9b95..8f87dfd8c 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,10 +1,14 @@ -use std::collections::HashSet; - use super::interner::{FixedSizeInterner, Interned}; -use super::query_term::{self, number_of_typos_allowed, LocatedQueryTerm}; +use super::query_term::{ + self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, NTypoTermSubset, + QueryTermSubset, +}; use super::small_bitmap::SmallBitmap; use super::SearchContext; +use crate::search::new::interner::DedupInterner; use crate::Result; +use std::cmp::Ordering; +use std::collections::BTreeMap; /// A node of the [`QueryGraph`]. /// @@ -21,9 +25,9 @@ pub struct QueryNode { pub predecessors: SmallBitmap, pub successors: SmallBitmap, } -#[derive(Clone)] +#[derive(Clone, PartialEq, Eq, Hash)] pub enum QueryNodeData { - Term(LocatedQueryTerm), + Term(LocatedQueryTermSubset), Deleted, Start, End, @@ -83,51 +87,15 @@ pub struct QueryGraph { pub nodes: FixedSizeInterner, } -// impl Default for QueryGraph { -// /// Create a new QueryGraph with two disconnected nodes: the root and end nodes. -// fn default() -> Self { -// let nodes = vec![ -// QueryNode { -// data: QueryNodeData::Start, -// predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), -// successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), -// }, -// QueryNode { -// data: QueryNodeData::End, -// predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), -// successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), -// }, -// ]; - -// Self { root_node: 0, end_node: 1, nodes } -// } -// } - -impl QueryGraph { - /// Connect all the given predecessor nodes to the given successor node - fn connect_to_node( - &mut self, - from_nodes: &[Interned], - to_node: Interned, - ) { - for &from_node in from_nodes { - self.nodes.get_mut(from_node).successors.insert(to_node); - self.nodes.get_mut(to_node).predecessors.insert(from_node); - } - } -} - impl QueryGraph { /// Build the query graph from the parsed user search query. - /// - /// The ngrams are made at this point. - pub fn from_query(ctx: &mut SearchContext, terms: Vec) -> Result { + pub fn from_query( + ctx: &mut SearchContext, + // NOTE: the terms here must be consecutive + terms: &[LocatedQueryTerm], + ) -> Result { let nbr_typos = number_of_typos_allowed(ctx)?; - let mut empty_nodes = vec![]; - - let mut predecessors: Vec> = vec![HashSet::new(), HashSet::new()]; - let mut successors: Vec> = vec![HashSet::new(), HashSet::new()]; let mut nodes_data: Vec = vec![QueryNodeData::Start, QueryNodeData::End]; let root_node = 0; let end_node = 1; @@ -136,21 +104,23 @@ impl QueryGraph { let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![root_node]); - for term_idx in 0..terms.len() { - let term0 = &terms[term_idx]; - + let original_terms_len = terms.len(); + for term_idx in 0..original_terms_len { let mut new_nodes = vec![]; let new_node_idx = add_node( &mut nodes_data, - QueryNodeData::Term(term0.clone()), - &prev0, - &mut successors, - &mut predecessors, + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset: QueryTermSubset { + original: Interned::from_raw(term_idx as u16), + zero_typo_subset: NTypoTermSubset::All, + one_typo_subset: NTypoTermSubset::All, + two_typo_subset: NTypoTermSubset::All, + }, + positions: terms[term_idx].positions.clone(), + term_ids: term_idx as u8..=term_idx as u8, + }), ); new_nodes.push(new_node_idx); - if term0.is_empty(&ctx.term_interner) { - empty_nodes.push(new_node_idx); - } if !prev1.is_empty() { if let Some(ngram) = @@ -158,10 +128,16 @@ impl QueryGraph { { let ngram_idx = add_node( &mut nodes_data, - QueryNodeData::Term(ngram), - &prev1, - &mut successors, - &mut predecessors, + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset: QueryTermSubset { + original: ngram.value, + zero_typo_subset: NTypoTermSubset::All, + one_typo_subset: NTypoTermSubset::All, + two_typo_subset: NTypoTermSubset::All, + }, + positions: ngram.positions, + term_ids: term_idx as u8 - 1..=term_idx as u8, + }), ); new_nodes.push(ngram_idx); } @@ -172,10 +148,16 @@ impl QueryGraph { { let ngram_idx = add_node( &mut nodes_data, - QueryNodeData::Term(ngram), - &prev2, - &mut successors, - &mut predecessors, + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset: QueryTermSubset { + original: ngram.value, + zero_typo_subset: NTypoTermSubset::All, + one_typo_subset: NTypoTermSubset::All, + two_typo_subset: NTypoTermSubset::All, + }, + positions: ngram.positions, + term_ids: term_idx as u8 - 2..=term_idx as u8, + }), ); new_nodes.push(ngram_idx); } @@ -193,35 +175,17 @@ impl QueryGraph { successors: SmallBitmap::new(nodes_data.len() as u16), }, ); - for (node_idx, ((node_data, predecessors), successors)) in nodes_data - .into_iter() - .zip(predecessors.into_iter()) - .zip(successors.into_iter()) - .enumerate() - { + for (node_idx, node_data) in nodes_data.into_iter().enumerate() { let node = nodes.get_mut(Interned::from_raw(node_idx as u16)); node.data = node_data; - for x in predecessors { - node.predecessors.insert(Interned::from_raw(x)); - } - for x in successors { - node.successors.insert(Interned::from_raw(x)); - } } let mut graph = QueryGraph { root_node, end_node, nodes }; - - graph.connect_to_node( - prev0.into_iter().map(Interned::from_raw).collect::>().as_slice(), - end_node, - ); - let empty_nodes = empty_nodes.into_iter().map(Interned::from_raw).collect::>(); - graph.remove_nodes_keep_edges(&empty_nodes); + graph.rebuild_edges(); Ok(graph) } /// Remove the given nodes and all their edges from the query graph. - /// TODO: need to check where this is used, and if this is correct. pub fn remove_nodes(&mut self, nodes: &[Interned]) { for &node_id in nodes { let node = &self.nodes.get(node_id); @@ -240,85 +204,220 @@ impl QueryGraph { node.predecessors.clear(); node.successors.clear(); } + self.rebuild_edges(); } - /// Remove the given nodes, connecting all their predecessors to all their successors. - pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned]) { - for &node_id in nodes { - let node = self.nodes.get(node_id); - let old_node_pred = node.predecessors.clone(); - let old_node_succ = node.successors.clone(); - for pred in old_node_pred.iter() { - let pred_successors = &mut self.nodes.get_mut(pred).successors; - pred_successors.remove(node_id); - pred_successors.union(&old_node_succ); - } - for succ in old_node_succ.iter() { - let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors; - succ_predecessors.remove(node_id); - succ_predecessors.union(&old_node_pred); - } - let node = self.nodes.get_mut(node_id); - node.data = QueryNodeData::Deleted; - node.predecessors.clear(); + + fn rebuild_edges(&mut self) { + for (_, node) in self.nodes.iter_mut() { node.successors.clear(); + node.predecessors.clear(); + } + for node_id in self.nodes.indexes() { + let node = self.nodes.get(node_id); + let end_position = match &node.data { + QueryNodeData::Term(term) => *term.positions.end(), + QueryNodeData::Start => -1, + QueryNodeData::Deleted => continue, + QueryNodeData::End => continue, + }; + let successors = { + let mut successors = SmallBitmap::for_interned_values_in(&self.nodes); + let mut min = i8::MAX; + for (node_id, node) in self.nodes.iter() { + let start_position = match &node.data { + QueryNodeData::Term(term) => *term.positions.start(), + QueryNodeData::End => i8::MAX, + QueryNodeData::Start => continue, + QueryNodeData::Deleted => continue, + }; + if start_position <= end_position { + continue; + } + match start_position.cmp(&min) { + Ordering::Less => { + min = start_position; + successors.clear(); + successors.insert(node_id); + } + Ordering::Equal => { + successors.insert(node_id); + } + Ordering::Greater => continue, + } + } + successors + }; + let node = self.nodes.get_mut(node_id); + node.successors = successors.clone(); + for successor in successors.iter() { + let successor = self.nodes.get_mut(successor); + successor.predecessors.insert(node_id); + } } } - /// Remove all the nodes that correspond to a word starting at the given position, and connect - /// the predecessors of these nodes to their successors. - /// Return `true` if any node was removed. + /// Remove all the nodes that correspond to a word starting at the given position and rebuild + /// the edges of the graph appropriately. pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool { - let mut nodes_to_remove_keeping_edges = vec![]; + let mut nodes_to_remove = vec![]; for (node_idx, node) in self.nodes.iter() { - let QueryNodeData::Term(LocatedQueryTerm { value: _, positions }) = &node.data else { continue }; + let QueryNodeData::Term(LocatedQueryTermSubset { term_subset: _, positions, term_ids: _ }) = &node.data else { continue }; if positions.start() == &position { - nodes_to_remove_keeping_edges.push(node_idx); + nodes_to_remove.push(node_idx); } } - self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges); + self.remove_nodes(&nodes_to_remove); - self.simplify(); - !nodes_to_remove_keeping_edges.is_empty() + !nodes_to_remove.is_empty() } - /// Simplify the query graph by removing all nodes that are disconnected from - /// the start or end nodes. - pub fn simplify(&mut self) { - loop { - let mut nodes_to_remove = vec![]; - for (node_idx, node) in self.nodes.iter() { - if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted) - && node.successors.is_empty()) - || (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted) - && node.predecessors.is_empty()) - { - nodes_to_remove.push(node_idx); + pub fn removal_order_for_terms_matching_strategy_last(&self) -> Vec> { + let (first_term_idx, last_term_idx) = { + let mut first_term_idx = u8::MAX; + let mut last_term_idx = 0u8; + for (_, node) in self.nodes.iter() { + match &node.data { + QueryNodeData::Term(t) => { + if *t.term_ids.end() > last_term_idx { + last_term_idx = *t.term_ids.end(); + } + if *t.term_ids.start() < first_term_idx { + first_term_idx = *t.term_ids.start(); + } + } + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, } } - if nodes_to_remove.is_empty() { - break; - } else { - self.remove_nodes(&nodes_to_remove); - } + (first_term_idx, last_term_idx) + }; + if first_term_idx >= last_term_idx { + return vec![]; } + let cost_of_term_idx = |term_idx: u8| { + if term_idx == first_term_idx { + None + } else { + let rank = 1 + last_term_idx - term_idx; + Some(rank as u16) + } + }; + let mut nodes_to_remove = BTreeMap::>::new(); + for (node_id, node) in self.nodes.iter() { + let QueryNodeData::Term(t) = &node.data else { continue }; + let mut cost = 0; + for id in t.term_ids.clone() { + if let Some(t_cost) = cost_of_term_idx(id) { + cost += t_cost; + } else { + continue; + } + } + nodes_to_remove + .entry(cost) + .or_insert_with(|| SmallBitmap::for_interned_values_in(&self.nodes)) + .insert(node_id); + } + nodes_to_remove.into_values().collect() } } -fn add_node( - nodes_data: &mut Vec, - node_data: QueryNodeData, - from_nodes: &Vec, - successors: &mut Vec>, - predecessors: &mut Vec>, -) -> u16 { - successors.push(HashSet::new()); - predecessors.push(HashSet::new()); +fn add_node(nodes_data: &mut Vec, node_data: QueryNodeData) -> u16 { let new_node_idx = nodes_data.len() as u16; nodes_data.push(node_data); - for &from_node in from_nodes { - successors[from_node as usize].insert(new_node_idx); - predecessors[new_node_idx as usize].insert(from_node); - } new_node_idx } + +impl QueryGraph { + /* + Build a query graph from a list of paths + + The paths are composed of source and dest terms. + If the source term is `None`, then the last dest term is used + as the predecessor of the dest term. If the source is Some(_), + then an edge is built between the last dest term and the source, + and between the source and new dest term. + + Note that the resulting graph will not correspond to a perfect + representation of the set of paths. + For example, consider the following paths: + ```txt + PATH 1 : a -> b1 -> c1 -> d -> e1 + PATH 2 : a -> b2 -> c2 -> d -> e2 + ``` + Then the resulting graph will be: + ```txt + ┌────┐ ┌────┐ ┌────┐ + ┌──│ b1 │──│ c1 │─┐ ┌──│ e1 │ + ┌────┐ │ └────┘ └────┘ │ ┌────┐ │ └────┘ + │ a │─┤ ├─│ d │─┤ + └────┘ │ ┌────┐ ┌────┐ │ └────┘ │ ┌────┐ + └──│ b2 │──│ c2 │─┘ └──│ e2 │ + └────┘ └────┘ └────┘ + ``` + which is different from the fully correct representation: + ```txt + ┌────┐ ┌────┐ ┌────┐ ┌────┐ + ┌──│ b1 │──│ c1 │───│ d │───│ e1 │ + ┌────┐ │ └────┘ └────┘ └────┘ └────┘ + │ a │─┤ + └────┘ │ ┌────┐ ┌────┐ ┌────┐ ┌────┐ + └──│ b2 │──│ c2 │───│ d │───│ e2 │ + └────┘ └────┘ └────┘ └────┘ + ``` + But we accept the first representation as it reduces the size + of the graph and shouldn't cause much problems. + */ + pub fn build_from_paths( + paths: Vec, LocatedQueryTermSubset)>>, + ) -> Self { + let mut node_data = DedupInterner::default(); + let root_node = node_data.insert(QueryNodeData::Start); + let end_node = node_data.insert(QueryNodeData::End); + + let mut paths_with_ids = vec![]; + for path in paths { + let mut path_with_ids = vec![]; + for node in path { + let (start_term, end_term) = node; + let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x))); + let dest_node_id = node_data.insert(QueryNodeData::Term(end_term)); + path_with_ids.push((src_node_id, dest_node_id)); + } + paths_with_ids.push(path_with_ids); + } + let nodes_data = node_data.freeze(); + let nodes_data_len = nodes_data.len(); + let mut nodes = nodes_data.map_move(|n| QueryNode { + data: n, + predecessors: SmallBitmap::new(nodes_data_len), + successors: SmallBitmap::new(nodes_data_len), + }); + + let root_node = Interned::from_raw(root_node.into_raw()); + let end_node = Interned::from_raw(end_node.into_raw()); + + for path in paths_with_ids { + let mut prev_node = root_node; + for node in path { + let (start_term, dest_term) = node; + let end_term = Interned::from_raw(dest_term.into_raw()); + let src = if let Some(start_term) = start_term { + let start_term = Interned::from_raw(start_term.into_raw()); + nodes.get_mut(prev_node).successors.insert(start_term); + nodes.get_mut(start_term).predecessors.insert(prev_node); + start_term + } else { + prev_node + }; + nodes.get_mut(src).successors.insert(end_term); + nodes.get_mut(end_term).predecessors.insert(src); + prev_node = end_term; + } + nodes.get_mut(prev_node).successors.insert(end_node); + nodes.get_mut(end_node).predecessors.insert(prev_node); + } + + QueryGraph { root_node, end_node, nodes } + } +}