From 85d96d35a857a45ce70f45d5996b5b30fc3ff1c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Wed, 3 May 2023 13:39:19 +0200
Subject: [PATCH 1/6] Highlight ngram matches as well

---
 milli/src/search/new/mod.rs         |  4 ++--
 milli/src/search/new/query_graph.rs | 14 ++++++++++----
 2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 7e8426bf9..cbc085b12 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -397,8 +397,8 @@ pub fn execute_search(
         None
     };
     let bucket_sort_output = if let Some(query_terms) = query_terms {
-        let graph = QueryGraph::from_query(ctx, &query_terms)?;
-        located_query_terms = Some(query_terms);
+        let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
+        located_query_terms = Some(new_located_query_terms);
 
         let ranking_rules = get_ranking_rules_for_query_graph_search(
             ctx,
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index 0e7d5a7f3..dc25d1bc3 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -88,12 +88,15 @@ pub struct QueryGraph {
 }
 
 impl QueryGraph {
-    /// Build the query graph from the parsed user search query.
+    /// Build the query graph from the parsed user search query, return an updated list of the located query terms
+    /// which contains ngrams.
     pub fn from_query(
         ctx: &mut SearchContext,
         // NOTE: the terms here must be consecutive
         terms: &[LocatedQueryTerm],
-    ) -> Result<QueryGraph> {
+    ) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
+        let mut new_located_query_terms = terms.to_vec();
+
         let nbr_typos = number_of_typos_allowed(ctx)?;
 
         let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End];
@@ -107,10 +110,11 @@ impl QueryGraph {
         let original_terms_len = terms.len();
         for term_idx in 0..original_terms_len {
             let mut new_nodes = vec![];
+
             let new_node_idx = add_node(
                 &mut nodes_data,
                 QueryNodeData::Term(LocatedQueryTermSubset {
-                    term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)),
+                    term_subset: QueryTermSubset::full(terms[term_idx].value),
                     positions: terms[term_idx].positions.clone(),
                     term_ids: term_idx as u8..=term_idx as u8,
                 }),
@@ -121,6 +125,7 @@ impl QueryGraph {
                 if let Some(ngram) =
                     query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
                 {
+                    new_located_query_terms.push(ngram.clone());
                     let ngram_idx = add_node(
                         &mut nodes_data,
                         QueryNodeData::Term(LocatedQueryTermSubset {
@@ -136,6 +141,7 @@ impl QueryGraph {
                 if let Some(ngram) =
                     query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
                 {
+                    new_located_query_terms.push(ngram.clone());
                     let ngram_idx = add_node(
                         &mut nodes_data,
                         QueryNodeData::Term(LocatedQueryTermSubset {
@@ -167,7 +173,7 @@ impl QueryGraph {
         let mut graph = QueryGraph { root_node, end_node, nodes };
         graph.build_initial_edges();
 
-        Ok(graph)
+        Ok((graph, new_located_query_terms))
     }
 
     /// Remove the given nodes, connecting all their predecessors to all their successors.

From a37da36766101d10688e9185e9c822a7e6b77ac7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Mon, 8 May 2023 11:52:43 +0200
Subject: [PATCH 2/6] Implement `words` as a graph-based ranking rule and fix
 some bugs

---
 .../search/new/graph_based_ranking_rule.rs    |   8 +-
 milli/src/search/new/logger/visual.rs         |  24 ++--
 milli/src/search/new/mod.rs                   |  11 +-
 .../new/ranking_rule_graph/cheapest_paths.rs  | 105 ++++++++++++------
 .../src/search/new/ranking_rule_graph/mod.rs  |   3 +
 .../new/ranking_rule_graph/words/mod.rs       |  49 ++++++++
 milli/src/search/new/words.rs                 |  87 ---------------
 7 files changed, 148 insertions(+), 139 deletions(-)
 create mode 100644 milli/src/search/new/ranking_rule_graph/words/mod.rs
 delete mode 100644 milli/src/search/new/words.rs

diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index d8f6836e7..dd25ddd4a 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -46,7 +46,7 @@ use super::logger::SearchLogger;
 use super::query_graph::QueryNode;
 use super::ranking_rule_graph::{
     ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph,
-    RankingRuleGraph, RankingRuleGraphTrait, TypoGraph,
+    RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph,
 };
 use super::small_bitmap::SmallBitmap;
 use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
@@ -54,6 +54,12 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::ranking_rule_graph::PathVisitor;
 use crate::{Result, TermsMatchingStrategy};
 
+pub type Words = GraphBasedRankingRule<WordsGraph>;
+impl GraphBasedRankingRule<WordsGraph> {
+    pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
+        Self::new_with_id("words".to_owned(), Some(terms_matching_strategy))
+    }
+}
 pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
 impl GraphBasedRankingRule<ProximityGraph> {
     pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs
index 1cbe007d3..f76782e63 100644
--- a/milli/src/search/new/logger/visual.rs
+++ b/milli/src/search/new/logger/visual.rs
@@ -4,7 +4,6 @@ use std::io::{BufWriter, Write};
 use std::path::{Path, PathBuf};
 use std::time::Instant;
 
-// use rand::random;
 use roaring::RoaringBitmap;
 
 use crate::search::new::interner::Interned;
@@ -13,6 +12,7 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::ranking_rule_graph::{
     Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition,
     ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph,
+    WordsCondition, WordsGraph,
 };
 use crate::search::new::ranking_rules::BoxRankingRule;
 use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};
@@ -24,11 +24,12 @@ pub enum SearchEvents {
     RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
     RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
     ExtendResults { new: Vec<u32> },
-    WordsGraph { query_graph: QueryGraph },
     ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
     ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
     TypoGraph { graph: RankingRuleGraph<TypoGraph> },
     TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> },
+    WordsGraph { graph: RankingRuleGraph<WordsGraph> },
+    WordsPaths { paths: Vec<Vec<Interned<WordsCondition>>> },
     FidGraph { graph: RankingRuleGraph<FidGraph> },
     FidPaths { paths: Vec<Vec<Interned<FidCondition>>> },
     PositionGraph { graph: RankingRuleGraph<PositionGraph> },
@@ -139,8 +140,11 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
         let Some(location) = self.location.last() else { return };
         match location {
             Location::Words => {
-                if let Some(query_graph) = state.downcast_ref::<QueryGraph>() {
-                    self.events.push(SearchEvents::WordsGraph { query_graph: query_graph.clone() });
+                if let Some(graph) = state.downcast_ref::<RankingRuleGraph<WordsGraph>>() {
+                    self.events.push(SearchEvents::WordsGraph { graph: graph.clone() });
+                }
+                if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<WordsCondition>>>>() {
+                    self.events.push(SearchEvents::WordsPaths { paths: paths.clone() });
                 }
             }
             Location::Typo => {
@@ -329,7 +333,6 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
             SearchEvents::ExtendResults { new } => {
                 self.write_extend_results(new)?;
             }
-            SearchEvents::WordsGraph { query_graph } => self.write_words_graph(query_graph)?,
             SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?,
             SearchEvents::ProximityPaths { paths } => {
                 self.write_rr_graph_paths::<ProximityGraph>(paths)?;
@@ -338,6 +341,10 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
             SearchEvents::TypoPaths { paths } => {
                 self.write_rr_graph_paths::<TypoGraph>(paths)?;
             }
+            SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?,
+            SearchEvents::WordsPaths { paths } => {
+                self.write_rr_graph_paths::<WordsGraph>(paths)?;
+            }
             SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?,
             SearchEvents::FidPaths { paths } => {
                 self.write_rr_graph_paths::<FidGraph>(paths)?;
@@ -482,13 +489,6 @@ fill: \"#B6E2D3\"
         }
         Ok(())
     }
-    fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> {
-        self.make_new_file_for_internal_state_if_needed()?;
-
-        self.write_query_graph(&qg)?;
-
-        Ok(())
-    }
     fn write_rr_graph<R: RankingRuleGraphTrait>(
         &mut self,
         graph: &RankingRuleGraph<R>,
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index cbc085b12..a28f42f35 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -15,11 +15,7 @@ mod resolve_query_graph;
 mod small_bitmap;
 
 mod exact_attribute;
-// TODO: documentation + comments
-// implementation is currently an adaptation of the previous implementation to fit with the new model
 mod sort;
-// TODO: documentation + comments
-mod words;
 
 #[cfg(test)]
 mod tests;
@@ -43,10 +39,10 @@ use ranking_rules::{
 use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
 use roaring::RoaringBitmap;
 use sort::Sort;
-use words::Words;
 
 use self::geo_sort::GeoSort;
 pub use self::geo_sort::Strategy as GeoSortStrategy;
+use self::graph_based_ranking_rule::Words;
 use self::interner::Interned;
 use crate::search::new::distinct::apply_distinct_rule;
 use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
@@ -202,6 +198,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
     let mut sorted_fields = HashSet::new();
     let mut geo_sorted = false;
 
+    // Don't add the `words` ranking rule if the term matching strategy is `All`
+    if matches!(terms_matching_strategy, TermsMatchingStrategy::All) {
+        words = true;
+    }
+
     let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![];
     let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
     for rr in settings_ranking_rules {
diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
index c065cc706..30caf0017 100644
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -205,18 +205,12 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
 impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
     pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> {
         let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]);
-        let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len());
 
-        let mut node_stack = VecDeque::new();
-
-        *costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
-
-        for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() {
-            node_stack.push_back(prev_node);
-            enqueued.insert(prev_node);
-        }
-
-        while let Some(cur_node) = node_stack.pop_front() {
+        self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| {
+            if cur_node == self.query_graph.end_node {
+                *costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
+                return true;
+            }
             let mut self_costs = Vec::<u64>::new();
 
             let cur_node_edges = &self.edges_of_node.get(cur_node);
@@ -232,13 +226,8 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
             self_costs.dedup();
 
             *costs_to_end.get_mut(cur_node) = self_costs;
-            for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
-                if !enqueued.contains(prev_node) {
-                    node_stack.push_back(prev_node);
-                    enqueued.insert(prev_node);
-                }
-            }
-        }
+            true
+        });
         costs_to_end
     }
 
@@ -247,17 +236,9 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
         node_with_removed_outgoing_conditions: Interned<QueryNode>,
         costs: &mut MappedInterner<QueryNode, Vec<u64>>,
     ) {
-        let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len());
-        let mut node_stack = VecDeque::new();
-
-        enqueued.insert(node_with_removed_outgoing_conditions);
-        node_stack.push_back(node_with_removed_outgoing_conditions);
-
-        'main_loop: while let Some(cur_node) = node_stack.pop_front() {
+        self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
             let mut costs_to_remove = FxHashSet::default();
-            for c in costs.get(cur_node) {
-                costs_to_remove.insert(*c);
-            }
+            costs_to_remove.extend(costs.get(cur_node).iter().copied());
 
             let cur_node_edges = &self.edges_of_node.get(cur_node);
             for edge_idx in cur_node_edges.iter() {
@@ -265,23 +246,79 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
                 for cost in costs.get(edge.dest_node).iter() {
                     costs_to_remove.remove(&(*cost + edge.cost as u64));
                     if costs_to_remove.is_empty() {
-                        continue 'main_loop;
+                        return false;
                     }
                 }
             }
             if costs_to_remove.is_empty() {
-                continue 'main_loop;
+                return false;
             }
             let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
             for c in costs_to_remove {
                 new_costs.remove(&c);
             }
             *costs.get_mut(cur_node) = new_costs.into_iter().collect();
+            true
+        });
+    }
 
-            for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
-                if !enqueued.contains(prev_node) {
-                    node_stack.push_back(prev_node);
-                    enqueued.insert(prev_node);
+    /// Traverse the graph backwards from the given node such that every time
+    /// a node is visited, we are guaranteed that all its successors either:
+    /// 1. have already been visited; OR
+    /// 2. were not reachable from the given node
+    pub fn traverse_breadth_first_backward(
+        &self,
+        from: Interned<QueryNode>,
+        mut visit: impl FnMut(Interned<QueryNode>) -> bool,
+    ) {
+        let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
+        {
+            // go backward to get the set of all reachable nodes from the given node
+            // the nodes that are not reachable will be set as `visited`
+            let mut stack = VecDeque::new();
+            let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
+            enqueued.insert(from);
+            stack.push_back(from);
+            while let Some(n) = stack.pop_front() {
+                if reachable.contains(n) {
+                    continue;
+                }
+                reachable.insert(n);
+                for prev_node in self.query_graph.nodes.get(n).predecessors.iter() {
+                    if !enqueued.contains(prev_node) && !reachable.contains(prev_node) {
+                        stack.push_back(prev_node);
+                        enqueued.insert(prev_node);
+                    }
+                }
+            }
+        };
+        let mut unreachable_or_visited =
+            SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
+        for (n, _) in self.query_graph.nodes.iter() {
+            if !reachable.contains(n) {
+                unreachable_or_visited.insert(n);
+            }
+        }
+
+        let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
+        let mut stack = VecDeque::new();
+
+        enqueued.insert(from);
+        stack.push_back(from);
+
+        while let Some(cur_node) = stack.pop_front() {
+            if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) {
+                stack.push_back(cur_node);
+                continue;
+            }
+            unreachable_or_visited.insert(cur_node);
+            if visit(cur_node) {
+                for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
+                    if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node)
+                    {
+                        stack.push_back(prev_node);
+                        enqueued.insert(prev_node);
+                    }
                 }
             }
         }
diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index f60c481de..8de455822 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -20,6 +20,8 @@ mod position;
 mod proximity;
 /// Implementation of the `typo` ranking rule
 mod typo;
+/// Implementation of the `words` ranking rule
+mod words;
 
 use std::collections::BTreeSet;
 use std::hash::Hash;
@@ -33,6 +35,7 @@ pub use position::{PositionCondition, PositionGraph};
 pub use proximity::{ProximityCondition, ProximityGraph};
 use roaring::RoaringBitmap;
 pub use typo::{TypoCondition, TypoGraph};
+pub use words::{WordsCondition, WordsGraph};
 
 use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner};
 use super::query_term::LocatedQueryTermSubset;
diff --git a/milli/src/search/new/ranking_rule_graph/words/mod.rs b/milli/src/search/new/ranking_rule_graph/words/mod.rs
new file mode 100644
index 000000000..0a0cc112b
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/words/mod.rs
@@ -0,0 +1,49 @@
+use roaring::RoaringBitmap;
+
+use super::{ComputedCondition, RankingRuleGraphTrait};
+use crate::search::new::interner::{DedupInterner, Interned};
+use crate::search::new::query_term::LocatedQueryTermSubset;
+use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
+use crate::search::new::SearchContext;
+use crate::Result;
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct WordsCondition {
+    term: LocatedQueryTermSubset,
+}
+
+pub enum WordsGraph {}
+
+impl RankingRuleGraphTrait for WordsGraph {
+    type Condition = WordsCondition;
+
+    fn resolve_condition(
+        ctx: &mut SearchContext,
+        condition: &Self::Condition,
+        universe: &RoaringBitmap,
+    ) -> Result<ComputedCondition> {
+        let WordsCondition { term, .. } = condition;
+        // maybe compute_query_term_subset_docids should accept a universe as argument
+        let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
+        docids &= universe;
+
+        Ok(ComputedCondition {
+            docids,
+            universe_len: universe.len(),
+            start_term_subset: None,
+            end_term_subset: term.clone(),
+        })
+    }
+
+    fn build_edges(
+        _ctx: &mut SearchContext,
+        conditions_interner: &mut DedupInterner<Self::Condition>,
+        _from: Option<&LocatedQueryTermSubset>,
+        to_term: &LocatedQueryTermSubset,
+    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
+        Ok(vec![(
+            to_term.term_ids.len() as u32,
+            conditions_interner.insert(WordsCondition { term: to_term.clone() }),
+        )])
+    }
+}
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
deleted file mode 100644
index 72b7b5916..000000000
--- a/milli/src/search/new/words.rs
+++ /dev/null
@@ -1,87 +0,0 @@
-use roaring::RoaringBitmap;
-
-use super::logger::SearchLogger;
-use super::query_graph::QueryNode;
-use super::resolve_query_graph::compute_query_graph_docids;
-use super::small_bitmap::SmallBitmap;
-use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
-use crate::{Result, TermsMatchingStrategy};
-
-pub struct Words {
-    exhausted: bool, // TODO: remove
-    query_graph: Option<QueryGraph>,
-    nodes_to_remove: Vec<SmallBitmap<QueryNode>>,
-    terms_matching_strategy: TermsMatchingStrategy,
-}
-impl Words {
-    pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
-        Self {
-            exhausted: true,
-            query_graph: None,
-            nodes_to_remove: vec![],
-            terms_matching_strategy,
-        }
-    }
-}
-
-impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
-    fn id(&self) -> String {
-        "words".to_owned()
-    }
-    fn start_iteration(
-        &mut self,
-        ctx: &mut SearchContext<'ctx>,
-        _logger: &mut dyn SearchLogger<QueryGraph>,
-        _universe: &RoaringBitmap,
-        parent_query_graph: &QueryGraph,
-    ) -> Result<()> {
-        self.exhausted = false;
-        self.query_graph = Some(parent_query_graph.clone());
-        self.nodes_to_remove = match self.terms_matching_strategy {
-            TermsMatchingStrategy::Last => {
-                let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last(ctx);
-                ns.reverse();
-                ns
-            }
-            TermsMatchingStrategy::All => {
-                vec![]
-            }
-        };
-        Ok(())
-    }
-
-    fn next_bucket(
-        &mut self,
-        ctx: &mut SearchContext<'ctx>,
-        logger: &mut dyn SearchLogger<QueryGraph>,
-        universe: &RoaringBitmap,
-    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
-        if self.exhausted {
-            return Ok(None);
-        }
-        let Some(query_graph) = &mut self.query_graph else { panic!() };
-        logger.log_internal_state(query_graph);
-
-        let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?;
-
-        let child_query_graph = query_graph.clone();
-
-        if self.nodes_to_remove.is_empty() {
-            self.exhausted = true;
-        } else {
-            let nodes_to_remove = self.nodes_to_remove.pop().unwrap();
-            query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::<Vec<_>>());
-        }
-        Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))
-    }
-
-    fn end_iteration(
-        &mut self,
-        _ctx: &mut SearchContext<'ctx>,
-        _logger: &mut dyn SearchLogger<QueryGraph>,
-    ) {
-        self.exhausted = true;
-        self.nodes_to_remove = vec![];
-        self.query_graph = None;
-    }
-}

From 57582688660f7bb0c40865a558933fa4f9a73a27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Tue, 16 May 2023 16:22:23 +0200
Subject: [PATCH 3/6] Don't compute split_words for phrases

---
 milli/src/search/new/logger/visual.rs         |  2 +-
 .../new/query_term/compute_derivations.rs     | 37 ++++++++++++++-----
 milli/src/search/new/query_term/mod.rs        | 17 +++++++--
 .../src/search/new/query_term/parse_query.rs  |  4 +-
 .../search/new/ranking_rule_graph/typo/mod.rs |  2 +-
 5 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs
index f76782e63..8df56da89 100644
--- a/milli/src/search/new/logger/visual.rs
+++ b/milli/src/search/new/logger/visual.rs
@@ -462,7 +462,7 @@ fill: \"#B6E2D3\"
                 shape: class
                 max_nbr_typo: {}",
                     term_subset.description(ctx),
-                    term_subset.max_nbr_typos(ctx)
+                    term_subset.max_typo_cost(ctx)
                 )?;
 
                 for w in term_subset.all_single_words_except_prefix_db(ctx)? {
diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs
index c26c4bc6b..d5dfbbcd0 100644
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo {
 impl Interned<QueryTerm> {
     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
         let s = ctx.term_interner.get_mut(self);
-        if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
+        if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
             assert!(s.two_typo.is_uninit());
             // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
             self.initialize_one_typo_subterm(ctx)?;
             let s = ctx.term_interner.get_mut(self);
             assert!(s.one_typo.is_init());
             s.two_typo = Lazy::Init(TwoTypoTerm::default());
-        } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
+        } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
             assert!(s.two_typo.is_uninit());
             self.initialize_one_and_two_typo_subterm(ctx)?;
             let s = ctx.term_interner.get_mut(self);
@@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word(
                 original: ctx.word_interner.insert(word.to_owned()),
                 ngram_words: None,
                 is_prefix: false,
-                max_nbr_typos: 0,
+                max_levenshtein_distance: 0,
                 zero_typo: <_>::default(),
                 one_typo: Lazy::Init(<_>::default()),
                 two_typo: Lazy::Init(<_>::default()),
@@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word(
     Ok(QueryTerm {
         original: word_interned,
         ngram_words: None,
-        max_nbr_typos: max_typo,
+        max_levenshtein_distance: max_typo,
         is_prefix,
         zero_typo,
         one_typo: Lazy::Uninit,
@@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
 impl Interned<QueryTerm> {
     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
         let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
+
+        let allows_split_words = self_mut.allows_split_words();
+        let QueryTerm {
+            original,
+            is_prefix,
+            one_typo,
+            max_levenshtein_distance: max_nbr_typos,
+            ..
+        } = self_mut;
+
         let original = *original;
         let is_prefix = *is_prefix;
         // let original_str = ctx.word_interner.get(*original).to_owned();
@@ -300,13 +309,17 @@ impl Interned<QueryTerm> {
             })?;
         }
 
-        let original_str = ctx.word_interner.get(original).to_owned();
-        let split_words = find_split_words(ctx, original_str.as_str())?;
+        let split_words = if allows_split_words {
+            let original_str = ctx.word_interner.get(original).to_owned();
+            find_split_words(ctx, original_str.as_str())?
+        } else {
+            None
+        };
 
         let self_mut = ctx.term_interner.get_mut(self);
 
         // Only add the split words to the derivations if:
-        // 1. the term is not an ngram; OR
+        // 1. the term is neither an ngram nor a phrase; OR
         // 2. the term is an ngram, but the split words are different from the ngram's component words
         let split_words = if let Some((ngram_words, split_words)) =
             self_mut.ngram_words.as_ref().zip(split_words.as_ref())
@@ -328,7 +341,13 @@ impl Interned<QueryTerm> {
     }
     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
         let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
+        let QueryTerm {
+            original,
+            is_prefix,
+            two_typo,
+            max_levenshtein_distance: max_nbr_typos,
+            ..
+        } = self_mut;
         let original_str = ctx.word_interner.get(*original).to_owned();
         if two_typo.is_init() {
             return Ok(());
diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs
index bf521d9b2..fb749a797 100644
--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@@ -43,7 +43,7 @@ pub struct QueryTermSubset {
 pub struct QueryTerm {
     original: Interned<String>,
     ngram_words: Option<Vec<Interned<String>>>,
-    max_nbr_typos: u8,
+    max_levenshtein_distance: u8,
     is_prefix: bool,
     zero_typo: ZeroTypoTerm,
     // May not be computed yet
@@ -342,10 +342,16 @@ impl QueryTermSubset {
         }
         None
     }
-    pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
+    pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 {
         let t = ctx.term_interner.get(self.original);
-        match t.max_nbr_typos {
-            0 => 0,
+        match t.max_levenshtein_distance {
+            0 => {
+                if t.allows_split_words() {
+                    1
+                } else {
+                    0
+                }
+            }
             1 => {
                 if self.one_typo_subset.is_empty() {
                     0
@@ -438,6 +444,9 @@ impl QueryTerm {
 
         self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
     }
+    fn allows_split_words(&self) -> bool {
+        self.zero_typo.phrase.is_none()
+    }
 }
 
 impl Interned<QueryTerm> {
diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs
index dc317a0fb..bf90748e4 100644
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@@ -217,7 +217,7 @@ pub fn make_ngram(
         original: ngram_str_interned,
         ngram_words: Some(words_interned),
         is_prefix,
-        max_nbr_typos,
+        max_levenshtein_distance: max_nbr_typos,
         zero_typo: term.zero_typo,
         one_typo: Lazy::Uninit,
         two_typo: Lazy::Uninit,
@@ -271,7 +271,7 @@ impl PhraseBuilder {
                 QueryTerm {
                     original: ctx.word_interner.insert(phrase_desc),
                     ngram_words: None,
-                    max_nbr_typos: 0,
+                    max_levenshtein_distance: 0,
                     is_prefix: false,
                     zero_typo: ZeroTypoTerm {
                         phrase: Some(phrase),
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index da5198c23..a44be6015 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph {
         // 3-gram -> equivalent to 2 typos
         let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
 
-        for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
+        for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) {
             let mut term = term.clone();
             match nbr_typos {
                 0 => {

From ec8f685d8404673e4961daeec364a993c46124e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Tue, 16 May 2023 17:01:30 +0200
Subject: [PATCH 4/6] Fix bug in cheapest path algorithm

---
 .../new/ranking_rule_graph/cheapest_paths.rs  | 22 ++++++++-----------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
index 30caf0017..4a696b3dd 100644
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -209,7 +209,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
         self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| {
             if cur_node == self.query_graph.end_node {
                 *costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
-                return true;
+                return;
             }
             let mut self_costs = Vec::<u64>::new();
 
@@ -226,7 +226,6 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
             self_costs.dedup();
 
             *costs_to_end.get_mut(cur_node) = self_costs;
-            true
         });
         costs_to_end
     }
@@ -246,19 +245,18 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
                 for cost in costs.get(edge.dest_node).iter() {
                     costs_to_remove.remove(&(*cost + edge.cost as u64));
                     if costs_to_remove.is_empty() {
-                        return false;
+                        return;
                     }
                 }
             }
             if costs_to_remove.is_empty() {
-                return false;
+                return;
             }
             let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
             for c in costs_to_remove {
                 new_costs.remove(&c);
             }
             *costs.get_mut(cur_node) = new_costs.into_iter().collect();
-            true
         });
     }
 
@@ -269,7 +267,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
     pub fn traverse_breadth_first_backward(
         &self,
         from: Interned<QueryNode>,
-        mut visit: impl FnMut(Interned<QueryNode>) -> bool,
+        mut visit: impl FnMut(Interned<QueryNode>),
     ) {
         let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
         {
@@ -312,13 +310,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
                 continue;
             }
             unreachable_or_visited.insert(cur_node);
-            if visit(cur_node) {
-                for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
-                    if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node)
-                    {
-                        stack.push_back(prev_node);
-                        enqueued.insert(prev_node);
-                    }
+            visit(cur_node);
+            for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
+                if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) {
+                    stack.push_back(prev_node);
+                    enqueued.insert(prev_node);
                 }
             }
         }

From a490a11325cebfde7ead969c95ebc95af78e1352 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 23 May 2023 15:24:24 +0200
Subject: [PATCH 5/6] Add explanatory comment on the way we're recomputing
 costs

---
 milli/src/search/new/ranking_rule_graph/cheapest_paths.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
index 4a696b3dd..738b53016 100644
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -235,6 +235,9 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
         node_with_removed_outgoing_conditions: Interned<QueryNode>,
         costs: &mut MappedInterner<QueryNode, Vec<u64>>,
     ) {
+        // Traverse the graph backward from the target node, recomputing the cost for each of its predecessors.
+        // We first check that no other node is contributing the same total cost to a predecessor before removing
+        // the cost from the predecessor. 
         self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
             let mut costs_to_remove = FxHashSet::default();
             costs_to_remove.extend(costs.get(cur_node).iter().copied());

From 51043f78f0159ab18f7d64051aa076514b5019aa Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 23 May 2023 15:27:25 +0200
Subject: [PATCH 6/6] Remove trailing whitespace

---
 milli/src/search/new/ranking_rule_graph/cheapest_paths.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
index 738b53016..8fd943e6e 100644
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -237,7 +237,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
     ) {
         // Traverse the graph backward from the target node, recomputing the cost for each of its predecessors.
         // We first check that no other node is contributing the same total cost to a predecessor before removing
-        // the cost from the predecessor. 
+        // the cost from the predecessor.
         self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
             let mut costs_to_remove = FxHashSet::default();
             costs_to_remove.extend(costs.get(cur_node).iter().copied());