From cab2b6bcdac8291c5a0ab71ec5becec488ad98a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 6 Mar 2023 08:35:01 +0100 Subject: [PATCH] Fix: computation of initial universe, code organisation --- .../search/new/graph_based_ranking_rule.rs | 4 +- milli/src/search/new/logger/detailed.rs | 54 ++-- milli/src/search/new/logger/mod.rs | 23 +- milli/src/search/new/mod.rs | 124 +++++--- milli/src/search/new/query_graph.rs | 22 +- milli/src/search/new/query_term.rs | 275 ++++++++++-------- .../ranking_rule_graph/edge_docids_cache.rs | 3 +- .../src/search/new/ranking_rule_graph/mod.rs | 23 +- .../new/ranking_rule_graph/proximity/build.rs | 1 + milli/src/search/new/ranking_rules.rs | 75 ++--- milli/src/search/new/words.rs | 12 +- 11 files changed, 341 insertions(+), 275 deletions(-) diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index e5a0fbad6..d1f5864aa 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -3,8 +3,8 @@ use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; use super::logger::SearchLogger; -use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache; -use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; +use super::ranking_rule_graph::EdgeDocidsCache; +use super::ranking_rule_graph::EmptyPathsCache; use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index c2415837d..4282db27f 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -5,13 +5,13 @@ use std::fs::File; use std::time::Instant; use std::{io::Write, path::PathBuf}; -use crate::new::ranking_rule_graph::typo::TypoGraph; +use crate::new::ranking_rule_graph::TypoGraph; use crate::new::{QueryNode, QueryGraph}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; -use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; +use crate::new::ranking_rule_graph::EmptyPathsCache; use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::ranking_rule_graph::{ - proximity::ProximityGraph, RankingRuleGraph, + ProximityGraph, RankingRuleGraph, }; use super::{RankingRule, SearchLogger}; @@ -21,18 +21,18 @@ pub enum SearchEvents { ranking_rule_idx: usize, query: QueryGraph, universe: RoaringBitmap, - time: Instant, + time: Instant }, RankingRuleNextBucket { ranking_rule_idx: usize, universe: RoaringBitmap, candidates: RoaringBitmap, - time: Instant, + time: Instant }, RankingRuleEndIteration { ranking_rule_idx: usize, universe: RoaringBitmap, - time: Instant, + time: Instant }, ExtendResults { new: Vec, @@ -56,13 +56,14 @@ pub enum SearchEvents { distances: Vec>, cost: u64, }, - RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant, }, + RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant }, } pub struct DetailedSearchLogger { folder_path: PathBuf, initial_query: Option, initial_query_time: Option, + query_for_universe: Option, initial_universe: Option, ranking_rules_ids: Option>, events: Vec, @@ -73,6 +74,7 @@ impl DetailedSearchLogger { folder_path: PathBuf::new().join(folder_path), initial_query: None, initial_query_time: None, + query_for_universe: None, initial_universe: 
None, ranking_rules_ids: None, events: vec![], @@ -81,9 +83,13 @@ impl DetailedSearchLogger { } impl SearchLogger for DetailedSearchLogger { - fn initial_query(&mut self, query: &QueryGraph, time: Instant) { + fn initial_query(&mut self, query: &QueryGraph) { self.initial_query = Some(query.clone()); - self.initial_query_time = Some(time); + self.initial_query_time = Some(Instant::now()); + } + + fn query_for_universe(&mut self, query: &QueryGraph) { + self.query_for_universe = Some(query.clone()); } fn initial_universe(&mut self, universe: &RoaringBitmap) { @@ -99,13 +105,13 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, query: &QueryGraph, universe: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleStartIteration { ranking_rule_idx, query: query.clone(), universe: universe.clone(), - time, + time: Instant::now(), }) } @@ -115,13 +121,13 @@ impl SearchLogger for DetailedSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, candidates: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe: universe.clone(), candidates: candidates.clone(), - time, + time: Instant::now(), }) } fn skip_bucket_ranking_rule<'transaction>( @@ -129,12 +135,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, candidates: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates: candidates.clone(), - time + time: Instant::now() }) } @@ -143,12 +149,12 @@ impl SearchLogger for DetailedSearchLogger { ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>, universe: &RoaringBitmap, - time: Instant, + ) { self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe: universe.clone(), - time + time: Instant::now() }) } fn add_to_results(&mut self, docids: &[u32]) { @@ -184,6 +190,20 @@ impl DetailedSearchLogger { let index_path = self.folder_path.join("index.d2"); let mut file = std::fs::File::create(index_path).unwrap(); + writeln!(&mut file, "direction: right").unwrap(); + writeln!(&mut file, "Initial Query Graph: {{").unwrap(); + let initial_query_graph = self.initial_query.as_ref().unwrap(); + Self::query_graph_d2_description(initial_query_graph, &mut file); + writeln!(&mut file, "}}").unwrap(); + + writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap(); + let query_graph_for_universe = self.query_for_universe.as_ref().unwrap(); + Self::query_graph_d2_description(query_graph_for_universe, &mut file); + writeln!(&mut file, "}}").unwrap(); + + let initial_universe = self.initial_universe.as_ref().unwrap(); + writeln!(&mut file, "Initial Universe Length {}", initial_universe.len()).unwrap(); + writeln!(&mut file, "Control Flow Between Ranking Rules: {{").unwrap(); writeln!(&mut file, "shape: sequence_diagram").unwrap(); for (idx, rr_id) in self.ranking_rules_ids.as_ref().unwrap().iter().enumerate() { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 079bb892c..9a141c1c6 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -2,19 +2,17 @@ pub mod detailed; use roaring::RoaringBitmap; -use std::time::Instant; use super::{ - ranking_rule_graph::{ - empty_paths_cache::EmptyPathsCache, proximity::ProximityGraph, typo::TypoGraph, - 
RankingRuleGraph, - }, + ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}, RankingRule, RankingRuleQueryTrait, }; pub struct DefaultSearchLogger; impl SearchLogger for DefaultSearchLogger { - fn initial_query(&mut self, _query: &Q, _time: Instant) {} + fn initial_query(&mut self, _query: &Q) {} + + fn query_for_universe(&mut self, _query: &Q) {} fn initial_universe(&mut self, _universe: &RoaringBitmap) {} @@ -26,7 +24,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, Q>, _query: &Q, _universe: &RoaringBitmap, - _time: Instant, ) { } @@ -36,7 +33,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, _candidates: &RoaringBitmap, - _time: Instant, ) { } fn skip_bucket_ranking_rule<'transaction>( @@ -44,7 +40,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _candidates: &RoaringBitmap, - _time: Instant, ) { } @@ -53,7 +48,6 @@ impl SearchLogger for DefaultSearchLogger { _ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule<'transaction, Q>, _universe: &RoaringBitmap, - _time: Instant, ) { } @@ -85,7 +79,10 @@ impl SearchLogger for DefaultSearchLogger { } pub trait SearchLogger { - fn initial_query(&mut self, query: &Q, time: Instant); + fn initial_query(&mut self, query: &Q); + + fn query_for_universe(&mut self, query: &Q); + fn initial_universe(&mut self, universe: &RoaringBitmap); fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule]); @@ -96,7 +93,6 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, query: &Q, universe: &RoaringBitmap, - time: Instant, ); fn next_bucket_ranking_rule<'transaction>( &mut self, @@ -104,21 +100,18 @@ pub trait SearchLogger { ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, candidates: &RoaringBitmap, - time: Instant, ); fn skip_bucket_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, candidates: &RoaringBitmap, - time: Instant, ); fn end_iteration_ranking_rule<'transaction>( &mut self, ranking_rule_idx: usize, ranking_rule: &dyn RankingRule<'transaction, Q>, universe: &RoaringBitmap, - time: Instant, ); fn add_to_results(&mut self, docids: &[u32]); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 94120cd8a..3e9b43f1b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -9,55 +9,113 @@ mod resolve_query_graph; mod sort; mod words; -use charabia::Tokenize; -use heed::RoTxn; +use std::collections::BTreeSet; -use query_graph::{QueryGraph, QueryNode}; pub use ranking_rules::{ - execute_search, RankingRule, RankingRuleOutput, RankingRuleOutputIter, + apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper, RankingRuleQueryTrait, }; + +use crate::{ + new::query_term::located_query_terms_from_string, Filter, Index, Result, TermsMatchingStrategy, +}; +use charabia::Tokenize; +use db_cache::DatabaseCache; +use heed::RoTxn; +use query_graph::{QueryGraph, QueryNode}; use roaring::RoaringBitmap; -use self::db_cache::DatabaseCache; -use self::query_term::{word_derivations, LocatedQueryTerm}; -use crate::{Index, Result}; +use self::{ + logger::SearchLogger, + resolve_query_graph::{resolve_query_graph, NodeDocIdsCache}, +}; pub enum BitmapOrAllRef<'s> { Bitmap(&'s RoaringBitmap), All, } -pub fn make_query_graph<'transaction>( 
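// NOTE: the removed `make_query_graph` is replaced by two steps: `execute_search`
// builds the graph, and the function below resolves a "maximally reduced" copy of
// it to obtain the initial universe. With `TermsMatchingStrategy::Last`, every
// term position except the first is removed before resolving, so the universe is
// exactly the set of documents that can still match once the `words` rule has
// dropped every optional term (for "the quick fox", only "the" is kept). With
// `TermsMatchingStrategy::All`, nothing is removed and the universe is the set of
// documents matching the full graph.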
+#[allow(clippy::too_many_arguments)] +pub fn resolve_maximally_reduced_query_graph<'transaction>( index: &Index, - txn: &RoTxn, + txn: &'transaction heed::RoTxn, + db_cache: &mut DatabaseCache<'transaction>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + node_docids_cache: &mut NodeDocIdsCache, + matching_strategy: TermsMatchingStrategy, + logger: &mut dyn SearchLogger, +) -> Result { + let mut graph = query_graph.clone(); + let mut positions_to_remove = match matching_strategy { + TermsMatchingStrategy::Last => { + let mut all_positions = BTreeSet::new(); + for n in query_graph.nodes.iter() { + match n { + QueryNode::Term(term) => { + all_positions.extend(term.positions.clone().into_iter()); + } + QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} + } + } + all_positions.into_iter().collect() + } + TermsMatchingStrategy::All => vec![], + }; + // don't remove the first term + positions_to_remove.remove(0); + loop { + if positions_to_remove.is_empty() { + break; + } else { + let position_to_remove = positions_to_remove.pop().unwrap(); + let _ = graph.remove_words_at_position(position_to_remove); + } + } + logger.query_for_universe(&graph); + let docids = resolve_query_graph(index, txn, db_cache, node_docids_cache, &graph, universe)?; + + Ok(docids) +} + +#[allow(clippy::too_many_arguments)] +pub fn execute_search<'transaction>( + index: &Index, + txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, query: &str, -) -> Result { + filters: Option, + from: usize, + length: usize, + logger: &mut dyn SearchLogger, +) -> Result> { assert!(!query.is_empty()); - let authorize_typos = index.authorize_typos(txn)?; - let min_len_one_typo = index.min_word_len_one_typo(txn)?; - let min_len_two_typos = index.min_word_len_two_typos(txn)?; + let query_terms = located_query_terms_from_string(index, txn, query.tokenize(), None).unwrap(); + let graph = QueryGraph::from_query(index, txn, db_cache, query_terms)?; - let exact_words = index.exact_words(txn)?; - let fst = index.words_fst(txn)?; + logger.initial_query(&graph); - // TODO: get rid of this closure - // also, ngrams can have one typo? - let query = LocatedQueryTerm::from_query(query.tokenize(), None, move |word, is_prefix| { - let typos = if !authorize_typos - || word.len() < min_len_one_typo as usize - || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) - { - 0 - } else if word.len() < min_len_two_typos as usize { - 1 - } else { - 2 - }; - word_derivations(index, txn, word, typos, is_prefix, &fst) - }) - .unwrap(); - let graph = QueryGraph::from_query(index, txn, db_cache, query)?; - Ok(graph) + let universe = if let Some(filters) = filters { + filters.evaluate(txn, index)? + } else { + index.documents_ids(txn)? 
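// NOTE: the universe is now computed in two stages: the coarse set above
// (filter results, or all documents), then, just below, its intersection with
// the documents resolved from the maximally reduced query graph. Every ranking
// rule therefore starts from a universe that the `words` rule can only shrink,
// which is the "computation of initial universe" fix from the commit title.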
+ }; + + let mut node_docids_cache = NodeDocIdsCache::default(); + + let universe = resolve_maximally_reduced_query_graph( + index, + txn, + db_cache, + &universe, + &graph, + &mut node_docids_cache, + TermsMatchingStrategy::Last, + logger, + )?; + // TODO: create ranking rules here, reuse the node docids cache for the words ranking rule + + logger.initial_universe(&universe); + + apply_ranking_rules(index, txn, db_cache, &graph, &universe, from, length, logger) } diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 8178f8ded..e86c175af 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -4,7 +4,7 @@ use heed::RoTxn; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; -use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; +use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::{Index, Result}; #[derive(Debug, Clone)] @@ -31,6 +31,7 @@ pub struct QueryGraph { } fn _assert_sizes() { + // TODO: QueryNodes are too big now, 184B is an unreasonable size let _: [u8; 184] = [0; std::mem::size_of::()]; let _: [u8; 48] = [0; std::mem::size_of::()]; } @@ -75,7 +76,7 @@ impl QueryGraph { index: &Index, txn: &RoTxn, _db_cache: &mut DatabaseCache<'transaction>, - query: Vec, + terms: Vec, ) -> Result { // TODO: maybe empty nodes should not be removed here, to compute // the score of the `words` ranking rule correctly @@ -90,8 +91,8 @@ impl QueryGraph { (vec![], vec![], vec![graph.root_node]); // TODO: split words / synonyms - for length in 1..=query.len() { - let query = &query[..length]; + for length in 1..=terms.len() { + let query = &terms[..length]; let term0 = query.last().unwrap(); @@ -104,7 +105,7 @@ impl QueryGraph { if !prev1.is_empty() { if let Some((ngram2_str, ngram2_pos)) = - LocatedQueryTerm::ngram2(&query[length - 2], &query[length - 1]) + query_term::ngram2(&query[length - 2], &query[length - 1]) { if word_set.contains(ngram2_str.as_bytes()) { let ngram2 = LocatedQueryTerm { @@ -128,11 +129,9 @@ impl QueryGraph { } } if !prev2.is_empty() { - if let Some((ngram3_str, ngram3_pos)) = LocatedQueryTerm::ngram3( - &query[length - 3], - &query[length - 2], - &query[length - 1], - ) { + if let Some((ngram3_str, ngram3_pos)) = + query_term::ngram3(&query[length - 3], &query[length - 2], &query[length - 1]) + { if word_set.contains(ngram3_str.as_bytes()) { let ngram3 = LocatedQueryTerm { value: QueryTerm::Word { @@ -143,8 +142,9 @@ impl QueryGraph { one_typo: vec![], two_typos: vec![], use_prefix_db: false, - synonyms: vec![], // TODO: ngram synonyms + synonyms: vec![], // TODO: ngram synonyms split_words: None, // TODO: maybe ngram split words? + // would be nice for typos like su nflower }, }, positions: ngram3_pos, diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 9ea72aa3a..3820b8ed0 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -178,9 +178,15 @@ fn split_best_frequency( #[derive(Debug, Clone)] pub enum QueryTerm { + // TODO: should there be SplitWord, NGram2, and NGram3 variants? + // NGram2 can have 1 typo and synonyms + // NGram3 cannot have typos but can have synonyms + // SplitWords are a phrase + // Can NGrams be prefixes? Phrase { phrase: Phrase }, Word { derivations: WordDerivations }, } + impl QueryTerm { pub fn original_single_word(&self) -> Option<&str> { match self { @@ -209,53 +215,77 @@ impl LocatedQueryTerm { QueryTerm::Word { derivations, .. 
} => derivations.is_empty(), } } - /// Create primitive query from tokenized query string, - /// the primitive query is an intermediate state to build the query tree. - pub fn from_query( - query: NormalizedTokenIter>, - words_limit: Option, - // TODO:` use index + txn + ? instead of closure - derivations: impl Fn(&str, bool) -> Result, - ) -> Result> { - let mut primitive_query = Vec::new(); - let mut phrase = Vec::new(); +} - let mut quoted = false; +pub fn located_query_terms_from_string<'transaction>( + index: &Index, + txn: &'transaction RoTxn, + query: NormalizedTokenIter>, + words_limit: Option, +) -> Result> { + let authorize_typos = index.authorize_typos(txn)?; + let min_len_one_typo = index.min_word_len_one_typo(txn)?; + let min_len_two_typos = index.min_word_len_two_typos(txn)?; - let parts_limit = words_limit.unwrap_or(usize::MAX); + let exact_words = index.exact_words(txn)?; + let fst = index.words_fst(txn)?; - let mut position = -1i8; - let mut phrase_start = -1i8; - let mut phrase_end = -1i8; + let nbr_typos = |word: &str| { + if !authorize_typos + || word.len() < min_len_one_typo as usize + || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) + { + 0 + } else if word.len() < min_len_two_typos as usize { + 1 + } else { + 2 + } + }; - let mut peekable = query.peekable(); - while let Some(token) = peekable.next() { - // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { - return Ok(primitive_query); - } + let derivations = |word: &str, is_prefix: bool| { + word_derivations(index, txn, word, nbr_typos(word), is_prefix, &fst) + }; - match token.kind { - TokenKind::Word | TokenKind::StopWord => { - position += 1; - // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, - // 3. if the word is the last token of the query we push it as a prefix word. - if quoted { - phrase_end = position; - if phrase.is_empty() { - phrase_start = position; - } - if let TokenKind::StopWord = token.kind { - phrase.push(None); - } else { - // TODO: in a phrase, check that every word exists - // otherwise return WordDerivations::Empty - phrase.push(Some(token.lemma().to_string())); - } - } else if peekable.peek().is_some() { - if let TokenKind::StopWord = token.kind { - } else { + let mut primitive_query = Vec::new(); + let mut phrase = Vec::new(); + + let mut quoted = false; + + let parts_limit = words_limit.unwrap_or(usize::MAX); + + let mut position = -1i8; + let mut phrase_start = -1i8; + let mut phrase_end = -1i8; + + let mut peekable = query.peekable(); + while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if primitive_query.len() >= parts_limit { + return Ok(primitive_query); + } + + match token.kind { + TokenKind::Word | TokenKind::StopWord => { + position += 1; + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, + // 3. if the word is the last token of the query we push it as a prefix word. 
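// Worked example for the three cases above: in the query `summer "hot day" ic`,
// `summer` is case 2 (a non-final word, pushed as a non-prefix term), `hot` and
// `day` are case 1 (buffered into the phrase until the closing quote), and `ic`
// is case 3 (the last token, pushed as a prefix so it can still match "ice").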
+ if quoted { + phrase_end = position; + if phrase.is_empty() { + phrase_start = position; + } + if let TokenKind::StopWord = token.kind { + phrase.push(None); + } else { + // TODO: in a phrase, check that every word exists + // otherwise return WordDerivations::Empty + phrase.push(Some(token.lemma().to_string())); + } + } else if peekable.peek().is_some() { + match token.kind { + TokenKind::Word => { let derivations = derivations(token.lemma(), false)?; let located_term = LocatedQueryTerm { value: QueryTerm::Word { derivations }, @@ -263,100 +293,91 @@ impl LocatedQueryTerm { }; primitive_query.push(located_term); } - } else { - let derivations = derivations(token.lemma(), true)?; - let located_term = LocatedQueryTerm { - value: QueryTerm::Word { derivations }, - positions: position..=position, - }; - primitive_query.push(located_term); + TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} + } + } else { + let derivations = derivations(token.lemma(), true)?; + let located_term = LocatedQueryTerm { + value: QueryTerm::Word { derivations }, + positions: position..=position, + }; + primitive_query.push(located_term); + } + } + TokenKind::Separator(separator_kind) => { + match separator_kind { + SeparatorKind::Hard => { + position += 1; + } + SeparatorKind::Soft => { + position += 0; } } - TokenKind::Separator(separator_kind) => { - match separator_kind { - SeparatorKind::Hard => { - position += 1; - } - SeparatorKind::Soft => { - position += 0; - } - } - let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - // swap quoted state if we encounter a double quote - if quote_count % 2 != 0 { - quoted = !quoted; - } - // if there is a quote or a hard separator we close the phrase. - if !phrase.is_empty() - && (quote_count > 0 || separator_kind == SeparatorKind::Hard) - { - let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { - phrase: Phrase { words: mem::take(&mut phrase) }, - }, - positions: phrase_start..=phrase_end, - }; - primitive_query.push(located_query_term); - } + let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + // swap quoted state if we encounter a double quote + if quote_count % 2 != 0 { + quoted = !quoted; + } + // if there is a quote or a hard separator we close the phrase. + if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) + { + let located_query_term = LocatedQueryTerm { + value: QueryTerm::Phrase { + phrase: Phrase { words: mem::take(&mut phrase) }, + }, + positions: phrase_start..=phrase_end, + }; + primitive_query.push(located_query_term); } - _ => (), } + _ => (), } - - // If a quote is never closed, we consider all of the end of the query as a phrase. - if !phrase.is_empty() { - let located_query_term = LocatedQueryTerm { - value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, - positions: phrase_start..=phrase_end, - }; - primitive_query.push(located_query_term); - } - - Ok(primitive_query) } + + // If a quote is never closed, we consider all of the end of the query as a phrase. 
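// e.g. in `red "blue sky` the quote is never closed, so `blue sky` is flushed
// here as a phrase spanning its recorded start and end positions.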
+ if !phrase.is_empty() { + let located_query_term = LocatedQueryTerm { + value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } }, + positions: phrase_start..=phrase_end, + }; + primitive_query.push(located_query_term); + } + + Ok(primitive_query) } -impl LocatedQueryTerm { - pub fn ngram2( - x: &LocatedQueryTerm, - y: &LocatedQueryTerm, - ) -> Option<(String, RangeInclusive)> { - if *x.positions.end() != y.positions.start() - 1 { - println!( - "x positions end: {}, y positions start: {}", - *x.positions.end(), - y.positions.start() - ); - return None; - } - match (&x.value.original_single_word(), &y.value.original_single_word()) { - (Some(w1), Some(w2)) => { - let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); - Some(term) - } - _ => None, - } +// TODO: return a word derivations instead? +pub fn ngram2(x: &LocatedQueryTerm, y: &LocatedQueryTerm) -> Option<(String, RangeInclusive)> { + if *x.positions.end() != y.positions.start() - 1 { + return None; } - pub fn ngram3( - x: &LocatedQueryTerm, - y: &LocatedQueryTerm, - z: &LocatedQueryTerm, - ) -> Option<(String, RangeInclusive)> { - if *x.positions.end() != y.positions.start() - 1 - || *y.positions.end() != z.positions.start() - 1 - { - return None; - } - match ( - &x.value.original_single_word(), - &y.value.original_single_word(), - &z.value.original_single_word(), - ) { - (Some(w1), Some(w2), Some(w3)) => { - let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); - Some(term) - } - _ => None, + match (&x.value.original_single_word(), &y.value.original_single_word()) { + (Some(w1), Some(w2)) => { + let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end()); + Some(term) } + _ => None, + } +} +pub fn ngram3( + x: &LocatedQueryTerm, + y: &LocatedQueryTerm, + z: &LocatedQueryTerm, +) -> Option<(String, RangeInclusive)> { + if *x.positions.end() != y.positions.start() - 1 + || *y.positions.end() != z.positions.start() - 1 + { + return None; + } + match ( + &x.value.original_single_word(), + &y.value.original_single_word(), + &z.value.original_single_word(), + ) { + (Some(w1), Some(w2), Some(w3)) => { + let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end()); + Some(term) + } + _ => None, } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index ef2eba895..3d48fd69c 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -41,11 +41,12 @@ impl EdgeDocidsCache { EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All), EdgeDetails::Data(details) => { if self.cache.contains_key(&edge_index) { + // TODO: should we update the bitmap in the cache if the new universe + // reduces it? 
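// NOTE: the cached bitmap was intersected with whatever universe was current
// when this edge was first computed, so a later call with a smaller universe
// can receive docids outside of it; until the TODO above is settled, callers
// should re-intersect the result with their own universe.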
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here let docids = universe & G::compute_docids(index, txn, db_cache, details)?; - let _ = self.cache.insert(edge_index, docids); let docids = &self.cache[&edge_index]; Ok(BitmapOrAllRef::Bitmap(docids)) diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index ac5e1f46b..e65d5f70b 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -1,19 +1,22 @@ -pub mod build; -pub mod cheapest_paths; -pub mod edge_docids_cache; -pub mod empty_paths_cache; -pub mod paths_map; -pub mod proximity; -pub mod resolve_paths; -pub mod typo; +mod build; +mod cheapest_paths; +mod edge_docids_cache; +mod empty_paths_cache; +mod paths_map; +mod proximity; +mod resolve_paths; +mod typo; + +pub use edge_docids_cache::EdgeDocidsCache; +pub use empty_paths_cache::EmptyPathsCache; +pub use proximity::ProximityGraph; +pub use typo::TypoGraph; use std::ops::ControlFlow; use heed::RoTxn; use roaring::RoaringBitmap; -use self::empty_paths_cache::EmptyPathsCache; - use super::db_cache::DatabaseCache; use super::logger::SearchLogger; use super::{QueryGraph, QueryNode}; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index fbe3c8169..06c860d7e 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -105,6 +105,7 @@ pub fn visit_to_node<'transaction, 'from_data>( assert!(!updb1); let derivations1 = derivations1.all_derivations_except_prefix_db(); + // TODO: eventually, we want to get rid of the uses from `orginal` let original_word_2 = derivations2.original.clone(); let mut cost_proximity_word_pairs = BTreeMap::>>::new(); diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index bc0523b31..9fa840ad5 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -1,5 +1,3 @@ -use std::time::Instant; - use heed::RoTxn; use roaring::RoaringBitmap; @@ -8,11 +6,11 @@ use super::logger::SearchLogger; use super::QueryGraph; use crate::new::graph_based_ranking_rule::GraphBasedRankingRule; -use crate::new::ranking_rule_graph::proximity::ProximityGraph; -use crate::new::ranking_rule_graph::typo::TypoGraph; +use crate::new::ranking_rule_graph::ProximityGraph; +use crate::new::ranking_rule_graph::TypoGraph; use crate::new::words::Words; // use crate::search::new::sort::Sort; -use crate::{Filter, Index, Result, TermsMatchingStrategy}; +use crate::{Index, Result, TermsMatchingStrategy}; pub trait RankingRuleOutputIter<'transaction, Query> { fn next_bucket(&mut self) -> Result>>; @@ -100,18 +98,18 @@ pub struct RankingRuleOutput { // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily #[allow(clippy::too_many_arguments)] -pub fn execute_search<'transaction>( +pub fn apply_ranking_rules<'transaction>( index: &Index, txn: &'transaction heed::RoTxn, // TODO: ranking rules parameter db_cache: &mut DatabaseCache<'transaction>, query_graph: &QueryGraph, - filters: Option, + universe: &RoaringBitmap, from: usize, length: usize, logger: &mut dyn SearchLogger, ) -> Result> { - logger.initial_query(query_graph, Instant::now()); + logger.initial_query(query_graph); let words = &mut Words::new(TermsMatchingStrategy::Last); // let sort = &mut 
Sort::new(index, txn, "release_date".to_owned(), true)?; let proximity = &mut GraphBasedRankingRule::::new("proximity".to_owned()); @@ -122,25 +120,13 @@ pub fn execute_search<'transaction>( logger.ranking_rules(&ranking_rules); - let universe = if let Some(filters) = filters { - filters.evaluate(txn, index)? - } else { - index.documents_ids(txn)? - }; - if universe.len() < from as u64 { return Ok(vec![]); } let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule( - 0, - ranking_rules[0], - query_graph, - &universe, - Instant::now(), - ); - ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?; + logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); + ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?; let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len]; candidates[0] = universe.clone(); @@ -154,7 +140,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates[cur_ranking_rule_index], - Instant::now(), ); candidates[cur_ranking_rule_index].clear(); ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger); @@ -183,7 +168,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates, - Instant::now(), ); } else { let all_candidates = candidates.iter().collect::>(); @@ -193,7 +177,6 @@ pub fn execute_search<'transaction>( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &skipped_candidates.into_iter().collect(), - Instant::now(), ); let candidates = candidates .iter() @@ -234,7 +217,6 @@ pub fn execute_search<'transaction>( ranking_rules[cur_ranking_rule_index], &candidates[cur_ranking_rule_index], &next_bucket.candidates, - Instant::now(), ); assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); @@ -255,7 +237,6 @@ pub fn execute_search<'transaction>( ranking_rules[cur_ranking_rule_index], &next_bucket.query, &candidates[cur_ranking_rule_index], - Instant::now(), ); ranking_rules[cur_ranking_rule_index].start_iteration( index, @@ -272,11 +253,11 @@ pub fn execute_search<'transaction>( #[cfg(test)] mod tests { - use super::execute_search; // use crate::allocator::ALLOC; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::new::db_cache::DatabaseCache; + use crate::new::execute_search; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; @@ -284,8 +265,7 @@ mod tests { use std::io::{BufRead, BufReader, Cursor, Seek}; use std::time::Instant; // use crate::new::logger::detailed::DetailedSearchLogger; - use crate::new::logger::{DefaultSearchLogger, SearchLogger}; - use crate::new::make_query_graph; + use crate::new::logger::DefaultSearchLogger; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy}; @@ -321,17 +301,20 @@ mod tests { ])) .unwrap(); let txn = index.read_txn().unwrap(); - let mut logger = DefaultSearchLogger; let mut db_cache = DatabaseCache::default(); - let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") - .unwrap(); - logger.initial_query(&query_graph, Instant::now()); + let results = execute_search( + &index, + &txn, + &mut db_cache, + "releases from poison by the government", + None, + 0, + 50, + &mut DefaultSearchLogger, + ) + .unwrap(); - 
let results = - execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger) - .unwrap(); println!("{results:?}") } @@ -352,21 +335,13 @@ mod tests { let mut db_cache = DatabaseCache::default(); - let query_graph = make_query_graph( - &index, - &txn, - &mut db_cache, - "which a the releases from poison by the government", - ) - .unwrap(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, &txn, &mut db_cache, - &query_graph, + "which a the releases from poison by the government", None, 0, 20, @@ -453,17 +428,13 @@ mod tests { let mut db_cache = DatabaseCache::default(); - let query_graph = - make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government") - .unwrap(); - let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let results = execute_search( &index, &txn, &mut db_cache, - &query_graph, + "releases from poison by the government", None, 0, 20, diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index da4599ec5..10c0800ba 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -43,12 +43,9 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { _parent_candidates: &RoaringBitmap, parent_query_graph: &QueryGraph, ) -> Result<()> { - // println!("Words: start iteration"); self.exhausted = false; self.query_graph = Some(parent_query_graph.clone()); - // TODO: a phrase can contain many positions, but represents a single node. - // That's a problem. let positions_to_remove = match self.terms_matching_strategy { TermsMatchingStrategy::Last => { let mut all_positions = BTreeSet::new(); @@ -60,11 +57,13 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} } } - all_positions.into_iter().collect() + let mut r: Vec = all_positions.into_iter().collect(); + // don't remove the first term + r.remove(0); + r } TermsMatchingStrategy::All => vec![], }; - // println!("positions to remove: {positions_to_remove:?}"); self.positions_to_remove = positions_to_remove; self.iterating = true; Ok(()) @@ -78,7 +77,6 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { logger: &mut dyn SearchLogger, universe: &RoaringBitmap, ) -> Result>> { - // println!("Words: next bucket"); assert!(self.iterating); assert!(universe.len() > 1); @@ -122,9 +120,9 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { _db_cache: &mut DatabaseCache<'transaction>, _logger: &mut dyn SearchLogger, ) { - // println!("Words: end iteration"); self.iterating = false; self.exhausted = true; self.positions_to_remove = vec![]; + self.query_graph = None; } }
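Notes for reviewers. The sketches below are editorial illustrations of the new code paths, not part of the patch.

Caller-facing change: `make_query_graph` is gone, and `execute_search` in milli/src/search/new/mod.rs now takes the raw query string, an optional `Filter`, and the pagination window directly, building the query graph and the initial universe internally. A minimal call shape, using the crate-internal paths as they appear in the patch; the generics of the return type are stripped in the diff, so a `Vec<u32>` of document ids is assumed here:

    use crate::new::db_cache::DatabaseCache;
    use crate::new::execute_search;
    use crate::new::logger::DefaultSearchLogger;
    use crate::{Index, Result};

    // Sketch: top 20 unfiltered results for a query. `execute_search` asserts
    // the query is non-empty, so callers should guard against "" themselves.
    fn top_twenty(index: &Index, txn: &heed::RoTxn) -> Result<Vec<u32>> {
        let mut db_cache = DatabaseCache::default();
        execute_search(index, txn, &mut db_cache, "releases from poison", None, 0, 20, &mut DefaultSearchLogger)
    }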
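On the logger API: every `SearchLogger` callback loses its `time: Instant` parameter, and `DetailedSearchLogger` now stamps events itself with `Instant::now()` as it records them (it also gains the `query_for_universe` hook used by the new universe computation). The pattern, reduced to essentials:

    use std::time::Instant;

    // Sketch: the logger owns timestamping, so call sites carry no timing noise.
    trait Logger {
        fn next_bucket(&mut self, ranking_rule_idx: usize);
    }

    struct Detailed {
        events: Vec<(usize, Instant)>,
    }

    impl Logger for Detailed {
        fn next_bucket(&mut self, ranking_rule_idx: usize) {
            // The timestamp is captured at the recording site, not passed in.
            self.events.push((ranking_rule_idx, Instant::now()));
        }
    }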
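The renamed `apply_ranking_rules` (formerly `execute_search` in ranking_rules.rs) keeps the same bucket-sort control flow, which is easier to see with logging, pagination, and the skip-bucket fast path stripped out. A condensed sketch over a simplified trait; the real one threads index, txn, db_cache, and logger, and `next_bucket` returns a `RankingRuleOutput` carrying a refined query as well as candidates:

    use roaring::RoaringBitmap;

    trait Rule {
        fn start_iteration(&mut self, universe: &RoaringBitmap);
        /// The next bucket of candidates, or None when this rule is exhausted.
        fn next_bucket(&mut self, universe: &RoaringBitmap) -> Option<RoaringBitmap>;
        fn end_iteration(&mut self);
    }

    fn bucket_sort(rules: &mut [Box<dyn Rule>], universe: &RoaringBitmap, length: usize) -> Vec<u32> {
        let mut results = Vec::new();
        let mut cur = 0;
        // candidates[i] is the set that rule i is currently refining.
        let mut candidates = vec![RoaringBitmap::default(); rules.len()];
        candidates[0] = universe.clone();
        rules[0].start_iteration(universe);
        while results.len() < length {
            let Some(bucket) = rules[cur].next_bucket(&candidates[cur]) else {
                // Exhausted: close this rule and back off to its parent.
                rules[cur].end_iteration();
                if cur == 0 {
                    break;
                }
                cur -= 1;
                continue;
            };
            if cur == rules.len() - 1 {
                // The last rule's buckets go straight into the results.
                results.extend(bucket.iter());
            } else {
                // Descend: the child rule refines this bucket further.
                cur += 1;
                candidates[cur] = bucket.clone();
                rules[cur].start_iteration(&bucket);
            }
        }
        results
    }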
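Finally, query_graph.rs keeps its `_assert_sizes` guard and gains a TODO noting that `QueryNode` has grown to an unreasonable 184 bytes. The idiom pins a type's size at compile time; a sketch with a hypothetical type standing in for `QueryNode`:

    // Fails to compile if the size drifts: the annotated array type on the left
    // must match `size_of::<PinnedNode>()` used as the array length on the right.
    struct PinnedNode {
        data: [u64; 23], // 23 * 8 = 184 bytes, mirroring QueryNode's current size
    }

    fn _assert_sizes() {
        let _: [u8; 184] = [0; std::mem::size_of::<PinnedNode>()];
    }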