Fix: computation of initial universe, code organisation

2025-07-04 12:27:13 +02:00 · 2023-03-06 08:35:01 +01:00 · 2023-03-06 08:35:01 +01:00 · cab2b6bcda
commit cab2b6bcda
parent c4979a2fda
11 changed files with 341 additions and 275 deletions
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@ -3,8 +3,8 @@ use roaring::RoaringBitmap;

 use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
-use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache;
-use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
+use super::ranking_rule_graph::EdgeDocidsCache;
+use super::ranking_rule_graph::EmptyPathsCache;

 use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait};
 use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput};
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@ -5,13 +5,13 @@ use std::fs::File;
 use std::time::Instant;
 use std::{io::Write, path::PathBuf};

-use crate::new::ranking_rule_graph::typo::TypoGraph;
+use crate::new::ranking_rule_graph::TypoGraph;
 use crate::new::{QueryNode, QueryGraph};
 use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
-use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
+use crate::new::ranking_rule_graph::EmptyPathsCache;
 use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait};
 use crate::new::ranking_rule_graph::{
-    proximity::ProximityGraph, RankingRuleGraph,
+    ProximityGraph, RankingRuleGraph,
 };

 use super::{RankingRule, SearchLogger};
@ -21,18 +21,18 @@ pub enum SearchEvents {
        ranking_rule_idx: usize,
        query: QueryGraph,
        universe: RoaringBitmap,
-        time: Instant,
+        time: Instant
    },
    RankingRuleNextBucket {
        ranking_rule_idx: usize,
        universe: RoaringBitmap,
        candidates: RoaringBitmap,
-        time: Instant,
+        time: Instant
    },
    RankingRuleEndIteration {
        ranking_rule_idx: usize,
        universe: RoaringBitmap,
-        time: Instant,
+        time: Instant
    },
    ExtendResults {
        new: Vec<u32>,
@ -56,13 +56,14 @@ pub enum SearchEvents {
        distances: Vec<Vec<u64>>,
        cost: u64,
    },
-    RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant, },
+    RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant },
 }

 pub struct DetailedSearchLogger {
    folder_path: PathBuf,
    initial_query: Option<QueryGraph>,
    initial_query_time: Option<Instant>,
+    query_for_universe: Option<QueryGraph>,
    initial_universe: Option<RoaringBitmap>,
    ranking_rules_ids: Option<Vec<String>>,
    events: Vec<SearchEvents>,
@ -73,6 +74,7 @@ impl DetailedSearchLogger {
            folder_path: PathBuf::new().join(folder_path),
            initial_query: None,
            initial_query_time: None,
+            query_for_universe: None,
            initial_universe: None,
            ranking_rules_ids: None,
            events: vec![],
@ -81,9 +83,13 @@ impl DetailedSearchLogger {
 }

 impl SearchLogger<QueryGraph> for DetailedSearchLogger {
-    fn initial_query(&mut self, query: &QueryGraph, time: Instant) {
+    fn initial_query(&mut self, query: &QueryGraph) {
        self.initial_query = Some(query.clone());
-        self.initial_query_time = Some(time);
+        self.initial_query_time = Some(Instant::now());
+    }
+
+    fn query_for_universe(&mut self, query: &QueryGraph) {
+        self.query_for_universe = Some(query.clone());
    }

    fn initial_universe(&mut self, universe: &RoaringBitmap) {
@ -99,13 +105,13 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
        _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
        query: &QueryGraph,
        universe: &RoaringBitmap,
-        time: Instant,
+        
    ) {
        self.events.push(SearchEvents::RankingRuleStartIteration {
            ranking_rule_idx,
            query: query.clone(),
            universe: universe.clone(),
-            time,
+            time: Instant::now(),
        })
    }

@ -115,13 +121,13 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
        _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
        universe: &RoaringBitmap,
        candidates: &RoaringBitmap,
-        time: Instant,
+        
    ) {
        self.events.push(SearchEvents::RankingRuleNextBucket {
            ranking_rule_idx,
            universe: universe.clone(),
            candidates: candidates.clone(),
-            time,
+            time: Instant::now(),
        })
    }
    fn skip_bucket_ranking_rule<'transaction>(
@ -129,12 +135,12 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
        ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
        candidates: &RoaringBitmap,
-        time: Instant,
+        
    ) {
        self.events.push(SearchEvents::RankingRuleSkipBucket {
            ranking_rule_idx,
            candidates: candidates.clone(),
-            time
+            time: Instant::now()
        })
    }

@ -143,12 +149,12 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
        ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'transaction, QueryGraph>,
        universe: &RoaringBitmap,
-        time: Instant,
+        
    ) {
        self.events.push(SearchEvents::RankingRuleEndIteration {
            ranking_rule_idx,
            universe: universe.clone(),
-            time
+            time: Instant::now()
        })
    }
    fn add_to_results(&mut self, docids: &[u32]) {
@ -184,6 +190,20 @@ impl DetailedSearchLogger {

        let index_path = self.folder_path.join("index.d2");
        let mut file = std::fs::File::create(index_path).unwrap();
+        writeln!(&mut file, "direction: right").unwrap();
+        writeln!(&mut file, "Initial Query Graph: {{").unwrap();
+        let initial_query_graph = self.initial_query.as_ref().unwrap();
+        Self::query_graph_d2_description(initial_query_graph, &mut file);
+        writeln!(&mut file, "}}").unwrap();
+
+        writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap();
+        let query_graph_for_universe = self.query_for_universe.as_ref().unwrap();
+        Self::query_graph_d2_description(query_graph_for_universe, &mut file);
+        writeln!(&mut file, "}}").unwrap();
+
+        let initial_universe = self.initial_universe.as_ref().unwrap();
+        writeln!(&mut file, "Initial Universe Length {}", initial_universe.len()).unwrap();
+
        writeln!(&mut file, "Control Flow Between Ranking Rules: {{").unwrap();
        writeln!(&mut file, "shape: sequence_diagram").unwrap();
        for (idx, rr_id) in self.ranking_rules_ids.as_ref().unwrap().iter().enumerate() {
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@ -2,19 +2,17 @@
 pub mod detailed;

 use roaring::RoaringBitmap;
-use std::time::Instant;

 use super::{
-    ranking_rule_graph::{
-        empty_paths_cache::EmptyPathsCache, proximity::ProximityGraph, typo::TypoGraph,
-        RankingRuleGraph,
-    },
+    ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph},
    RankingRule, RankingRuleQueryTrait,
 };

 pub struct DefaultSearchLogger;
 impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
-    fn initial_query(&mut self, _query: &Q, _time: Instant) {}
+    fn initial_query(&mut self, _query: &Q) {}
+
+    fn query_for_universe(&mut self, _query: &Q) {}

    fn initial_universe(&mut self, _universe: &RoaringBitmap) {}

@ -26,7 +24,6 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
        _ranking_rule: &dyn RankingRule<'transaction, Q>,
        _query: &Q,
        _universe: &RoaringBitmap,
-        _time: Instant,
    ) {
    }

@ -36,7 +33,6 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
        _ranking_rule: &dyn RankingRule<'transaction, Q>,
        _universe: &RoaringBitmap,
        _candidates: &RoaringBitmap,
-        _time: Instant,
    ) {
    }
    fn skip_bucket_ranking_rule<'transaction>(
@ -44,7 +40,6 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
        _ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'transaction, Q>,
        _candidates: &RoaringBitmap,
-        _time: Instant,
    ) {
    }

@ -53,7 +48,6 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
        _ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'transaction, Q>,
        _universe: &RoaringBitmap,
-        _time: Instant,
    ) {
    }

@ -85,7 +79,10 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
 }

 pub trait SearchLogger<Q: RankingRuleQueryTrait> {
-    fn initial_query(&mut self, query: &Q, time: Instant);
+    fn initial_query(&mut self, query: &Q);
+
+    fn query_for_universe(&mut self, query: &Q);
+
    fn initial_universe(&mut self, universe: &RoaringBitmap);

    fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
@ -96,7 +93,6 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
        ranking_rule: &dyn RankingRule<'transaction, Q>,
        query: &Q,
        universe: &RoaringBitmap,
-        time: Instant,
    );
    fn next_bucket_ranking_rule<'transaction>(
        &mut self,
@ -104,21 +100,18 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
        ranking_rule: &dyn RankingRule<'transaction, Q>,
        universe: &RoaringBitmap,
        candidates: &RoaringBitmap,
-        time: Instant,
    );
    fn skip_bucket_ranking_rule<'transaction>(
        &mut self,
        ranking_rule_idx: usize,
        ranking_rule: &dyn RankingRule<'transaction, Q>,
        candidates: &RoaringBitmap,
-        time: Instant,
    );
    fn end_iteration_ranking_rule<'transaction>(
        &mut self,
        ranking_rule_idx: usize,
        ranking_rule: &dyn RankingRule<'transaction, Q>,
        universe: &RoaringBitmap,
-        time: Instant,
    );
    fn add_to_results(&mut self, docids: &[u32]);

--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -9,55 +9,113 @@ mod resolve_query_graph;
 mod sort;
 mod words;

-use charabia::Tokenize;
-use heed::RoTxn;
+use std::collections::BTreeSet;

-use query_graph::{QueryGraph, QueryNode};
 pub use ranking_rules::{
-    execute_search, RankingRule, RankingRuleOutput, RankingRuleOutputIter,
+    apply_ranking_rules, RankingRule, RankingRuleOutput, RankingRuleOutputIter,
    RankingRuleOutputIterWrapper, RankingRuleQueryTrait,
 };
+
+use crate::{
+    new::query_term::located_query_terms_from_string, Filter, Index, Result, TermsMatchingStrategy,
+};
+use charabia::Tokenize;
+use db_cache::DatabaseCache;
+use heed::RoTxn;
+use query_graph::{QueryGraph, QueryNode};
 use roaring::RoaringBitmap;

-use self::db_cache::DatabaseCache;
-use self::query_term::{word_derivations, LocatedQueryTerm};
-use crate::{Index, Result};
+use self::{
+    logger::SearchLogger,
+    resolve_query_graph::{resolve_query_graph, NodeDocIdsCache},
+};

 pub enum BitmapOrAllRef<'s> {
    Bitmap(&'s RoaringBitmap),
    All,
 }

-pub fn make_query_graph<'transaction>(
+/// Resolves the docids matching a reduced copy of `query_graph`, restricted to `universe`.
+///
+/// With `TermsMatchingStrategy::Last`, every term position found in the graph
+/// except the smallest one is removed from a clone of the graph, going from the
+/// last position to the first. With `TermsMatchingStrategy::All`, no position is
+/// removed. The reduced graph is reported through `logger.query_for_universe`
+/// and then resolved with `resolve_query_graph`.
+///
+/// The original `query_graph` is never modified.
+// The argument list mirrors what `resolve_query_graph` needs, plus the strategy and logger.
+#[allow(clippy::too_many_arguments)]
+pub fn resolve_maximally_reduced_query_graph<'transaction>(
+    index: &Index,
+    txn: &'transaction heed::RoTxn,
+    db_cache: &mut DatabaseCache<'transaction>,
+    universe: &RoaringBitmap,
+    query_graph: &QueryGraph,
+    node_docids_cache: &mut NodeDocIdsCache,
+    matching_strategy: TermsMatchingStrategy,
+    logger: &mut dyn SearchLogger<QueryGraph>,
+) -> Result<RoaringBitmap> {
+    let mut graph = query_graph.clone();
+    let mut positions_to_remove = match matching_strategy {
+        TermsMatchingStrategy::Last => {
+            // A `BTreeSet` deduplicates the positions and yields them in ascending order.
+            let mut all_positions = BTreeSet::new();
+            for n in query_graph.nodes.iter() {
+                match n {
+                    QueryNode::Term(term) => {
+                        all_positions.extend(term.positions.clone());
+                    }
+                    QueryNode::Deleted | QueryNode::Start | QueryNode::End => {}
+                }
+            }
+            all_positions.into_iter().collect()
+        }
+        TermsMatchingStrategy::All => vec![],
+    };
+    // Don't remove the first term. The list is empty for the `All` strategy or
+    // for a graph without term nodes, and `Vec::remove(0)` would panic then.
+    if !positions_to_remove.is_empty() {
+        positions_to_remove.remove(0);
+    }
+    // Remove the remaining positions, from the last one to the first one.
+    while let Some(position_to_remove) = positions_to_remove.pop() {
+        let _ = graph.remove_words_at_position(position_to_remove);
+    }
+    logger.query_for_universe(&graph);
+    let docids = resolve_query_graph(index, txn, db_cache, node_docids_cache, &graph, universe)?;
+
+    Ok(docids)
+}
+
+#[allow(clippy::too_many_arguments)]
+pub fn execute_search<'transaction>(
+    index: &Index,
+    txn: &'transaction RoTxn,
    db_cache: &mut DatabaseCache<'transaction>,
    query: &str,
-) -> Result<QueryGraph> {
+    filters: Option<Filter>,
+    from: usize,
+    length: usize,
+    logger: &mut dyn SearchLogger<QueryGraph>,
+) -> Result<Vec<u32>> {
    assert!(!query.is_empty());
-    let authorize_typos = index.authorize_typos(txn)?;
-    let min_len_one_typo = index.min_word_len_one_typo(txn)?;
-    let min_len_two_typos = index.min_word_len_two_typos(txn)?;
+    let query_terms = located_query_terms_from_string(index, txn, query.tokenize(), None).unwrap();
+    let graph = QueryGraph::from_query(index, txn, db_cache, query_terms)?;

-    let exact_words = index.exact_words(txn)?;
-    let fst = index.words_fst(txn)?;
+    logger.initial_query(&graph);

-    // TODO: get rid of this closure
-    // also, ngrams can have one typo?
-    let query = LocatedQueryTerm::from_query(query.tokenize(), None, move |word, is_prefix| {
-        let typos = if !authorize_typos
-            || word.len() < min_len_one_typo as usize
-            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
-        {
-            0
-        } else if word.len() < min_len_two_typos as usize {
-            1
-        } else {
-            2
-        };
-        word_derivations(index, txn, word, typos, is_prefix, &fst)
-    })
-    .unwrap();
-    let graph = QueryGraph::from_query(index, txn, db_cache, query)?;
-    Ok(graph)
+    let universe = if let Some(filters) = filters {
+        filters.evaluate(txn, index)?
+    } else {
+        index.documents_ids(txn)?
+    };
+
+    let mut node_docids_cache = NodeDocIdsCache::default();
+
+    let universe = resolve_maximally_reduced_query_graph(
+        index,
+        txn,
+        db_cache,
+        &universe,
+        &graph,
+        &mut node_docids_cache,
+        TermsMatchingStrategy::Last,
+        logger,
+    )?;
+    // TODO: create ranking rules here, reuse the node docids cache for the words ranking rule
+
+    logger.initial_universe(&universe);
+
+    apply_ranking_rules(index, txn, db_cache, &graph, &universe, from, length, logger)
 }
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@ -4,7 +4,7 @@ use heed::RoTxn;
 use roaring::RoaringBitmap;

 use super::db_cache::DatabaseCache;
-use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
+use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations};
 use crate::{Index, Result};

 #[derive(Debug, Clone)]
@ -31,6 +31,7 @@ pub struct QueryGraph {
 }

 fn _assert_sizes() {
+    // TODO: QueryNodes are too big now, 184B is an unreasonable size
    let _: [u8; 184] = [0; std::mem::size_of::<QueryNode>()];
    let _: [u8; 48] = [0; std::mem::size_of::<Edges>()];
 }
@ -75,7 +76,7 @@ impl QueryGraph {
        index: &Index,
        txn: &RoTxn,
        _db_cache: &mut DatabaseCache<'transaction>,
-        query: Vec<LocatedQueryTerm>,
+        terms: Vec<LocatedQueryTerm>,
    ) -> Result<QueryGraph> {
        // TODO: maybe empty nodes should not be removed here, to compute
        // the score of the `words` ranking rule correctly
@ -90,8 +91,8 @@ impl QueryGraph {
            (vec![], vec![], vec![graph.root_node]);

        // TODO: split words / synonyms
-        for length in 1..=query.len() {
-            let query = &query[..length];
+        for length in 1..=terms.len() {
+            let query = &terms[..length];

            let term0 = query.last().unwrap();

@ -104,7 +105,7 @@ impl QueryGraph {

            if !prev1.is_empty() {
                if let Some((ngram2_str, ngram2_pos)) =
-                    LocatedQueryTerm::ngram2(&query[length - 2], &query[length - 1])
+                    query_term::ngram2(&query[length - 2], &query[length - 1])
                {
                    if word_set.contains(ngram2_str.as_bytes()) {
                        let ngram2 = LocatedQueryTerm {
@ -128,11 +129,9 @@ impl QueryGraph {
                }
            }
            if !prev2.is_empty() {
-                if let Some((ngram3_str, ngram3_pos)) = LocatedQueryTerm::ngram3(
-                    &query[length - 3],
-                    &query[length - 2],
-                    &query[length - 1],
-                ) {
+                if let Some((ngram3_str, ngram3_pos)) =
+                    query_term::ngram3(&query[length - 3], &query[length - 2], &query[length - 1])
+                {
                    if word_set.contains(ngram3_str.as_bytes()) {
                        let ngram3 = LocatedQueryTerm {
                            value: QueryTerm::Word {
@ -143,8 +142,9 @@ impl QueryGraph {
                                    one_typo: vec![],
                                    two_typos: vec![],
                                    use_prefix_db: false,
-                                    synonyms: vec![],  // TODO: ngram synonyms
+                                    synonyms: vec![], // TODO: ngram synonyms
                                    split_words: None, // TODO: maybe ngram split words?
+                                                      // would be nice for typos like su nflower
                                },
                            },
                            positions: ngram3_pos,
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@ -178,9 +178,15 @@ fn split_best_frequency(

 #[derive(Debug, Clone)]
 pub enum QueryTerm {
+    // TODO: should there be SplitWord, NGram2, and NGram3 variants?
+    // NGram2 can have 1 typo and synonyms
+    // NGram3 cannot have typos but can have synonyms
+    // SplitWords are a phrase
+    // Can NGrams be prefixes?
    Phrase { phrase: Phrase },
    Word { derivations: WordDerivations },
 }
+
 impl QueryTerm {
    pub fn original_single_word(&self) -> Option<&str> {
        match self {
@ -209,53 +215,77 @@ impl LocatedQueryTerm {
            QueryTerm::Word { derivations, .. } => derivations.is_empty(),
        }
    }
-    /// Create primitive query from tokenized query string,
-    /// the primitive query is an intermediate state to build the query tree.
-    pub fn from_query(
-        query: NormalizedTokenIter<Vec<u8>>,
-        words_limit: Option<usize>,
-        // TODO:` use index + txn + ? instead of closure
-        derivations: impl Fn(&str, bool) -> Result<WordDerivations>,
-    ) -> Result<Vec<LocatedQueryTerm>> {
-        let mut primitive_query = Vec::new();
-        let mut phrase = Vec::new();
+}

-        let mut quoted = false;
+pub fn located_query_terms_from_string<'transaction>(
+    index: &Index,
+    txn: &'transaction RoTxn,
+    query: NormalizedTokenIter<Vec<u8>>,
+    words_limit: Option<usize>,
+) -> Result<Vec<LocatedQueryTerm>> {
+    let authorize_typos = index.authorize_typos(txn)?;
+    let min_len_one_typo = index.min_word_len_one_typo(txn)?;
+    let min_len_two_typos = index.min_word_len_two_typos(txn)?;

-        let parts_limit = words_limit.unwrap_or(usize::MAX);
+    let exact_words = index.exact_words(txn)?;
+    let fst = index.words_fst(txn)?;

-        let mut position = -1i8;
-        let mut phrase_start = -1i8;
-        let mut phrase_end = -1i8;
+    let nbr_typos = |word: &str| {
+        if !authorize_typos
+            || word.len() < min_len_one_typo as usize
+            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
+        {
+            0
+        } else if word.len() < min_len_two_typos as usize {
+            1
+        } else {
+            2
+        }
+    };

-        let mut peekable = query.peekable();
-        while let Some(token) = peekable.next() {
-            // early return if word limit is exceeded
-            if primitive_query.len() >= parts_limit {
-                return Ok(primitive_query);
-            }
+    let derivations = |word: &str, is_prefix: bool| {
+        word_derivations(index, txn, word, nbr_typos(word), is_prefix, &fst)
+    };

-            match token.kind {
-                TokenKind::Word | TokenKind::StopWord => {
-                    position += 1;
-                    // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
-                    // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
-                    // 3. if the word is the last token of the query we push it as a prefix word.
-                    if quoted {
-                        phrase_end = position;
-                        if phrase.is_empty() {
-                            phrase_start = position;
-                        }
-                        if let TokenKind::StopWord = token.kind {
-                            phrase.push(None);
-                        } else {
-                            // TODO: in a phrase, check that every word exists
-                            // otherwise return WordDerivations::Empty
-                            phrase.push(Some(token.lemma().to_string()));
-                        }
-                    } else if peekable.peek().is_some() {
-                        if let TokenKind::StopWord = token.kind {
-                        } else {
+    let mut primitive_query = Vec::new();
+    let mut phrase = Vec::new();
+
+    let mut quoted = false;
+
+    let parts_limit = words_limit.unwrap_or(usize::MAX);
+
+    let mut position = -1i8;
+    let mut phrase_start = -1i8;
+    let mut phrase_end = -1i8;
+
+    let mut peekable = query.peekable();
+    while let Some(token) = peekable.next() {
+        // early return if word limit is exceeded
+        if primitive_query.len() >= parts_limit {
+            return Ok(primitive_query);
+        }
+
+        match token.kind {
+            TokenKind::Word | TokenKind::StopWord => {
+                position += 1;
+                // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
+                // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
+                // 3. if the word is the last token of the query we push it as a prefix word.
+                if quoted {
+                    phrase_end = position;
+                    if phrase.is_empty() {
+                        phrase_start = position;
+                    }
+                    if let TokenKind::StopWord = token.kind {
+                        phrase.push(None);
+                    } else {
+                        // TODO: in a phrase, check that every word exists
+                        // otherwise return WordDerivations::Empty
+                        phrase.push(Some(token.lemma().to_string()));
+                    }
+                } else if peekable.peek().is_some() {
+                    match token.kind {
+                        TokenKind::Word => {
                            let derivations = derivations(token.lemma(), false)?;
                            let located_term = LocatedQueryTerm {
                                value: QueryTerm::Word { derivations },
@ -263,100 +293,91 @@ impl LocatedQueryTerm {
                            };
                            primitive_query.push(located_term);
                        }
-                    } else {
-                        let derivations = derivations(token.lemma(), true)?;
-                        let located_term = LocatedQueryTerm {
-                            value: QueryTerm::Word { derivations },
-                            positions: position..=position,
-                        };
-                        primitive_query.push(located_term);
+                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
+                    }
+                } else {
+                    let derivations = derivations(token.lemma(), true)?;
+                    let located_term = LocatedQueryTerm {
+                        value: QueryTerm::Word { derivations },
+                        positions: position..=position,
+                    };
+                    primitive_query.push(located_term);
+                }
+            }
+            TokenKind::Separator(separator_kind) => {
+                match separator_kind {
+                    SeparatorKind::Hard => {
+                        position += 1;
+                    }
+                    SeparatorKind::Soft => {
+                        position += 0;
                    }
                }
-                TokenKind::Separator(separator_kind) => {
-                    match separator_kind {
-                        SeparatorKind::Hard => {
-                            position += 1;
-                        }
-                        SeparatorKind::Soft => {
-                            position += 0;
-                        }
-                    }
-                    let quote_count = token.lemma().chars().filter(|&s| s == '"').count();
-                    // swap quoted state if we encounter a double quote
-                    if quote_count % 2 != 0 {
-                        quoted = !quoted;
-                    }
-                    // if there is a quote or a hard separator we close the phrase.
-                    if !phrase.is_empty()
-                        && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
-                    {
-                        let located_query_term = LocatedQueryTerm {
-                            value: QueryTerm::Phrase {
-                                phrase: Phrase { words: mem::take(&mut phrase) },
-                            },
-                            positions: phrase_start..=phrase_end,
-                        };
-                        primitive_query.push(located_query_term);
-                    }
+                let quote_count = token.lemma().chars().filter(|&s| s == '"').count();
+                // swap quoted state if we encounter a double quote
+                if quote_count % 2 != 0 {
+                    quoted = !quoted;
+                }
+                // if there is a quote or a hard separator we close the phrase.
+                if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
+                {
+                    let located_query_term = LocatedQueryTerm {
+                        value: QueryTerm::Phrase {
+                            phrase: Phrase { words: mem::take(&mut phrase) },
+                        },
+                        positions: phrase_start..=phrase_end,
+                    };
+                    primitive_query.push(located_query_term);
                }
-                _ => (),
            }
+            _ => (),
        }
-
-        // If a quote is never closed, we consider all of the end of the query as a phrase.
-        if !phrase.is_empty() {
-            let located_query_term = LocatedQueryTerm {
-                value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } },
-                positions: phrase_start..=phrase_end,
-            };
-            primitive_query.push(located_query_term);
-        }
-
-        Ok(primitive_query)
    }
+
+    // If a quote is never closed, we consider all of the end of the query as a phrase.
+    if !phrase.is_empty() {
+        let located_query_term = LocatedQueryTerm {
+            value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } },
+            positions: phrase_start..=phrase_end,
+        };
+        primitive_query.push(located_query_term);
+    }
+
+    Ok(primitive_query)
 }

-impl LocatedQueryTerm {
-    pub fn ngram2(
-        x: &LocatedQueryTerm,
-        y: &LocatedQueryTerm,
-    ) -> Option<(String, RangeInclusive<i8>)> {
-        if *x.positions.end() != y.positions.start() - 1 {
-            println!(
-                "x positions end: {}, y positions start: {}",
-                *x.positions.end(),
-                y.positions.start()
-            );
-            return None;
-        }
-        match (&x.value.original_single_word(), &y.value.original_single_word()) {
-            (Some(w1), Some(w2)) => {
-                let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end());
-                Some(term)
-            }
-            _ => None,
-        }
+// TODO: return a word derivations instead?
+pub fn ngram2(x: &LocatedQueryTerm, y: &LocatedQueryTerm) -> Option<(String, RangeInclusive<i8>)> {
+    if *x.positions.end() != y.positions.start() - 1 {
+        return None;
    }
-    pub fn ngram3(
-        x: &LocatedQueryTerm,
-        y: &LocatedQueryTerm,
-        z: &LocatedQueryTerm,
-    ) -> Option<(String, RangeInclusive<i8>)> {
-        if *x.positions.end() != y.positions.start() - 1
-            || *y.positions.end() != z.positions.start() - 1
-        {
-            return None;
-        }
-        match (
-            &x.value.original_single_word(),
-            &y.value.original_single_word(),
-            &z.value.original_single_word(),
-        ) {
-            (Some(w1), Some(w2), Some(w3)) => {
-                let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end());
-                Some(term)
-            }
-            _ => None,
+    match (&x.value.original_single_word(), &y.value.original_single_word()) {
+        (Some(w1), Some(w2)) => {
+            let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end());
+            Some(term)
        }
+        _ => None,
+    }
+}
+pub fn ngram3(
+    x: &LocatedQueryTerm,
+    y: &LocatedQueryTerm,
+    z: &LocatedQueryTerm,
+) -> Option<(String, RangeInclusive<i8>)> {
+    if *x.positions.end() != y.positions.start() - 1
+        || *y.positions.end() != z.positions.start() - 1
+    {
+        return None;
+    }
+    match (
+        &x.value.original_single_word(),
+        &y.value.original_single_word(),
+        &z.value.original_single_word(),
+    ) {
+        (Some(w1), Some(w2), Some(w3)) => {
+            let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end());
+            Some(term)
+        }
+        _ => None,
    }
 }
--- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs
@ -41,11 +41,12 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
            EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All),
            EdgeDetails::Data(details) => {
                if self.cache.contains_key(&edge_index) {
+                    // TODO: should we update the bitmap in the cache if the new universe
+                    // reduces it?
                    return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
                }
                // TODO: maybe universe doesn't belong here
                let docids = universe & G::compute_docids(index, txn, db_cache, details)?;
-
                let _ = self.cache.insert(edge_index, docids);
                let docids = &self.cache[&edge_index];
                Ok(BitmapOrAllRef::Bitmap(docids))
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@ -1,19 +1,22 @@
-pub mod build;
-pub mod cheapest_paths;
-pub mod edge_docids_cache;
-pub mod empty_paths_cache;
-pub mod paths_map;
-pub mod proximity;
-pub mod resolve_paths;
-pub mod typo;
+mod build;
+mod cheapest_paths;
+mod edge_docids_cache;
+mod empty_paths_cache;
+mod paths_map;
+mod proximity;
+mod resolve_paths;
+mod typo;
+
+pub use edge_docids_cache::EdgeDocidsCache;
+pub use empty_paths_cache::EmptyPathsCache;
+pub use proximity::ProximityGraph;
+pub use typo::TypoGraph;

 use std::ops::ControlFlow;

 use heed::RoTxn;
 use roaring::RoaringBitmap;

-use self::empty_paths_cache::EmptyPathsCache;
-
 use super::db_cache::DatabaseCache;
 use super::logger::SearchLogger;
 use super::{QueryGraph, QueryNode};
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@ -105,6 +105,7 @@ pub fn visit_to_node<'transaction, 'from_data>(
    assert!(!updb1);

    let derivations1 = derivations1.all_derivations_except_prefix_db();
+    // TODO: eventually, we want to get rid of the uses of `original`
    let original_word_2 = derivations2.original.clone();
    let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();

--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@ -1,5 +1,3 @@
-use std::time::Instant;
-
 use heed::RoTxn;
 use roaring::RoaringBitmap;

@ -8,11 +6,11 @@ use super::logger::SearchLogger;

 use super::QueryGraph;
 use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
-use crate::new::ranking_rule_graph::proximity::ProximityGraph;
-use crate::new::ranking_rule_graph::typo::TypoGraph;
+use crate::new::ranking_rule_graph::ProximityGraph;
+use crate::new::ranking_rule_graph::TypoGraph;
 use crate::new::words::Words;
 // use crate::search::new::sort::Sort;
-use crate::{Filter, Index, Result, TermsMatchingStrategy};
+use crate::{Index, Result, TermsMatchingStrategy};

 pub trait RankingRuleOutputIter<'transaction, Query> {
    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
@ -100,18 +98,18 @@ pub struct RankingRuleOutput<Q> {

 // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily
 #[allow(clippy::too_many_arguments)]
-pub fn execute_search<'transaction>(
+pub fn apply_ranking_rules<'transaction>(
    index: &Index,
    txn: &'transaction heed::RoTxn,
    // TODO: ranking rules parameter
    db_cache: &mut DatabaseCache<'transaction>,
    query_graph: &QueryGraph,
-    filters: Option<Filter>,
+    universe: &RoaringBitmap,
    from: usize,
    length: usize,
    logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<Vec<u32>> {
-    logger.initial_query(query_graph, Instant::now());
+    logger.initial_query(query_graph);
    let words = &mut Words::new(TermsMatchingStrategy::Last);
    // let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
    let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
@ -122,25 +120,13 @@ pub fn execute_search<'transaction>(

    logger.ranking_rules(&ranking_rules);

-    let universe = if let Some(filters) = filters {
-        filters.evaluate(txn, index)?
-    } else {
-        index.documents_ids(txn)?
-    };
-
    if universe.len() < from as u64 {
        return Ok(vec![]);
    }

    let ranking_rules_len = ranking_rules.len();
-    logger.start_iteration_ranking_rule(
-        0,
-        ranking_rules[0],
-        query_graph,
-        &universe,
-        Instant::now(),
-    );
-    ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?;
+    logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
+    ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?;

    let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
    candidates[0] = universe.clone();
@ -154,7 +140,6 @@ pub fn execute_search<'transaction>(
                cur_ranking_rule_index,
                ranking_rules[cur_ranking_rule_index],
                &candidates[cur_ranking_rule_index],
-                Instant::now(),
            );
            candidates[cur_ranking_rule_index].clear();
            ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger);
@ -183,7 +168,6 @@ pub fn execute_search<'transaction>(
                            cur_ranking_rule_index,
                            ranking_rules[cur_ranking_rule_index],
                            &candidates,
-                            Instant::now(),
                        );
                    } else {
                        let all_candidates = candidates.iter().collect::<Vec<_>>();
@ -193,7 +177,6 @@ pub fn execute_search<'transaction>(
                            cur_ranking_rule_index,
                            ranking_rules[cur_ranking_rule_index],
                            &skipped_candidates.into_iter().collect(),
-                            Instant::now(),
                        );
                        let candidates = candidates
                            .iter()
@ -234,7 +217,6 @@ pub fn execute_search<'transaction>(
            ranking_rules[cur_ranking_rule_index],
            &candidates[cur_ranking_rule_index],
            &next_bucket.candidates,
-            Instant::now(),
        );

        assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates));
@ -255,7 +237,6 @@ pub fn execute_search<'transaction>(
            ranking_rules[cur_ranking_rule_index],
            &next_bucket.query,
            &candidates[cur_ranking_rule_index],
-            Instant::now(),
        );
        ranking_rules[cur_ranking_rule_index].start_iteration(
            index,
@ -272,11 +253,11 @@ pub fn execute_search<'transaction>(

 #[cfg(test)]
 mod tests {
-    use super::execute_search;
    // use crate::allocator::ALLOC;
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;
    use crate::new::db_cache::DatabaseCache;
+    use crate::new::execute_search;
    use big_s::S;
    use heed::EnvOpenOptions;
    use maplit::hashset;
@ -284,8 +265,7 @@ mod tests {
    use std::io::{BufRead, BufReader, Cursor, Seek};
    use std::time::Instant;
    // use crate::new::logger::detailed::DetailedSearchLogger;
-    use crate::new::logger::{DefaultSearchLogger, SearchLogger};
-    use crate::new::make_query_graph;
+    use crate::new::logger::DefaultSearchLogger;
    use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
    use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};

@ -321,17 +301,20 @@ mod tests {
            ]))
            .unwrap();
        let txn = index.read_txn().unwrap();
-        let mut logger = DefaultSearchLogger;
        let mut db_cache = DatabaseCache::default();

-        let query_graph =
-            make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
-                .unwrap();
-        logger.initial_query(&query_graph, Instant::now());
+        let results = execute_search(
+            &index,
+            &txn,
+            &mut db_cache,
+            "releases from poison by the government",
+            None,
+            0,
+            50,
+            &mut DefaultSearchLogger,
+        )
+        .unwrap();

-        let results =
-            execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger)
-                .unwrap();
        println!("{results:?}")
    }

@ -352,21 +335,13 @@ mod tests {

        let mut db_cache = DatabaseCache::default();

-        let query_graph = make_query_graph(
-            &index,
-            &txn,
-            &mut db_cache,
-            "which a the releases from poison by the government",
-        )
-        .unwrap();
-
        // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");

        let results = execute_search(
            &index,
            &txn,
            &mut db_cache,
-            &query_graph,
+            "which a the releases from poison by the government",
            None,
            0,
            20,
@ -453,17 +428,13 @@ mod tests {

        let mut db_cache = DatabaseCache::default();

-        let query_graph =
-            make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
-                .unwrap();
-
        let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");

        let results = execute_search(
            &index,
            &txn,
            &mut db_cache,
-            &query_graph,
+            "releases from poison by the government",
            None,
            0,
            20,
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@ -43,12 +43,9 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
        _parent_candidates: &RoaringBitmap,
        parent_query_graph: &QueryGraph,
    ) -> Result<()> {
-        // println!("Words: start iteration");
        self.exhausted = false;
        self.query_graph = Some(parent_query_graph.clone());

-        // TODO: a phrase can contain many positions, but represents a single node.
-        // That's a problem.
        let positions_to_remove = match self.terms_matching_strategy {
            TermsMatchingStrategy::Last => {
                let mut all_positions = BTreeSet::new();
@ -60,11 +57,13 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
                        QueryNode::Deleted | QueryNode::Start | QueryNode::End => {}
                    }
                }
-                all_positions.into_iter().collect()
+                let mut r: Vec<i8> = all_positions.into_iter().collect();
+                // don't remove the first term
+                r.remove(0);
+                r
            }
            TermsMatchingStrategy::All => vec![],
        };
-        // println!("positions to remove: {positions_to_remove:?}");
        self.positions_to_remove = positions_to_remove;
        self.iterating = true;
        Ok(())
@ -78,7 +77,6 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
        logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
-        // println!("Words: next bucket");
        assert!(self.iterating);
        assert!(universe.len() > 1);

@ -122,9 +120,9 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
        _db_cache: &mut DatabaseCache<'transaction>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
-        // println!("Words: end iteration");
        self.iterating = false;
        self.exhausted = true;
        self.positions_to_remove = vec![];
+        self.query_graph = None;
    }
 }