From 50a7393c554333de51494b59834845b0fb51dc3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Fri, 21 Jun 2024 11:43:04 +0200
Subject: [PATCH 1/9] Modify the compute_query_term_subset_docids function to
 accept the universe

---
 milli/src/search/new/query_graph.rs                   |  2 +-
 .../search/new/ranking_rule_graph/exactness/mod.rs    |  2 +-
 .../ranking_rule_graph/proximity/compute_docids.rs    |  4 +---
 milli/src/search/new/ranking_rule_graph/typo/mod.rs   |  3 +--
 milli/src/search/new/ranking_rule_graph/words/mod.rs  |  3 +--
 milli/src/search/new/resolve_query_graph.rs           | 11 ++++++-----
 6 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index 40e324a7e..9ab5d9dad 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -302,7 +302,7 @@ impl QueryGraph {
             for (_, node) in self.nodes.iter() {
                 match &node.data {
                     QueryNodeData::Term(t) => {
-                        let docids = compute_query_term_subset_docids(ctx, &t.term_subset)?;
+                        let docids = compute_query_term_subset_docids(ctx, None, &t.term_subset)?;
                         for id in t.term_ids.clone() {
                             term_docids
                                 .entry(id)
diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
index d3fa040ee..274688877 100644
--- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
@@ -59,7 +59,7 @@ impl RankingRuleGraphTrait for ExactnessGraph {
             }
             ExactnessCondition::Any(dest_node) => {
                 let docids =
-                    universe & compute_query_term_subset_docids(ctx, &dest_node.term_subset)?;
+                    compute_query_term_subset_docids(ctx, Some(universe), &dest_node.term_subset)?;
                 (docids, dest_node.clone())
             }
         };
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
index b348c682d..e73e5e6aa 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -22,10 +22,8 @@ pub fn compute_docids(
             (left_term, right_term, *cost)
         }
         ProximityCondition::Term { term } => {
-            let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
-            docids &= universe;
             return Ok(ComputedCondition {
-                docids,
+                docids: compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?,
                 universe_len: universe.len(),
                 start_term_subset: None,
                 end_term_subset: term.clone(),
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index c1256f6a1..b19163ba2 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -27,8 +27,7 @@ impl RankingRuleGraphTrait for TypoGraph {
     ) -> Result<ComputedCondition> {
         let TypoCondition { term, .. } = condition;
         // maybe compute_query_term_subset_docids should accept a universe as argument
-        let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
-        docids &= universe;
+        let docids = compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?;
 
         Ok(ComputedCondition {
             docids,
diff --git a/milli/src/search/new/ranking_rule_graph/words/mod.rs b/milli/src/search/new/ranking_rule_graph/words/mod.rs
index bb6baf669..19b54e76f 100644
--- a/milli/src/search/new/ranking_rule_graph/words/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/words/mod.rs
@@ -26,8 +26,7 @@ impl RankingRuleGraphTrait for WordsGraph {
     ) -> Result<ComputedCondition> {
         let WordsCondition { term, .. } = condition;
         // maybe compute_query_term_subset_docids should accept a universe as argument
-        let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
-        docids &= universe;
+        let docids = compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?;
 
         Ok(ComputedCondition {
             docids,
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index 8d89e8275..3f24319f1 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -31,6 +31,7 @@ impl<'ctx> SearchContext<'ctx> {
 }
 pub fn compute_query_term_subset_docids(
     ctx: &mut SearchContext<'_>,
+    universe: Option<&RoaringBitmap>,
     term: &QueryTermSubset,
 ) -> Result<RoaringBitmap> {
     let mut docids = RoaringBitmap::new();
@@ -49,7 +50,10 @@ pub fn compute_query_term_subset_docids(
         }
     }
 
-    Ok(docids)
+    match universe {
+        Some(universe) => Ok(docids & universe),
+        None => Ok(docids),
+    }
 }
 
 pub fn compute_query_term_subset_docids_within_field_id(
@@ -147,10 +151,7 @@ pub fn compute_query_graph_docids(
                 term_subset,
                 positions: _,
                 term_ids: _,
-            }) => {
-                let node_docids = compute_query_term_subset_docids(ctx, term_subset)?;
-                predecessors_docids & node_docids
-            }
+            }) => compute_query_term_subset_docids(ctx, Some(&predecessors_docids), term_subset)?,
             QueryNodeData::Deleted => {
                 panic!()
             }

From 0ca1a4e8053f4a7c56894846c6cc19fcba153c8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Fri, 21 Jun 2024 14:26:05 +0200
Subject: [PATCH 2/9] Always do the intersections with the universe

---
 milli/src/search/new/db_cache.rs              | 183 ++++++++++++------
 milli/src/search/new/exact_attribute.rs       |  11 +-
 milli/src/search/new/mod.rs                   |  10 +-
 .../new/query_term/compute_derivations.rs     |   2 +-
 .../new/ranking_rule_graph/exactness/mod.rs   |  13 +-
 .../search/new/ranking_rule_graph/fid/mod.rs  |   8 +-
 .../new/ranking_rule_graph/position/mod.rs    |   1 +
 .../proximity/compute_docids.rs               |  37 ++--
 milli/src/search/new/resolve_query_graph.rs   |  43 ++--
 9 files changed, 201 insertions(+), 107 deletions(-)

diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index 95f8e7557..d33058af1 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -46,36 +46,70 @@ pub struct DatabaseCache<'ctx> {
     pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
 }
 impl<'ctx> DatabaseCache<'ctx> {
-    fn get_value<'v, K1, KC, DC>(
+    fn get_value<'v, K1, KC>(
         txn: &'ctx RoTxn<'_>,
         cache_key: K1,
         db_key: &'v KC::EItem,
         cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
+        universe: Option<&RoaringBitmap>,
         db: Database<KC, Bytes>,
-    ) -> Result<Option<DC::DItem>>
+    ) -> Result<Option<RoaringBitmap>>
     where
         K1: Copy + Eq + Hash,
         KC: BytesEncode<'v>,
-        DC: BytesDecodeOwned,
     {
         if let Entry::Vacant(entry) = cache.entry(cache_key) {
             let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
             entry.insert(bitmap_ptr);
         }
 
-        match cache.get(&cache_key).unwrap() {
-            Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes)
+        let bitmap_bytes = match cache.get(&cache_key).unwrap() {
+            Some(Cow::Borrowed(bytes)) => bytes,
+            Some(Cow::Owned(bytes)) => bytes.as_slice(),
+            None => return Ok(None),
+        };
+
+        match (bitmap_bytes, universe) {
+            (bytes, Some(universe)) => {
+                CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
+                    .map(Some)
+                    .map_err(Into::into)
+            }
+            (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
                 .map(Some)
                 .map_err(heed::Error::Decoding)
                 .map_err(Into::into),
-            Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes)
-                .map(Some)
-                .map_err(heed::Error::Decoding)
-                .map_err(Into::into),
-            None => Ok(None),
         }
     }
 
+    fn get_value_length<'v, K1, KC>(
+        txn: &'ctx RoTxn<'_>,
+        cache_key: K1,
+        db_key: &'v KC::EItem,
+        cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
+        db: Database<KC, Bytes>,
+    ) -> Result<Option<u64>>
+    where
+        K1: Copy + Eq + Hash,
+        KC: BytesEncode<'v>,
+    {
+        if let Entry::Vacant(entry) = cache.entry(cache_key) {
+            let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
+            entry.insert(bitmap_ptr);
+        }
+
+        let bitmap_bytes = match cache.get(&cache_key).unwrap() {
+            Some(Cow::Borrowed(bytes)) => bytes,
+            Some(Cow::Owned(bytes)) => bytes.as_slice(),
+            None => return Ok(None),
+        };
+
+        CboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes)
+            .map(Some)
+            .map_err(heed::Error::Decoding)
+            .map_err(Into::into)
+    }
+
     fn get_value_from_keys<'v, K1, KC, DC>(
         txn: &'ctx RoTxn<'_>,
         cache_key: K1,
@@ -137,11 +171,15 @@ impl<'ctx> SearchContext<'ctx> {
         }
     }
 
-    pub fn word_docids(&mut self, word: Word) -> Result<Option<RoaringBitmap>> {
+    pub fn word_docids(
+        &mut self,
+        universe: Option<&RoaringBitmap>,
+        word: Word,
+    ) -> Result<Option<RoaringBitmap>> {
         match word {
             Word::Original(word) => {
-                let exact = self.get_db_exact_word_docids(word)?;
-                let tolerant = self.get_db_word_docids(word)?;
+                let exact = self.get_db_exact_word_docids(universe, word)?;
+                let tolerant = self.get_db_word_docids(universe, word)?;
                 Ok(match (exact, tolerant) {
                     (None, None) => None,
                     (None, Some(tolerant)) => Some(tolerant),
@@ -153,12 +191,16 @@ impl<'ctx> SearchContext<'ctx> {
                     }
                 })
             }
-            Word::Derived(word) => self.get_db_word_docids(word),
+            Word::Derived(word) => self.get_db_word_docids(universe, word),
         }
     }
 
     /// Retrieve or insert the given value in the `word_docids` database.
-    fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<RoaringBitmap>> {
+    fn get_db_word_docids(
+        &mut self,
+        universe: Option<&RoaringBitmap>,
+        word: Interned<String>,
+    ) -> Result<Option<RoaringBitmap>> {
         match &self.restricted_fids {
             Some(restricted_fids) => {
                 let interned = self.word_interner.get(word).as_str();
@@ -174,11 +216,12 @@ impl<'ctx> SearchContext<'ctx> {
                     merge_cbo_roaring_bitmaps,
                 )
             }
-            None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+            None => DatabaseCache::get_value::<_, _>(
                 self.txn,
                 word,
                 self.word_interner.get(word).as_str(),
                 &mut self.db_cache.word_docids,
+                universe,
                 self.index.word_docids.remap_data_type::<Bytes>(),
             ),
         }
@@ -186,6 +229,7 @@ impl<'ctx> SearchContext<'ctx> {
 
     fn get_db_exact_word_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         word: Interned<String>,
     ) -> Result<Option<RoaringBitmap>> {
         match &self.restricted_fids {
@@ -203,21 +247,26 @@ impl<'ctx> SearchContext<'ctx> {
                     merge_cbo_roaring_bitmaps,
                 )
             }
-            None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+            None => DatabaseCache::get_value::<_, _>(
                 self.txn,
                 word,
                 self.word_interner.get(word).as_str(),
                 &mut self.db_cache.exact_word_docids,
+                universe,
                 self.index.exact_word_docids.remap_data_type::<Bytes>(),
             ),
         }
     }
 
-    pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> {
+    pub fn word_prefix_docids(
+        &mut self,
+        universe: Option<&RoaringBitmap>,
+        prefix: Word,
+    ) -> Result<Option<RoaringBitmap>> {
         match prefix {
             Word::Original(prefix) => {
-                let exact = self.get_db_exact_word_prefix_docids(prefix)?;
-                let tolerant = self.get_db_word_prefix_docids(prefix)?;
+                let exact = self.get_db_exact_word_prefix_docids(universe, prefix)?;
+                let tolerant = self.get_db_word_prefix_docids(universe, prefix)?;
                 Ok(match (exact, tolerant) {
                     (None, None) => None,
                     (None, Some(tolerant)) => Some(tolerant),
@@ -229,13 +278,14 @@ impl<'ctx> SearchContext<'ctx> {
                     }
                 })
             }
-            Word::Derived(prefix) => self.get_db_word_prefix_docids(prefix),
+            Word::Derived(prefix) => self.get_db_word_prefix_docids(universe, prefix),
         }
     }
 
     /// Retrieve or insert the given value in the `word_prefix_docids` database.
     fn get_db_word_prefix_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         prefix: Interned<String>,
     ) -> Result<Option<RoaringBitmap>> {
         match &self.restricted_fids {
@@ -253,11 +303,12 @@ impl<'ctx> SearchContext<'ctx> {
                     merge_cbo_roaring_bitmaps,
                 )
             }
-            None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+            None => DatabaseCache::get_value::<_, _>(
                 self.txn,
                 prefix,
                 self.word_interner.get(prefix).as_str(),
                 &mut self.db_cache.word_prefix_docids,
+                universe,
                 self.index.word_prefix_docids.remap_data_type::<Bytes>(),
             ),
         }
@@ -265,6 +316,7 @@ impl<'ctx> SearchContext<'ctx> {
 
     fn get_db_exact_word_prefix_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         prefix: Interned<String>,
     ) -> Result<Option<RoaringBitmap>> {
         match &self.restricted_fids {
@@ -282,11 +334,12 @@ impl<'ctx> SearchContext<'ctx> {
                     merge_cbo_roaring_bitmaps,
                 )
             }
-            None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+            None => DatabaseCache::get_value::<_, _>(
                 self.txn,
                 prefix,
                 self.word_interner.get(prefix).as_str(),
                 &mut self.db_cache.exact_word_prefix_docids,
+                universe,
                 self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(),
             ),
         }
@@ -294,6 +347,7 @@ impl<'ctx> SearchContext<'ctx> {
 
     pub fn get_db_word_pair_proximity_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         word1: Interned<String>,
         word2: Interned<String>,
         proximity: u8,
@@ -320,8 +374,8 @@ impl<'ctx> SearchContext<'ctx> {
                     for fid in fids {
                         // for each field, intersect left word bitmap and right word bitmap,
                         // then merge the result in a global bitmap before storing it in the cache.
-                        let word1_docids = self.get_db_word_fid_docids(word1, fid)?;
-                        let word2_docids = self.get_db_word_fid_docids(word2, fid)?;
+                        let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
+                        let word2_docids = self.get_db_word_fid_docids(universe, word2, fid)?;
                         if let (Some(word1_docids), Some(word2_docids)) =
                             (word1_docids, word2_docids)
                         {
@@ -341,7 +395,33 @@ impl<'ctx> SearchContext<'ctx> {
 
                 Ok(docids)
             }
-            ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+            ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _>(
+                self.txn,
+                (proximity, word1, word2),
+                &(
+                    proximity,
+                    self.word_interner.get(word1).as_str(),
+                    self.word_interner.get(word2).as_str(),
+                ),
+                &mut self.db_cache.word_pair_proximity_docids,
+                universe,
+                self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
+            ),
+        }
+    }
+
+    pub fn get_db_word_pair_proximity_docids_len(
+        &mut self,
+        universe: Option<&RoaringBitmap>,
+        word1: Interned<String>,
+        word2: Interned<String>,
+        proximity: u8,
+    ) -> Result<Option<u64>> {
+        match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
+            ProximityPrecision::ByAttribute => Ok(self
+                .get_db_word_pair_proximity_docids(universe, word1, word2, proximity)?
+                .map(|d| d.len())),
+            ProximityPrecision::ByWord => DatabaseCache::get_value_length::<_, _>(
                 self.txn,
                 (proximity, word1, word2),
                 &(
@@ -355,34 +435,9 @@ impl<'ctx> SearchContext<'ctx> {
         }
     }
 
-    pub fn get_db_word_pair_proximity_docids_len(
-        &mut self,
-        word1: Interned<String>,
-        word2: Interned<String>,
-        proximity: u8,
-    ) -> Result<Option<u64>> {
-        match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
-            ProximityPrecision::ByAttribute => Ok(self
-                .get_db_word_pair_proximity_docids(word1, word2, proximity)?
-                .map(|d| d.len())),
-            ProximityPrecision::ByWord => {
-                DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
-                    self.txn,
-                    (proximity, word1, word2),
-                    &(
-                        proximity,
-                        self.word_interner.get(word1).as_str(),
-                        self.word_interner.get(word2).as_str(),
-                    ),
-                    &mut self.db_cache.word_pair_proximity_docids,
-                    self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
-                )
-            }
-        }
-    }
-
     pub fn get_db_word_prefix_pair_proximity_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         word1: Interned<String>,
         prefix2: Interned<String>,
         mut proximity: u8,
@@ -409,8 +464,9 @@ impl<'ctx> SearchContext<'ctx> {
                     // for each field, intersect left word bitmap and right word bitmap,
                     // then merge the result in a global bitmap before storing it in the cache.
                     for fid in fids {
-                        let word1_docids = self.get_db_word_fid_docids(word1, fid)?;
-                        let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?;
+                        let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
+                        let prefix2_docids =
+                            self.get_db_word_prefix_fid_docids(universe, prefix2, fid)?;
                         if let (Some(word1_docids), Some(prefix2_docids)) =
                             (word1_docids, prefix2_docids)
                         {
@@ -452,16 +508,18 @@ impl<'ctx> SearchContext<'ctx> {
 
     pub fn get_db_prefix_word_pair_proximity_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         left_prefix: Interned<String>,
         right: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
         // only accept exact matches on reverted positions
-        self.get_db_word_pair_proximity_docids(left_prefix, right, proximity)
+        self.get_db_word_pair_proximity_docids(universe, left_prefix, right, proximity)
     }
 
     pub fn get_db_word_fid_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         word: Interned<String>,
         fid: u16,
     ) -> Result<Option<RoaringBitmap>> {
@@ -470,17 +528,19 @@ impl<'ctx> SearchContext<'ctx> {
             return Ok(None);
         }
 
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+        DatabaseCache::get_value::<_, _>(
             self.txn,
             (word, fid),
             &(self.word_interner.get(word).as_str(), fid),
             &mut self.db_cache.word_fid_docids,
+            universe,
             self.index.word_fid_docids.remap_data_type::<Bytes>(),
         )
     }
 
     pub fn get_db_word_prefix_fid_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         word_prefix: Interned<String>,
         fid: u16,
     ) -> Result<Option<RoaringBitmap>> {
@@ -489,11 +549,12 @@ impl<'ctx> SearchContext<'ctx> {
             return Ok(None);
         }
 
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+        DatabaseCache::get_value::<_, _>(
             self.txn,
             (word_prefix, fid),
             &(self.word_interner.get(word_prefix).as_str(), fid),
             &mut self.db_cache.word_prefix_fid_docids,
+            universe,
             self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
         )
     }
@@ -554,28 +615,32 @@ impl<'ctx> SearchContext<'ctx> {
 
     pub fn get_db_word_position_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         word: Interned<String>,
         position: u16,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+        DatabaseCache::get_value::<_, _>(
             self.txn,
             (word, position),
             &(self.word_interner.get(word).as_str(), position),
             &mut self.db_cache.word_position_docids,
+            universe,
             self.index.word_position_docids.remap_data_type::<Bytes>(),
         )
     }
 
     pub fn get_db_word_prefix_position_docids(
         &mut self,
+        universe: Option<&RoaringBitmap>,
         word_prefix: Interned<String>,
         position: u16,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+        DatabaseCache::get_value::<_, _>(
             self.txn,
             (word_prefix, position),
             &(self.word_interner.get(word_prefix).as_str(), position),
             &mut self.db_cache.word_prefix_position_docids,
+            universe,
             self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
         )
     }
diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
index d270c4847..52efa63a4 100644
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@@ -171,9 +171,10 @@ impl State {
                 // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
                 // longer phrases we'll be losing on precision here.
                 let bucketed_position = crate::bucketed_position(position + offset);
-                let word_position_docids =
-                    ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default()
-                        & universe;
+                let word_position_docids = ctx
+                    .get_db_word_position_docids(Some(universe), *word, bucketed_position)?
+                    .unwrap_or_default()
+                    & universe;
                 candidates &= word_position_docids;
                 if candidates.is_empty() {
                     return Ok(State::Empty(query_graph.clone()));
@@ -199,7 +200,9 @@ impl State {
                     // ignore stop words words in phrases
                     .flatten()
                     .map(|word| -> Result<_> {
-                        Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default())
+                        Ok(ctx
+                            .get_db_word_fid_docids(Some(universe), *word, fid)?
+                            .unwrap_or_default())
                     }),
             )?;
             intersection &= &candidates;
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 208e1b428..73aaa330f 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -232,11 +232,12 @@ fn resolve_universe(
 #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
 fn resolve_negative_words(
     ctx: &mut SearchContext<'_>,
+    universe: Option<&RoaringBitmap>,
     negative_words: &[Word],
 ) -> Result<RoaringBitmap> {
     let mut negative_bitmap = RoaringBitmap::new();
     for &word in negative_words {
-        if let Some(bitmap) = ctx.word_docids(word)? {
+        if let Some(bitmap) = ctx.word_docids(universe, word)? {
             negative_bitmap |= bitmap;
         }
     }
@@ -246,13 +247,14 @@ fn resolve_negative_words(
 #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
 fn resolve_negative_phrases(
     ctx: &mut SearchContext<'_>,
+    universe: Option<&RoaringBitmap>,
     negative_phrases: &[LocatedQueryTerm],
 ) -> Result<RoaringBitmap> {
     let mut negative_bitmap = RoaringBitmap::new();
     for term in negative_phrases {
         let query_term = ctx.term_interner.get(term.value);
         if let Some(phrase) = query_term.original_phrase() {
-            negative_bitmap |= ctx.get_phrase_docids(phrase)?;
+            negative_bitmap |= ctx.get_phrase_docids(universe, phrase)?;
         }
     }
     Ok(negative_bitmap)
@@ -686,8 +688,8 @@ pub fn execute_search(
             located_query_terms_from_tokens(ctx, tokens, words_limit)?;
         used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
 
-        let ignored_documents = resolve_negative_words(ctx, &negative_words)?;
-        let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;
+        let ignored_documents = resolve_negative_words(ctx, Some(&universe), &negative_words)?;
+        let ignored_phrases = resolve_negative_phrases(ctx, Some(&universe), &negative_phrases)?;
 
         universe -= ignored_documents;
         universe -= ignored_phrases;
diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs
index c1783431c..e2a136328 100644
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@@ -417,7 +417,7 @@ fn split_best_frequency(
         let left = ctx.word_interner.insert(left.to_owned());
         let right = ctx.word_interner.insert(right.to_owned());
 
-        if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
+        if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(None, left, right, 1)? {
             if best.map_or(true, |(old, _, _)| frequency > old) {
                 best = Some((frequency, left, right));
             }
diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
index 274688877..ae3a582bb 100644
--- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
@@ -26,18 +26,15 @@ fn compute_docids(
     } else {
         return Ok(Default::default());
     };
-    let mut candidates = match exact_term {
-        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
+
+    let candidates = match exact_term {
+        // TODO I move the intersection here
+        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(Some(universe), phrase)? & universe,
         ExactTerm::Word(word) => {
-            if let Some(word_candidates) = ctx.word_docids(Word::Original(word))? {
-                word_candidates
-            } else {
-                return Ok(Default::default());
-            }
+            ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default()
         }
     };
 
-    candidates &= universe;
     Ok(candidates)
 }
 
diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs
index 2019eb380..ad4a83dc3 100644
--- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs
@@ -30,8 +30,12 @@ impl RankingRuleGraphTrait for FidGraph {
 
         let docids = if let Some(fid) = condition.fid {
             // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
-            let docids =
-                compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?;
+            let docids = compute_query_term_subset_docids_within_field_id(
+                ctx,
+                Some(universe),
+                &term.term_subset,
+                fid,
+            )?;
             docids & universe
         } else {
             RoaringBitmap::new()
diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs
index 567aad550..fd61c1dcc 100644
--- a/milli/src/search/new/ranking_rule_graph/position/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs
@@ -33,6 +33,7 @@ impl RankingRuleGraphTrait for PositionGraph {
             docids |= universe
                 & compute_query_term_subset_docids_within_position(
                     ctx,
+                    Some(universe),
                     &term.term_subset,
                     *position,
                 )?;
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
index e73e5e6aa..29459cd99 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -74,10 +74,10 @@ pub fn compute_docids(
         if right_derivs.len() > 1 {
             let universe = &universe;
             if let Some(left_phrase) = left_phrase {
-                if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
+                if universe.is_disjoint(ctx.get_phrase_docids(Some(universe), left_phrase)?) {
                     continue;
                 }
-            } else if let Some(left_word_docids) = ctx.word_docids(left_word)? {
+            } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? {
                 if universe.is_disjoint(&left_word_docids) {
                     continue;
                 }
@@ -123,7 +123,10 @@ fn compute_prefix_edges(
 
     let mut universe = universe.clone();
     if let Some(phrase) = left_phrase {
-        let phrase_docids = ctx.get_phrase_docids(phrase)?;
+        // TODO we can clearly give the universe to this method
+        //      Unfortunately, it is deserializing/computing stuff and
+        //      keeping the result as a materialized bitmap.
+        let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?;
         if !phrase_docids.is_empty() {
             used_left_phrases.insert(phrase);
         }
@@ -133,9 +136,13 @@ fn compute_prefix_edges(
         }
     }
 
-    if let Some(new_docids) =
-        ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)?
-    {
+    // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff.
+    if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids(
+        Some(&universe),
+        left_word,
+        right_prefix,
+        forward_proximity,
+    )? {
         let new_docids = &universe & new_docids;
         if !new_docids.is_empty() {
             used_left_words.insert(left_word);
@@ -147,6 +154,7 @@ fn compute_prefix_edges(
     // No swapping when computing the proximity between a phrase and a word
     if left_phrase.is_none() {
         if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
+            Some(&universe),
             right_prefix,
             left_word,
             backward_proximity,
@@ -177,26 +185,29 @@ fn compute_non_prefix_edges(
     let mut universe = universe.clone();
 
     for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
-        let phrase_docids = ctx.get_phrase_docids(phrase)?;
+        // TODO do the intersection in the method, again!
+        let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?;
         universe &= phrase_docids;
         if universe.is_empty() {
             return Ok(());
         }
     }
 
+    // TODO check that it is not an issue to alterate the universe
     if let Some(new_docids) =
-        ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
+        ctx.get_db_word_pair_proximity_docids(Some(&universe), word1, word2, forward_proximity)?
     {
-        let new_docids = &universe & new_docids;
         if !new_docids.is_empty() {
             *docids |= new_docids;
         }
     }
     if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
-        if let Some(new_docids) =
-            ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
-        {
-            let new_docids = &universe & new_docids;
+        if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(
+            Some(&universe),
+            word2,
+            word1,
+            backward_proximity,
+        )? {
             if !new_docids.is_empty() {
                 *docids |= new_docids;
             }
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index 3f24319f1..6cab2f364 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -19,11 +19,16 @@ pub struct PhraseDocIdsCache {
 }
 impl<'ctx> SearchContext<'ctx> {
     /// Get the document ids associated with the given phrase
-    pub fn get_phrase_docids(&mut self, phrase: Interned<Phrase>) -> Result<&RoaringBitmap> {
+    pub fn get_phrase_docids(
+        &mut self,
+        universe: Option<&RoaringBitmap>,
+        phrase: Interned<Phrase>,
+    ) -> Result<&RoaringBitmap> {
         if self.phrase_docids.cache.contains_key(&phrase) {
             return Ok(&self.phrase_docids.cache[&phrase]);
         };
-        let docids = compute_phrase_docids(self, phrase)?;
+        let docids = compute_phrase_docids(self, universe, phrase)?;
+        // TODO can we improve that? Because there is an issue, we keep that in cache...
         let _ = self.phrase_docids.cache.insert(phrase, docids);
         let docids = &self.phrase_docids.cache[&phrase];
         Ok(docids)
@@ -35,17 +40,18 @@ pub fn compute_query_term_subset_docids(
     term: &QueryTermSubset,
 ) -> Result<RoaringBitmap> {
     let mut docids = RoaringBitmap::new();
+    // TODO use the MultiOps trait to do large intersections
     for word in term.all_single_words_except_prefix_db(ctx)? {
-        if let Some(word_docids) = ctx.word_docids(word)? {
+        if let Some(word_docids) = ctx.word_docids(universe, word)? {
             docids |= word_docids;
         }
     }
     for phrase in term.all_phrases(ctx)? {
-        docids |= ctx.get_phrase_docids(phrase)?;
+        docids |= ctx.get_phrase_docids(universe, phrase)?;
     }
 
     if let Some(prefix) = term.use_prefix_db(ctx) {
-        if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? {
+        if let Some(prefix_docids) = ctx.word_prefix_docids(universe, prefix)? {
             docids |= prefix_docids;
         }
     }
@@ -58,12 +64,13 @@ pub fn compute_query_term_subset_docids(
 
 pub fn compute_query_term_subset_docids_within_field_id(
     ctx: &mut SearchContext<'_>,
+    universe: Option<&RoaringBitmap>,
     term: &QueryTermSubset,
     fid: u16,
 ) -> Result<RoaringBitmap> {
     let mut docids = RoaringBitmap::new();
     for word in term.all_single_words_except_prefix_db(ctx)? {
-        if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? {
+        if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, word.interned(), fid)? {
             docids |= word_fid_docids;
         }
     }
@@ -72,15 +79,15 @@ pub fn compute_query_term_subset_docids_within_field_id(
         // There may be false positives when resolving a phrase, so we're not
         // guaranteed that all of its words are within a single fid.
         if let Some(word) = phrase.words(ctx).iter().flatten().next() {
-            if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
-                docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
+            if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, *word, fid)? {
+                docids |= ctx.get_phrase_docids(Some(&word_fid_docids), phrase)?;
             }
         }
     }
 
     if let Some(word_prefix) = term.use_prefix_db(ctx) {
         if let Some(word_fid_docids) =
-            ctx.get_db_word_prefix_fid_docids(word_prefix.interned(), fid)?
+            ctx.get_db_word_prefix_fid_docids(universe, word_prefix.interned(), fid)?
         {
             docids |= word_fid_docids;
         }
@@ -91,13 +98,14 @@ pub fn compute_query_term_subset_docids_within_field_id(
 
 pub fn compute_query_term_subset_docids_within_position(
     ctx: &mut SearchContext<'_>,
+    universe: Option<&RoaringBitmap>,
     term: &QueryTermSubset,
     position: u16,
 ) -> Result<RoaringBitmap> {
     let mut docids = RoaringBitmap::new();
     for word in term.all_single_words_except_prefix_db(ctx)? {
         if let Some(word_position_docids) =
-            ctx.get_db_word_position_docids(word.interned(), position)?
+            ctx.get_db_word_position_docids(universe, word.interned(), position)?
         {
             docids |= word_position_docids;
         }
@@ -107,15 +115,17 @@ pub fn compute_query_term_subset_docids_within_position(
         // It's difficult to know the expected position of the words in the phrase,
         // so instead we just check the first one.
         if let Some(word) = phrase.words(ctx).iter().flatten().next() {
-            if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
-                docids |= ctx.get_phrase_docids(phrase)? & word_position_docids
+            if let Some(word_position_docids) =
+                ctx.get_db_word_position_docids(universe, *word, position)?
+            {
+                docids |= ctx.get_phrase_docids(Some(&word_position_docids), phrase)?;
             }
         }
     }
 
     if let Some(word_prefix) = term.use_prefix_db(ctx) {
         if let Some(word_position_docids) =
-            ctx.get_db_word_prefix_position_docids(word_prefix.interned(), position)?
+            ctx.get_db_word_prefix_position_docids(universe, word_prefix.interned(), position)?
         {
             docids |= word_position_docids;
         }
@@ -180,6 +190,7 @@ pub fn compute_query_graph_docids(
 
 pub fn compute_phrase_docids(
     ctx: &mut SearchContext<'_>,
+    universe: Option<&RoaringBitmap>,
     phrase: Interned<Phrase>,
 ) -> Result<RoaringBitmap> {
     let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
@@ -189,7 +200,7 @@ pub fn compute_phrase_docids(
     }
     let mut candidates = RoaringBitmap::new();
     for word in words.iter().flatten().copied() {
-        if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
+        if let Some(word_docids) = ctx.word_docids(universe, Word::Original(word))? {
             candidates |= word_docids;
         } else {
             return Ok(RoaringBitmap::new());
@@ -213,7 +224,7 @@ pub fn compute_phrase_docids(
                 .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
             {
                 if dist == 0 {
-                    match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? {
+                    match ctx.get_db_word_pair_proximity_docids(universe, s1, s2, 1)? {
                         Some(m) => bitmaps.push(m),
                         // If there are no documents for this pair, there will be no
                         // results for the phrase query.
@@ -223,7 +234,7 @@ pub fn compute_phrase_docids(
                     let mut bitmap = RoaringBitmap::new();
                     for dist in 0..=dist {
                         if let Some(m) =
-                            ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
+                            ctx.get_db_word_pair_proximity_docids(universe, s1, s2, dist as u8 + 1)?
                         {
                             bitmap |= m;
                         }

From 41f51adbeceeea45ad7e08e06e908e440aa681b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Fri, 21 Jun 2024 14:56:47 +0200
Subject: [PATCH 3/9] Do less useless intersections

---
 milli/src/search/new/exact_attribute.rs       | 26 ++++++++++++++-----
 .../search/new/ranking_rule_graph/fid/mod.rs  |  6 ++---
 .../new/ranking_rule_graph/position/mod.rs    | 14 +++++-----
 .../proximity/compute_docids.rs               |  7 ++---
 milli/src/search/new/resolve_query_graph.rs   |  1 +
 5 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
index 52efa63a4..b013adfa0 100644
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@@ -1,3 +1,4 @@
+use heed::types::Bytes;
 use roaring::{MultiOps, RoaringBitmap};
 
 use super::query_graph::QueryGraph;
@@ -5,7 +6,7 @@ use super::ranking_rules::{RankingRule, RankingRuleOutput};
 use crate::score_details::{self, ScoreDetails};
 use crate::search::new::query_graph::QueryNodeData;
 use crate::search::new::query_term::ExactTerm;
-use crate::{Result, SearchContext, SearchLogger};
+use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
 
 /// A ranking rule that produces 3 disjoint buckets:
 ///
@@ -173,8 +174,7 @@ impl State {
                 let bucketed_position = crate::bucketed_position(position + offset);
                 let word_position_docids = ctx
                     .get_db_word_position_docids(Some(universe), *word, bucketed_position)?
-                    .unwrap_or_default()
-                    & universe;
+                    .unwrap_or_default();
                 candidates &= word_position_docids;
                 if candidates.is_empty() {
                     return Ok(State::Empty(query_graph.clone()));
@@ -205,16 +205,24 @@ impl State {
                             .unwrap_or_default())
                     }),
             )?;
+            // TODO Why not doing this intersection in the MultiOps above?
             intersection &= &candidates;
             if !intersection.is_empty() {
                 // Although not really worth it in terms of performance,
                 // if would be good to put this in cache for the sake of consistency
                 let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
-                    ctx.index
+                    let bitmap_bytes = ctx
+                        .index
                         .field_id_word_count_docids
-                        .get(ctx.txn, &(fid, count_all_positions as u8))?
-                        .unwrap_or_default()
-                        & universe
+                        .remap_data_type::<Bytes>()
+                        .get(ctx.txn, &(fid, count_all_positions as u8))?;
+
+                    match bitmap_bytes {
+                        Some(bytes) => {
+                            CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
+                        }
+                        None => RoaringBitmap::default(),
+                    }
                 } else {
                     RoaringBitmap::default()
                 };
@@ -237,6 +245,8 @@ impl State {
         let (state, output) = match state {
             State::Uninitialized => (state, None),
             State::ExactAttribute(query_graph, candidates_per_attribute) => {
+                // TODO it can be much faster to do the intersections before the unions...
+                //      or maybe the candidates_per_attribute are not containing anything outside universe
                 let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
                     |FieldCandidates { start_with_exact, exact_word_count }| {
                         start_with_exact & exact_word_count
@@ -255,6 +265,8 @@ impl State {
                 )
             }
             State::AttributeStarts(query_graph, candidates_per_attribute) => {
+                // TODO it can be much faster to do the intersections before the unions...
+                //      or maybe the candidates_per_attribute are not containing anything outside universe
                 let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
                     |FieldCandidates { mut start_with_exact, exact_word_count }| {
                         start_with_exact -= exact_word_count;
diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs
index ad4a83dc3..67775ddea 100644
--- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs
@@ -29,14 +29,12 @@ impl RankingRuleGraphTrait for FidGraph {
         let FidCondition { term, .. } = condition;
 
         let docids = if let Some(fid) = condition.fid {
-            // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
-            let docids = compute_query_term_subset_docids_within_field_id(
+            compute_query_term_subset_docids_within_field_id(
                 ctx,
                 Some(universe),
                 &term.term_subset,
                 fid,
-            )?;
-            docids & universe
+            )?
         } else {
             RoaringBitmap::new()
         };
diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs
index fd61c1dcc..cbfa705a3 100644
--- a/milli/src/search/new/ranking_rule_graph/position/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs
@@ -28,15 +28,15 @@ impl RankingRuleGraphTrait for PositionGraph {
     ) -> Result<ComputedCondition> {
         let PositionCondition { term, positions } = condition;
         let mut docids = RoaringBitmap::new();
+        // TODO use MultiOps to do the big union
         for position in positions {
             // maybe compute_query_term_subset_docids_within_position should accept a universe as argument
-            docids |= universe
-                & compute_query_term_subset_docids_within_position(
-                    ctx,
-                    Some(universe),
-                    &term.term_subset,
-                    *position,
-                )?;
+            docids |= compute_query_term_subset_docids_within_position(
+                ctx,
+                Some(universe),
+                &term.term_subset,
+                *position,
+            )?;
         }
         Ok(ComputedCondition {
             docids,
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
index 29459cd99..6d3199a8d 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -143,7 +143,6 @@ fn compute_prefix_edges(
         right_prefix,
         forward_proximity,
     )? {
-        let new_docids = &universe & new_docids;
         if !new_docids.is_empty() {
             used_left_words.insert(left_word);
             used_right_prefix.insert(right_prefix);
@@ -153,13 +152,13 @@ fn compute_prefix_edges(
 
     // No swapping when computing the proximity between a phrase and a word
     if left_phrase.is_none() {
+        // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff.
         if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
             Some(&universe),
             right_prefix,
             left_word,
             backward_proximity,
         )? {
-            let new_docids = &universe & new_docids;
             if !new_docids.is_empty() {
                 used_left_words.insert(left_word);
                 used_right_prefix.insert(right_prefix);
@@ -185,9 +184,7 @@ fn compute_non_prefix_edges(
     let mut universe = universe.clone();
 
     for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
-        // TODO do the intersection in the method, again!
-        let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?;
-        universe &= phrase_docids;
+        universe &= ctx.get_phrase_docids(Some(&universe), phrase)?;
         if universe.is_empty() {
             return Ok(());
         }
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index 6cab2f364..24e677477 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -251,6 +251,7 @@ pub fn compute_phrase_docids(
         // We sort the bitmaps so that we perform the small intersections first, which is faster.
         bitmaps.sort_unstable_by_key(|a| a.len());
 
+        // TODO use MultiOps intersection which and remove the above sort
         for bitmap in bitmaps {
             candidates &= bitmap;
 

From cd7a20fa32f49368ff9d2130171b860bd141841f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Fri, 21 Jun 2024 15:03:51 +0200
Subject: [PATCH 4/9] Make it work by avoid storing invalid stuff in the cache

---
 milli/src/search/new/mod.rs                                 | 4 ++--
 milli/src/search/new/ranking_rule_graph/exactness/mod.rs    | 2 +-
 .../new/ranking_rule_graph/proximity/compute_docids.rs      | 6 +++---
 milli/src/search/new/resolve_query_graph.rs                 | 6 +++---
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 73aaa330f..5932346a1 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -247,14 +247,14 @@ fn resolve_negative_words(
 #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
 fn resolve_negative_phrases(
     ctx: &mut SearchContext<'_>,
-    universe: Option<&RoaringBitmap>,
+    _universe: Option<&RoaringBitmap>,
     negative_phrases: &[LocatedQueryTerm],
 ) -> Result<RoaringBitmap> {
     let mut negative_bitmap = RoaringBitmap::new();
     for term in negative_phrases {
         let query_term = ctx.term_interner.get(term.value);
         if let Some(phrase) = query_term.original_phrase() {
-            negative_bitmap |= ctx.get_phrase_docids(universe, phrase)?;
+            negative_bitmap |= ctx.get_phrase_docids(None, phrase)?;
         }
     }
     Ok(negative_bitmap)
diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
index ae3a582bb..31f6315d7 100644
--- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
@@ -29,7 +29,7 @@ fn compute_docids(
 
     let candidates = match exact_term {
         // TODO I move the intersection here
-        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(Some(universe), phrase)? & universe,
+        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(None, phrase)? & universe,
         ExactTerm::Word(word) => {
             ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default()
         }
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
index 6d3199a8d..74bef506a 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -74,7 +74,7 @@ pub fn compute_docids(
         if right_derivs.len() > 1 {
             let universe = &universe;
             if let Some(left_phrase) = left_phrase {
-                if universe.is_disjoint(ctx.get_phrase_docids(Some(universe), left_phrase)?) {
+                if universe.is_disjoint(ctx.get_phrase_docids(None, left_phrase)?) {
                     continue;
                 }
             } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? {
@@ -126,7 +126,7 @@ fn compute_prefix_edges(
         // TODO we can clearly give the universe to this method
         //      Unfortunately, it is deserializing/computing stuff and
         //      keeping the result as a materialized bitmap.
-        let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?;
+        let phrase_docids = ctx.get_phrase_docids(None, phrase)?;
         if !phrase_docids.is_empty() {
             used_left_phrases.insert(phrase);
         }
@@ -184,7 +184,7 @@ fn compute_non_prefix_edges(
     let mut universe = universe.clone();
 
     for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
-        universe &= ctx.get_phrase_docids(Some(&universe), phrase)?;
+        universe &= ctx.get_phrase_docids(None, phrase)?;
         if universe.is_empty() {
             return Ok(());
         }
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index 24e677477..d9fdac86e 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -47,7 +47,7 @@ pub fn compute_query_term_subset_docids(
         }
     }
     for phrase in term.all_phrases(ctx)? {
-        docids |= ctx.get_phrase_docids(universe, phrase)?;
+        docids |= ctx.get_phrase_docids(None, phrase)?;
     }
 
     if let Some(prefix) = term.use_prefix_db(ctx) {
@@ -80,7 +80,7 @@ pub fn compute_query_term_subset_docids_within_field_id(
         // guaranteed that all of its words are within a single fid.
         if let Some(word) = phrase.words(ctx).iter().flatten().next() {
             if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, *word, fid)? {
-                docids |= ctx.get_phrase_docids(Some(&word_fid_docids), phrase)?;
+                docids |= ctx.get_phrase_docids(None, phrase)? & word_fid_docids;
             }
         }
     }
@@ -118,7 +118,7 @@ pub fn compute_query_term_subset_docids_within_position(
             if let Some(word_position_docids) =
                 ctx.get_db_word_position_docids(universe, *word, position)?
             {
-                docids |= ctx.get_phrase_docids(Some(&word_position_docids), phrase)?;
+                docids |= ctx.get_phrase_docids(None, phrase)? & word_position_docids;
             }
         }
     }

From 93ba0510942014c8623a9d52296f84f3aea831c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 10 Jul 2024 10:03:00 +0200
Subject: [PATCH 5/9] Remove the invalid get_phrases_docids universe parameter

---
 .../new/ranking_rule_graph/exactness/mod.rs   |  2 +-
 .../proximity/compute_docids.rs               |  6 +++---
 milli/src/search/new/resolve_query_graph.rs   | 21 +++++++------------
 3 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
index 31f6315d7..e0c2294e5 100644
--- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
@@ -29,7 +29,7 @@ fn compute_docids(
 
     let candidates = match exact_term {
         // TODO I move the intersection here
-        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(None, phrase)? & universe,
+        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)? & universe,
         ExactTerm::Word(word) => {
             ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default()
         }
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
index 74bef506a..4ef83b534 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -74,7 +74,7 @@ pub fn compute_docids(
         if right_derivs.len() > 1 {
             let universe = &universe;
             if let Some(left_phrase) = left_phrase {
-                if universe.is_disjoint(ctx.get_phrase_docids(None, left_phrase)?) {
+                if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
                     continue;
                 }
             } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? {
@@ -126,7 +126,7 @@ fn compute_prefix_edges(
         // TODO we can clearly give the universe to this method
         //      Unfortunately, it is deserializing/computing stuff and
         //      keeping the result as a materialized bitmap.
-        let phrase_docids = ctx.get_phrase_docids(None, phrase)?;
+        let phrase_docids = ctx.get_phrase_docids(phrase)?;
         if !phrase_docids.is_empty() {
             used_left_phrases.insert(phrase);
         }
@@ -184,7 +184,7 @@ fn compute_non_prefix_edges(
     let mut universe = universe.clone();
 
     for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
-        universe &= ctx.get_phrase_docids(None, phrase)?;
+        universe &= ctx.get_phrase_docids(phrase)?;
         if universe.is_empty() {
             return Ok(());
         }
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index d9fdac86e..7a47b0a66 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -19,15 +19,11 @@ pub struct PhraseDocIdsCache {
 }
 impl<'ctx> SearchContext<'ctx> {
     /// Get the document ids associated with the given phrase
-    pub fn get_phrase_docids(
-        &mut self,
-        universe: Option<&RoaringBitmap>,
-        phrase: Interned<Phrase>,
-    ) -> Result<&RoaringBitmap> {
+    pub fn get_phrase_docids(&mut self, phrase: Interned<Phrase>) -> Result<&RoaringBitmap> {
         if self.phrase_docids.cache.contains_key(&phrase) {
             return Ok(&self.phrase_docids.cache[&phrase]);
         };
-        let docids = compute_phrase_docids(self, universe, phrase)?;
+        let docids = compute_phrase_docids(self, phrase)?;
         // TODO can we improve that? Because there is an issue, we keep that in cache...
         let _ = self.phrase_docids.cache.insert(phrase, docids);
         let docids = &self.phrase_docids.cache[&phrase];
@@ -47,7 +43,7 @@ pub fn compute_query_term_subset_docids(
         }
     }
     for phrase in term.all_phrases(ctx)? {
-        docids |= ctx.get_phrase_docids(None, phrase)?;
+        docids |= ctx.get_phrase_docids(phrase)?;
     }
 
     if let Some(prefix) = term.use_prefix_db(ctx) {
@@ -80,7 +76,7 @@ pub fn compute_query_term_subset_docids_within_field_id(
         // guaranteed that all of its words are within a single fid.
         if let Some(word) = phrase.words(ctx).iter().flatten().next() {
             if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, *word, fid)? {
-                docids |= ctx.get_phrase_docids(None, phrase)? & word_fid_docids;
+                docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
             }
         }
     }
@@ -118,7 +114,7 @@ pub fn compute_query_term_subset_docids_within_position(
             if let Some(word_position_docids) =
                 ctx.get_db_word_position_docids(universe, *word, position)?
             {
-                docids |= ctx.get_phrase_docids(None, phrase)? & word_position_docids;
+                docids |= ctx.get_phrase_docids(phrase)? & word_position_docids;
             }
         }
     }
@@ -190,7 +186,6 @@ pub fn compute_query_graph_docids(
 
 pub fn compute_phrase_docids(
     ctx: &mut SearchContext<'_>,
-    universe: Option<&RoaringBitmap>,
     phrase: Interned<Phrase>,
 ) -> Result<RoaringBitmap> {
     let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
@@ -200,7 +195,7 @@ pub fn compute_phrase_docids(
     }
     let mut candidates = RoaringBitmap::new();
     for word in words.iter().flatten().copied() {
-        if let Some(word_docids) = ctx.word_docids(universe, Word::Original(word))? {
+        if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? {
             candidates |= word_docids;
         } else {
             return Ok(RoaringBitmap::new());
@@ -224,7 +219,7 @@ pub fn compute_phrase_docids(
                 .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
             {
                 if dist == 0 {
-                    match ctx.get_db_word_pair_proximity_docids(universe, s1, s2, 1)? {
+                    match ctx.get_db_word_pair_proximity_docids(None, s1, s2, 1)? {
                         Some(m) => bitmaps.push(m),
                         // If there are no documents for this pair, there will be no
                         // results for the phrase query.
@@ -234,7 +229,7 @@ pub fn compute_phrase_docids(
                     let mut bitmap = RoaringBitmap::new();
                     for dist in 0..=dist {
                         if let Some(m) =
-                            ctx.get_db_word_pair_proximity_docids(universe, s1, s2, dist as u8 + 1)?
+                            ctx.get_db_word_pair_proximity_docids(None, s1, s2, dist as u8 + 1)?
                         {
                             bitmap |= m;
                         }

From febea735caaec330ecab1c075c536b6c04dc6501 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 10 Jul 2024 10:03:20 +0200
Subject: [PATCH 6/9] Remove the unused universe parameter from
 resolve_negative_phrases

---
 milli/src/search/new/mod.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 5932346a1..f6a4a802c 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -247,14 +247,13 @@ fn resolve_negative_words(
 #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
 fn resolve_negative_phrases(
     ctx: &mut SearchContext<'_>,
-    _universe: Option<&RoaringBitmap>,
     negative_phrases: &[LocatedQueryTerm],
 ) -> Result<RoaringBitmap> {
     let mut negative_bitmap = RoaringBitmap::new();
     for term in negative_phrases {
         let query_term = ctx.term_interner.get(term.value);
         if let Some(phrase) = query_term.original_phrase() {
-            negative_bitmap |= ctx.get_phrase_docids(None, phrase)?;
+            negative_bitmap |= ctx.get_phrase_docids(phrase)?;
         }
     }
     Ok(negative_bitmap)
@@ -689,7 +688,7 @@ pub fn execute_search(
         used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
 
         let ignored_documents = resolve_negative_words(ctx, Some(&universe), &negative_words)?;
-        let ignored_phrases = resolve_negative_phrases(ctx, Some(&universe), &negative_phrases)?;
+        let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;
 
         universe -= ignored_documents;
         universe -= ignored_phrases;

From 1693d1a311cdfeef8f41c19f00e4620c2378fa91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 10 Jul 2024 10:09:39 +0200
Subject: [PATCH 7/9] Simplify the check to decide to stop a loop

---
 .../search/new/ranking_rule_graph/proximity/compute_docids.rs   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
index 4ef83b534..68b934e20 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -78,7 +78,7 @@ pub fn compute_docids(
                     continue;
                 }
             } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? {
-                if universe.is_disjoint(&left_word_docids) {
+                if left_word_docids.is_empty() {
                     continue;
                 }
             }

From ce61cb7fe61bc2f877ec3c10d666690e626543e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 10 Jul 2024 10:11:44 +0200
Subject: [PATCH 8/9] Simplify and speedup an intersection pass

---
 milli/src/search/new/exact_attribute.rs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
index b013adfa0..269879ad6 100644
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@@ -193,7 +193,7 @@ impl State {
         let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
         // then check that there exists at least one attribute that has all of the terms
         for fid in searchable_fields_ids {
-            let mut intersection = MultiOps::intersection(
+            let intersection = MultiOps::intersection(
                 words_positions
                     .iter()
                     .flat_map(|(words, ..)| words.iter())
@@ -201,12 +201,10 @@ impl State {
                     .flatten()
                     .map(|word| -> Result<_> {
                         Ok(ctx
-                            .get_db_word_fid_docids(Some(universe), *word, fid)?
+                            .get_db_word_fid_docids(Some(&candidates), *word, fid)?
                             .unwrap_or_default())
                     }),
             )?;
-            // TODO Why not doing this intersection in the MultiOps above?
-            intersection &= &candidates;
             if !intersection.is_empty() {
                 // Although not really worth it in terms of performance,
                 // if would be good to put this in cache for the sake of consistency

From 3bac22fd87b7c5d384eb2395b087c7bc61e8f0fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 10 Jul 2024 10:12:35 +0200
Subject: [PATCH 9/9] We do not do intersections with the universe when it is
 related to cache

---
 .../search/new/ranking_rule_graph/proximity/compute_docids.rs  | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
index 68b934e20..39dbef46d 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -136,7 +136,6 @@ fn compute_prefix_edges(
         }
     }
 
-    // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff.
     if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids(
         Some(&universe),
         left_word,
@@ -152,7 +151,6 @@ fn compute_prefix_edges(
 
     // No swapping when computing the proximity between a phrase and a word
     if left_phrase.is_none() {
-        // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff.
         if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
             Some(&universe),
             right_prefix,
@@ -190,7 +188,6 @@ fn compute_non_prefix_edges(
         }
     }
 
-    // TODO check that it is not an issue to alterate the universe
     if let Some(new_docids) =
         ctx.get_db_word_pair_proximity_docids(Some(&universe), word1, word2, forward_proximity)?
     {