Merge #3757

3757: Adjust the cost of edges in the `position` ranking rule by bucketing positions more aggressively r=loiclec a=loiclec This PR significantly improves the performance of the `position` ranking rule when: 1. a query contains many words; 2. the `position` ranking rule needs to be called many times; 3. the score of the documents according to `position` is high. These conditions greatly increase: 1. the number of edge traversals that are needed to find a valid path from the `start` node to the `end` node; 2. the number of edges that need to be deleted from the graph, and therefore the number of times that we need to recompute all the possible costs from START to END. As a result, a majority of the search time is spent in `visit_condition`, `visit_node`, and `update_all_costs_before_node`. This is frustrating because it often happens when the "universe" given to the rule consists of only a handful of document ids. By reducing the number of possible edges between two nodes from `20` to `10`, we: 1. reduce the number of possible costs from START to END; 2. reduce the number of edges that will be deleted; 3. make it faster to update the costs after deleting an edge; 4. reduce the number of buckets that need to be computed. In terms of relevancy, I don't think we lose or gain much. We still prefer terms that are in lower positions, with decreasing precision as we go further. The previous bucketing wasn't chosen in a principled way, and neither is this one. They both "feel" right to me. Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com> Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
2025-05-25 09:03:59 +02:00 · 2023-05-17 11:43:59 +00:00 · 2023-05-17 11:43:59 +00:00 · 101f5a20d2
commit 101f5a20d2
parent 6ce1ce77e6 3e19702de6
2 changed files with 14 additions and 21 deletions
--- a/milli/src/search/new/ranking_rule_graph/position/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs
@@ -111,23 +111,16 @@ impl RankingRuleGraphTrait for PositionGraph {

 fn cost_from_position(sum_positions: u32) -> u32 {
    match sum_positions {
-        0 | 1 | 2 | 3 => sum_positions,
-        4 | 5 => 4,
-        6 | 7 => 5,
-        8 | 9 => 6,
-        10 | 11 => 7,
-        12 | 13 => 8,
-        14 | 15 => 9,
-        16 | 17..=24 => 10,
-        25..=32 => 11,
-        33..=64 => 12,
-        65..=128 => 13,
-        129..=256 => 14,
-        257..=512 => 15,
-        513..=1024 => 16,
-        1025..=2048 => 17,
-        2049..=4096 => 18,
-        4097..=8192 => 19,
-        _ => 20,
+        0 => 0,
+        1 => 1,
+        2..=4 => 2,
+        5..=7 => 3,
+        8..=11 => 4,
+        12..=16 => 5,
+        17..=24 => 6,
+        25..=64 => 7,
+        65..=256 => 8,
+        257..=1024 => 9,
+        _ => 10,
    }
 }
--- a/milli/src/search/new/tests/attribute_position.rs
+++ b/milli/src/search/new/tests/attribute_position.rs
@@ -138,7 +138,7 @@ fn test_attribute_position_simple() {
    s.terms_matching_strategy(TermsMatchingStrategy::All);
    s.query("quick brown");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 2, 3, 4, 1, 0, 6, 8, 7, 9, 5]");
 }
 #[test]
 fn test_attribute_position_repeated() {
@@ -163,7 +163,7 @@ fn test_attribute_position_different_fields() {
    s.terms_matching_strategy(TermsMatchingStrategy::All);
    s.query("quick brown");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 2, 3, 4, 1, 0, 6, 8, 7, 9, 5]");
 }

 #[test]
@@ -176,5 +176,5 @@ fn test_attribute_position_ngrams() {
    s.terms_matching_strategy(TermsMatchingStrategy::All);
    s.query("quick brown");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 2, 3, 4, 1, 0, 6, 8, 7, 9, 5]");
 }