fix the search cutoff and add a test

2025-06-14 20:11:38 +02:00 · 2024-03-14 17:34:46 +01:00 · 2024-03-14 17:34:46 +01:00 · b8cda6c300
commit b8cda6c300
parent b72495eb58
9 changed files with 590 additions and 55 deletions
--- a/meilisearch/tests/search/mod.rs
+++ b/meilisearch/tests/search/mod.rs
@ -834,6 +834,115 @@ async fn test_score_details() {
        .await;
 }
 #[actix_rt::test]
 async fn test_degraded_score_details() {
    let server = Server::new().await;
    let index = server.index("test");
    let documents = NESTED_DOCUMENTS.clone();
    index.add_documents(json!(documents), None).await;
    // We can't really use anything else than 0ms here; otherwise, the test will get flaky.
    let (res, _code) = index.update_settings(json!({ "searchCutoff": 0 })).await;
    index.wait_task(res.uid()).await;
    index
        .search(
            json!({
                "q": "b",
                "showRankingScoreDetails": true,
            }),
            |response, code| {
                meili_snap::snapshot!(code, @"200 OK");
                meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
                [
                  {
                    "id": 852,
                    "father": "jean",
                    "mother": "michelle",
                    "doggos": [
                      {
                        "name": "bobby",
                        "age": 2
                      },
                      {
                        "name": "buddy",
                        "age": 4
                      }
                    ],
                    "cattos": "pésti",
                    "_vectors": {
                      "manual": [
                        1,
                        2,
                        3
                      ]
                    },
                    "_rankingScoreDetails": {
                      "skipped": 0.0
                    }
                  },
                  {
                    "id": 654,
                    "father": "pierre",
                    "mother": "sabine",
                    "doggos": [
                      {
                        "name": "gros bill",
                        "age": 8
                      }
                    ],
                    "cattos": [
                      "simba",
                      "pestiféré"
                    ],
                    "_vectors": {
                      "manual": [
                        1,
                        2,
                        54
                      ]
                    },
                    "_rankingScoreDetails": {
                      "skipped": 0.0
                    }
                  },
                  {
                    "id": 951,
                    "father": "jean-baptiste",
                    "mother": "sophie",
                    "doggos": [
                      {
                        "name": "turbo",
                        "age": 5
                      },
                      {
                        "name": "fast",
                        "age": 6
                      }
                    ],
                    "cattos": [
                      "moumoute",
                      "gomez"
                    ],
                    "_vectors": {
                      "manual": [
                        10,
                        23,
                        32
                      ]
                    },
                    "_rankingScoreDetails": {
                      "skipped": 0.0
                    }
                  }
                ]
                "###);
            },
        )
        .await;
 }
 #[actix_rt::test]
 async fn experimental_feature_vector_store() {
    let server = Server::new().await;
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -105,10 +105,15 @@ pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
 pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
-#[derive(Clone, Copy)]
+#[derive(Clone)]
 pub struct TimeBudget {
    started_at: std::time::Instant,
    budget: std::time::Duration,
    /// When testing the time budget, ensuring we did more than iteration of the bucket sort can be useful.
    /// But to avoid being flaky, the only option is to add the ability to stop after a specific number of calls instead of a `Duration`.
    #[cfg(test)]
    stop_after: Option<(std::sync::Arc<std::sync::atomic::AtomicUsize>, usize)>,
 }
 impl fmt::Debug for TimeBudget {
@ -129,18 +134,40 @@ impl Default for TimeBudget {
 impl TimeBudget {
    pub fn new(budget: std::time::Duration) -> Self {
-        Self { started_at: std::time::Instant::now(), budget }
+        Self {
            started_at: std::time::Instant::now(),
            budget,
            #[cfg(test)]
            stop_after: None,
        }
    }
    pub fn max() -> Self {
        Self::new(std::time::Duration::from_secs(u64::MAX))
    }
-    pub fn exceeded(&self) -> bool {
+    #[cfg(test)]
-        self.must_stop()
+    pub fn with_stop_after(mut self, stop_after: usize) -> Self {
        use std::sync::atomic::AtomicUsize;
        use std::sync::Arc;
        self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after));
        self
    }
    pub fn exceeded(&self) -> bool {
        #[cfg(test)]
        if let Some((current, stop_after)) = &self.stop_after {
            let current = current.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
            if current >= *stop_after {
                return true;
            } else {
                // if a number has been specified then we ignore entirely the time budget
                return false;
            }
        }
    pub fn must_stop(&self) -> bool {
        self.started_at.elapsed() > self.budget
    }
 }
--- a/milli/src/score_details.rs
+++ b/milli/src/score_details.rs
@ -17,6 +17,9 @@ pub enum ScoreDetails {
    Sort(Sort),
    Vector(Vector),
    GeoSort(GeoSort),
    /// Returned when we don't have the time to finish applying all the subsequent ranking-rules
    Skipped,
 }
 #[derive(Clone, Copy)]
@ -50,6 +53,7 @@ impl ScoreDetails {
            ScoreDetails::Sort(_) => None,
            ScoreDetails::GeoSort(_) => None,
            ScoreDetails::Vector(_) => None,
            ScoreDetails::Skipped => Some(Rank { rank: 0, max_rank: 1 }),
        }
    }
@ -97,6 +101,7 @@ impl ScoreDetails {
            ScoreDetails::Vector(vector) => RankOrValue::Score(
                vector.value_similarity.as_ref().map(|(_, s)| *s as f64).unwrap_or(0.0f64),
            ),
            ScoreDetails::Skipped => RankOrValue::Score(0.),
        }
    }
@ -256,6 +261,13 @@ impl ScoreDetails {
                    details_map.insert(vector, details);
                    order += 1;
                }
                ScoreDetails::Skipped => {
                    details_map.insert(
                        "skipped".to_string(),
                        serde_json::Number::from_f64(0.).unwrap().into(),
                    );
                    order += 1;
                }
            }
        }
        details_map
--- a/milli/src/search/hybrid.rs
+++ b/milli/src/search/hybrid.rs
@ -132,7 +132,7 @@ impl<'a> Search<'a> {
            index: self.index,
            distribution_shift: self.distribution_shift,
            embedder_name: self.embedder_name.clone(),
-            time_budget: self.time_budget,
+            time_budget: self.time_budget.clone(),
        };
        let vector_query = search.vector.take();
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -195,7 +195,7 @@ impl<'a> Search<'a> {
                self.limit,
                self.distribution_shift,
                embedder_name,
-                self.time_budget,
+                self.time_budget.clone(),
            )?,
            None => execute_search(
                &mut ctx,
@ -211,7 +211,7 @@ impl<'a> Search<'a> {
                Some(self.words_limit),
                &mut DefaultSearchLogger,
                &mut DefaultSearchLogger,
-                self.time_budget,
+                self.time_budget.clone(),
            )?,
        };
--- a/milli/src/search/new/bucket_sort.rs
+++ b/milli/src/search/new/bucket_sort.rs
@ -161,11 +161,21 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
    while valid_docids.len() < length {
        if time_budget.exceeded() {
            loop {
                let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
                ranking_rule_scores.push(ScoreDetails::Skipped);
                maybe_add_to_results!(bucket);
                ranking_rule_scores.pop();
                if cur_ranking_rule_index == 0 {
                    break;
                }
                back!();
            }
            return Ok(BucketSortOutput {
-                scores: vec![Default::default(); valid_docids.len()],
+                scores: valid_scores,
                docids: valid_docids,
                all_candidates,
                degraded: true,
--- a/milli/src/search/new/tests/cutoff.rs
+++ b/milli/src/search/new/tests/cutoff.rs
@ -0,0 +1,419 @@
 //! This module test the search cutoff and ensure a few things:
 //! 1. A basic test works and mark the search as degraded
 //! 2. A test that ensure the filters are affectively applied even with a cutoff of 0
 //! 3. A test that ensure the cutoff works well with the ranking scores
 use std::time::Duration;
 use big_s::S;
 use maplit::hashset;
 use meili_snap::snapshot;
 use crate::index::tests::TempIndex;
 use crate::{Criterion, Filter, Search, TimeBudget};
 fn create_index() -> TempIndex {
    let index = TempIndex::new();
    index
        .update_settings(|s| {
            s.set_primary_key("id".to_owned());
            s.set_searchable_fields(vec!["text".to_owned()]);
            s.set_filterable_fields(hashset! { S("id") });
            s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
        })
        .unwrap();
    // reverse the ID / insertion order so we see better what was sorted from what got the insertion order ordering
    index
        .add_documents(documents!([
            {
                "id": 4,
                "text": "hella puppo kefir",
            },
            {
                "id": 3,
                "text": "hella puppy kefir",
            },
            {
                "id": 2,
                "text": "hello",
            },
            {
                "id": 1,
                "text": "hello puppy",
            },
            {
                "id": 0,
                "text": "hello puppy kefir",
            },
        ]))
        .unwrap();
    index
 }
 #[test]
 fn basic_degraded_search() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();
    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(3);
    search.time_budget(TimeBudget::new(Duration::from_millis(0)));
    let result = search.execute().unwrap();
    assert!(result.degraded);
 }
 #[test]
 fn degraded_search_cannot_skip_filter() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();
    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(100);
    search.time_budget(TimeBudget::new(Duration::from_millis(0)));
    let filter_condition = Filter::from_str("id > 2").unwrap().unwrap();
    search.filter(filter_condition);
    let result = search.execute().unwrap();
    assert!(result.degraded);
    snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###"
    RoaringBitmap<[0, 1]>
    [0, 1]
    "###);
 }
 #[test]
 fn degraded_search_and_score_details() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();
    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(4);
    search.time_budget(TimeBudget::max());
    let result = search.execute().unwrap();
    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
    [
        4,
        1,
        0,
        3,
    ]
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 2,
                    max_matching_words: 3,
                },
            ),
        ],
    ]
    "###);
    // Do ONE loop iteration. Not much can be deduced, almost everyone matched the words first bucket.
    search.time_budget(TimeBudget::max().with_stop_after(1));
    let result = search.execute().unwrap();
    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
    [
        0,
        1,
        4,
        2,
    ]
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);
    // Do TWO loop iterations. The first document should be entirely sorted
    search.time_budget(TimeBudget::max().with_stop_after(2));
    let result = search.execute().unwrap();
    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
    [
        4,
        0,
        1,
        2,
    ]
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);
    // Do THREE loop iterations. The second document should be entirely sorted as well
    search.time_budget(TimeBudget::max().with_stop_after(3));
    let result = search.execute().unwrap();
    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
    [
        4,
        1,
        0,
        2,
    ]
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);
    // Do FOUR loop iterations. The third document should be entirely sorted as well
    // The words bucket have still not progressed thus the last document doesn't have any info yet.
    search.time_budget(TimeBudget::max().with_stop_after(4));
    let result = search.execute().unwrap();
    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
    [
        4,
        1,
        0,
        2,
    ]
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
        ],
        [
            Skipped,
        ],
    ]
    "###);
    // After FIVE loop iteration. The words ranking rule gave us a new bucket.
    // Since we reached the limit we were able to early exit without checking the typo ranking rule.
    search.time_budget(TimeBudget::max().with_stop_after(5));
    let result = search.execute().unwrap();
    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
    [
        4,
        1,
        0,
        3,
    ]
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 2,
                    max_matching_words: 3,
                },
            ),
        ],
    ]
    "###);
 }
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@ -1,5 +1,6 @@
 pub mod attribute_fid;
 pub mod attribute_position;
 pub mod cutoff;
 pub mod distinct;
 pub mod exactness;
 pub mod geo_sort;
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@ -1,19 +1,14 @@
 use std::cmp::Reverse;
 use std::collections::HashSet;
 use std::io::Cursor;
 use std::time::Duration;
 use big_s::S;
 use either::{Either, Left, Right};
 use heed::EnvOpenOptions;
 use maplit::{btreemap, hashset};
 use meili_snap::snapshot;
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-use milli::{
+use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
    AscDesc, Criterion, DocumentId, Filter, Index, Member, Object, Search, TermsMatchingStrategy,
    TimeBudget,
 };
 use serde::{Deserialize, Deserializer};
 use slice_group_by::GroupBy;
@ -354,41 +349,3 @@ where
    let result = serde_json::Value::deserialize(deserializer)?;
    Ok(Some(result))
 }
 #[test]
 fn basic_degraded_search() {
    use Criterion::*;
    let criteria = vec![Words, Typo, Proximity, Attribute, Exactness];
    let index = setup_search_index_with_criteria(&criteria);
    let rtxn = index.read_txn().unwrap();
    let mut search = Search::new(&rtxn, &index);
    search.query(TEST_QUERY);
    search.limit(EXTERNAL_DOCUMENTS_IDS.len());
    search.time_budget(TimeBudget::new(Duration::from_millis(0)));
    let result = search.execute().unwrap();
    assert!(result.degraded);
 }
 #[test]
 fn degraded_search_cannot_skip_filter() {
    use Criterion::*;
    let criteria = vec![Words, Typo, Proximity, Attribute, Exactness];
    let index = setup_search_index_with_criteria(&criteria);
    let rtxn = index.read_txn().unwrap();
    let mut search = Search::new(&rtxn, &index);
    search.query(TEST_QUERY);
    search.limit(EXTERNAL_DOCUMENTS_IDS.len());
    search.time_budget(TimeBudget::new(Duration::from_millis(0)));
    let filter_condition = Filter::from_str("tag = etiopia").unwrap().unwrap();
    search.filter(filter_condition);
    let result = search.execute().unwrap();
    assert!(result.degraded);
    snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###"
    RoaringBitmap<[0, 2, 5, 8, 11, 14]>
    [0, 2, 5, 8, 11, 14]
    "###);
 }