fix the search cutoff and add a test

Tamo 2024-03-14 17:34:46 +01:00
parent b72495eb58
commit b8cda6c300
9 changed files with 590 additions and 55 deletions

View File

@@ -834,6 +834,115 @@ async fn test_score_details() {
.await;
}
#[actix_rt::test]
async fn test_degraded_score_details() {
let server = Server::new().await;
let index = server.index("test");
let documents = NESTED_DOCUMENTS.clone();
index.add_documents(json!(documents), None).await;
// We can't really use anything other than 0 ms here; otherwise, the test would be flaky.
let (res, _code) = index.update_settings(json!({ "searchCutoff": 0 })).await;
index.wait_task(res.uid()).await;
index
.search(
json!({
"q": "b",
"showRankingScoreDetails": true,
}),
|response, code| {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[
{
"id": 852,
"father": "jean",
"mother": "michelle",
"doggos": [
{
"name": "bobby",
"age": 2
},
{
"name": "buddy",
"age": 4
}
],
"cattos": "pésti",
"_vectors": {
"manual": [
1,
2,
3
]
},
"_rankingScoreDetails": {
"skipped": 0.0
}
},
{
"id": 654,
"father": "pierre",
"mother": "sabine",
"doggos": [
{
"name": "gros bill",
"age": 8
}
],
"cattos": [
"simba",
"pestiféré"
],
"_vectors": {
"manual": [
1,
2,
54
]
},
"_rankingScoreDetails": {
"skipped": 0.0
}
},
{
"id": 951,
"father": "jean-baptiste",
"mother": "sophie",
"doggos": [
{
"name": "turbo",
"age": 5
},
{
"name": "fast",
"age": 6
}
],
"cattos": [
"moumoute",
"gomez"
],
"_vectors": {
"manual": [
10,
23,
32
]
},
"_rankingScoreDetails": {
"skipped": 0.0
}
}
]
"###);
},
)
.await;
}
#[actix_rt::test]
async fn experimental_feature_vector_store() {
let server = Server::new().await;

View File

@@ -105,10 +105,15 @@ pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
#[derive(Clone, Copy)]
#[derive(Clone)]
pub struct TimeBudget {
started_at: std::time::Instant,
budget: std::time::Duration,
/// When testing the time budget, it can be useful to ensure that we did more than one iteration of the bucket sort.
/// But to avoid flakiness, the only option is to stop after a specific number of calls instead of after a `Duration`.
#[cfg(test)]
stop_after: Option<(std::sync::Arc<std::sync::atomic::AtomicUsize>, usize)>,
}
impl fmt::Debug for TimeBudget {
@@ -129,18 +134,40 @@ impl Default for TimeBudget {
impl TimeBudget {
pub fn new(budget: std::time::Duration) -> Self {
Self { started_at: std::time::Instant::now(), budget }
Self {
started_at: std::time::Instant::now(),
budget,
#[cfg(test)]
stop_after: None,
}
}
pub fn max() -> Self {
Self::new(std::time::Duration::from_secs(u64::MAX))
}
pub fn exceeded(&self) -> bool {
self.must_stop()
#[cfg(test)]
pub fn with_stop_after(mut self, stop_after: usize) -> Self {
use std::sync::atomic::AtomicUsize;
use std::sync::Arc;
self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after));
self
}
pub fn exceeded(&self) -> bool {
#[cfg(test)]
if let Some((current, stop_after)) = &self.stop_after {
let current = current.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
if current >= *stop_after {
return true;
} else {
// if a call count has been specified, we ignore the time budget entirely
return false;
}
}
pub fn must_stop(&self) -> bool {
self.started_at.elapsed() > self.budget
}
}
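
A quick sketch of how the test-only counter behaves (hypothetical test, not part of the commit; it relies on the `#[cfg(test)]`-gated `with_stop_after` added above, so it only compiles inside milli's own tests): `with_stop_after(n)` makes `exceeded` return `false` exactly `n` times and `true` afterwards, regardless of the wall clock.

#[test]
fn stop_after_overrides_the_clock() {
    // Even with an effectively infinite duration, the counter takes over.
    let budget = TimeBudget::max().with_stop_after(2);
    assert!(!budget.exceeded()); // 1st call: previous counter value 0 < 2
    assert!(!budget.exceeded()); // 2nd call: previous counter value 1 < 2
    assert!(budget.exceeded()); // 3rd call: previous counter value 2 >= 2
}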

View File

@@ -17,6 +17,9 @@ pub enum ScoreDetails {
Sort(Sort),
Vector(Vector),
GeoSort(GeoSort),
/// Returned when we don't have the time to finish applying all the subsequent ranking rules
Skipped,
}
#[derive(Clone, Copy)]
@@ -50,6 +53,7 @@ impl ScoreDetails {
ScoreDetails::Sort(_) => None,
ScoreDetails::GeoSort(_) => None,
ScoreDetails::Vector(_) => None,
ScoreDetails::Skipped => Some(Rank { rank: 0, max_rank: 1 }),
}
}
@@ -97,6 +101,7 @@ impl ScoreDetails {
ScoreDetails::Vector(vector) => RankOrValue::Score(
vector.value_similarity.as_ref().map(|(_, s)| *s as f64).unwrap_or(0.0f64),
),
ScoreDetails::Skipped => RankOrValue::Score(0.),
}
}
@@ -256,6 +261,13 @@ impl ScoreDetails {
details_map.insert(vector, details);
order += 1;
}
ScoreDetails::Skipped => {
details_map.insert(
"skipped".to_string(),
serde_json::Number::from_f64(0.).unwrap().into(),
);
order += 1;
}
}
}
details_map
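
In other words, a skipped rule contributes a rank of 0 out of a max rank of 1 and a score of 0.0. A minimal, hypothetical check (not part of the commit, and assuming this hunk lives in the `to_json_map` helper that builds `_rankingScoreDetails`):

// Hypothetical: a lone `Skipped` detail serializes to the
// `"skipped": 0.0` entry seen in the route test's snapshot above.
let details = [ScoreDetails::Skipped];
let map = ScoreDetails::to_json_map(details.iter());
assert_eq!(map.get("skipped"), Some(&serde_json::json!(0.0)));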

View File

@@ -132,7 +132,7 @@ impl<'a> Search<'a> {
index: self.index,
distribution_shift: self.distribution_shift,
embedder_name: self.embedder_name.clone(),
time_budget: self.time_budget,
time_budget: self.time_budget.clone(),
};
let vector_query = search.vector.take();

View File

@@ -195,7 +195,7 @@ impl<'a> Search<'a> {
self.limit,
self.distribution_shift,
embedder_name,
self.time_budget,
self.time_budget.clone(),
)?,
None => execute_search(
&mut ctx,
@@ -211,7 +211,7 @@ impl<'a> Search<'a> {
Some(self.words_limit),
&mut DefaultSearchLogger,
&mut DefaultSearchLogger,
self.time_budget,
self.time_budget.clone(),
)?,
};

View File

@@ -161,11 +161,21 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
while valid_docids.len() < length {
if time_budget.exceeded() {
loop {
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
ranking_rule_scores.push(ScoreDetails::Skipped);
maybe_add_to_results!(bucket);
ranking_rule_scores.pop();
if cur_ranking_rule_index == 0 {
break;
}
back!();
}
return Ok(BucketSortOutput {
scores: vec![Default::default(); valid_docids.len()],
scores: valid_scores,
docids: valid_docids,
all_candidates,
degraded: true,

View File

@@ -0,0 +1,419 @@
//! This module tests the search cutoff and ensures a few things:
//! 1. A basic test works and marks the search as degraded
//! 2. A test that ensures the filters are effectively applied even with a cutoff of 0
//! 3. A test that ensures the cutoff works well with the ranking scores
use std::time::Duration;
use big_s::S;
use maplit::hashset;
use meili_snap::snapshot;
use crate::index::tests::TempIndex;
use crate::{Criterion, Filter, Search, TimeBudget};
fn create_index() -> TempIndex {
let index = TempIndex::new();
index
.update_settings(|s| {
s.set_primary_key("id".to_owned());
s.set_searchable_fields(vec!["text".to_owned()]);
s.set_filterable_fields(hashset! { S("id") });
s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
})
.unwrap();
// Reverse the ID / insertion order so we can better tell what was actually sorted from what merely kept the insertion order.
index
.add_documents(documents!([
{
"id": 4,
"text": "hella puppo kefir",
},
{
"id": 3,
"text": "hella puppy kefir",
},
{
"id": 2,
"text": "hello",
},
{
"id": 1,
"text": "hello puppy",
},
{
"id": 0,
"text": "hello puppy kefir",
},
]))
.unwrap();
index
}
#[test]
fn basic_degraded_search() {
let index = create_index();
let rtxn = index.read_txn().unwrap();
let mut search = Search::new(&rtxn, &index);
search.query("hello puppy kefir");
search.limit(3);
search.time_budget(TimeBudget::new(Duration::from_millis(0)));
let result = search.execute().unwrap();
assert!(result.degraded);
}
#[test]
fn degraded_search_cannot_skip_filter() {
let index = create_index();
let rtxn = index.read_txn().unwrap();
let mut search = Search::new(&rtxn, &index);
search.query("hello puppy kefir");
search.limit(100);
search.time_budget(TimeBudget::new(Duration::from_millis(0)));
let filter_condition = Filter::from_str("id > 2").unwrap().unwrap();
search.filter(filter_condition);
let result = search.execute().unwrap();
assert!(result.degraded);
snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###"
RoaringBitmap<[0, 1]>
[0, 1]
"###);
}
#[test]
fn degraded_search_and_score_details() {
let index = create_index();
let rtxn = index.read_txn().unwrap();
let mut search = Search::new(&rtxn, &index);
search.query("hello puppy kefir");
search.limit(4);
search.time_budget(TimeBudget::max());
let result = search.execute().unwrap();
snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
[
4,
1,
0,
3,
]
[
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 0,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 1,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
],
[
Words(
Words {
matching_words: 2,
max_matching_words: 3,
},
),
],
]
"###);
// Do ONE loop iteration. Not much can be deduced; almost every document matched the first bucket of the words ranking rule.
search.time_budget(TimeBudget::max().with_stop_after(1));
let result = search.execute().unwrap();
snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
[
0,
1,
4,
2,
]
[
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Skipped,
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Skipped,
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Skipped,
],
[
Skipped,
],
]
"###);
// Do TWO loop iterations. The first document should be entirely sorted.
search.time_budget(TimeBudget::max().with_stop_after(2));
let result = search.execute().unwrap();
snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
[
4,
0,
1,
2,
]
[
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 0,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Skipped,
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Skipped,
],
[
Skipped,
],
]
"###);
// Do THREE loop iterations. The second document should be entirely sorted as well.
search.time_budget(TimeBudget::max().with_stop_after(3));
let result = search.execute().unwrap();
snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
[
4,
1,
0,
2,
]
[
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 0,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 1,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Skipped,
],
[
Skipped,
],
]
"###);
// Do FOUR loop iterations. The third document should be entirely sorted as well.
// The words bucket still has not progressed; thus the last document doesn't have any info yet.
search.time_budget(TimeBudget::max().with_stop_after(4));
let result = search.execute().unwrap();
snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
[
4,
1,
0,
2,
]
[
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 0,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 1,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
],
[
Skipped,
],
]
"###);
// After FIVE loop iterations, the words ranking rule gave us a new bucket.
// Since we reached the limit, we were able to exit early without checking the typo ranking rule.
search.time_budget(TimeBudget::max().with_stop_after(5));
let result = search.execute().unwrap();
snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
[
4,
1,
0,
3,
]
[
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 0,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
Typo(
Typo {
typo_count: 1,
max_typo_count: 3,
},
),
],
[
Words(
Words {
matching_words: 3,
max_matching_words: 3,
},
),
],
[
Words(
Words {
matching_words: 2,
max_matching_words: 3,
},
),
],
]
"###);
}

View File

@@ -1,5 +1,6 @@
pub mod attribute_fid;
pub mod attribute_position;
pub mod cutoff;
pub mod distinct;
pub mod exactness;
pub mod geo_sort;

View File

@@ -1,19 +1,14 @@
use std::cmp::Reverse;
use std::collections::HashSet;
use std::io::Cursor;
use std::time::Duration;
use big_s::S;
use either::{Either, Left, Right};
use heed::EnvOpenOptions;
use maplit::{btreemap, hashset};
use meili_snap::snapshot;
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{
AscDesc, Criterion, DocumentId, Filter, Index, Member, Object, Search, TermsMatchingStrategy,
TimeBudget,
};
use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
use serde::{Deserialize, Deserializer};
use slice_group_by::GroupBy;
@@ -354,41 +349,3 @@ where
let result = serde_json::Value::deserialize(deserializer)?;
Ok(Some(result))
}
#[test]
fn basic_degraded_search() {
use Criterion::*;
let criteria = vec![Words, Typo, Proximity, Attribute, Exactness];
let index = setup_search_index_with_criteria(&criteria);
let rtxn = index.read_txn().unwrap();
let mut search = Search::new(&rtxn, &index);
search.query(TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.time_budget(TimeBudget::new(Duration::from_millis(0)));
let result = search.execute().unwrap();
assert!(result.degraded);
}
#[test]
fn degraded_search_cannot_skip_filter() {
use Criterion::*;
let criteria = vec![Words, Typo, Proximity, Attribute, Exactness];
let index = setup_search_index_with_criteria(&criteria);
let rtxn = index.read_txn().unwrap();
let mut search = Search::new(&rtxn, &index);
search.query(TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.time_budget(TimeBudget::new(Duration::from_millis(0)));
let filter_condition = Filter::from_str("tag = etiopia").unwrap().unwrap();
search.filter(filter_condition);
let result = search.execute().unwrap();
assert!(result.degraded);
snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###"
RoaringBitmap<[0, 2, 5, 8, 11, 14]>
[0, 2, 5, 8, 11, 14]
"###);
}