Merge #4466
4466: Implements the search cutoff r=irevoire a=irevoire

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/4488

## What does this PR do?
- Adds a cutoff to the bucket sort after 150ms has been spent
- Adds a new setting to customize the default value of 150ms
- When the time limit is exceeded, we exit early with whatever we had the time to sort
- If the cutoff has been reached, the search details are updated with a new `Skipped` ranking detail for each ranking rule that was skipped
- Adds analytics to measure the total number of degraded search requests
- Adds the number of degraded search requests to the Prometheus metrics and the Grafana dashboard
- The cutoff **must not** skip the filters; otherwise, we would leak documents to people who don't have the right to see them

Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Commit fc1c3f4a29, 28 changed files with 1023 additions and 89 deletions
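
The heart of the change is a `TimeBudget` value threaded from the caller down to the bucket sort, plus a `degraded` flag bubbled back up on the result. A minimal sketch of the call pattern this enables, using only names that appear in the diff below (`rtxn` and `index` setup elided, as in the tests added by this PR):

```rust
use std::time::Duration;

// Hedged sketch of the new milli-level API; `rtxn` and `index` are assumed to
// be an open read transaction and an index, as in the tests added by this PR.
let mut search = Search::new(&rtxn, &index);
search.query("hello puppy kefir");
search.time_budget(TimeBudget::new(Duration::from_millis(150)));

let result = search.execute()?;
if result.degraded {
    // The budget ran out: the remaining ranking rules were skipped, but the
    // filters were still fully applied, so no unauthorized document leaks.
}
```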
```diff
@@ -67,6 +67,7 @@ pub mod main_key {
     pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
     pub const PROXIMITY_PRECISION: &str = "proximity-precision";
     pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
+    pub const SEARCH_CUTOFF: &str = "search_cutoff";
 }
 
 pub mod db_name {
@@ -1505,6 +1506,18 @@ impl Index {
             _ => "default".to_owned(),
         })
     }
+
+    pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
+        self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
+    }
+
+    pub fn search_cutoff(&self, rtxn: &RoTxn<'_>) -> Result<Option<u64>> {
+        Ok(self.main.remap_types::<Str, BEU64>().get(rtxn, main_key::SEARCH_CUTOFF)?)
+    }
+
+    pub(crate) fn delete_search_cutoff(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(wtxn, main_key::SEARCH_CUTOFF)
+    }
 }
 
 #[cfg(test)]
@@ -2421,6 +2434,7 @@ pub(crate) mod tests {
             candidates: _,
             document_scores: _,
             mut documents_ids,
+            degraded: _,
         } = search.execute().unwrap();
         let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
         documents_ids.sort_unstable();
```
```diff
@@ -30,6 +30,7 @@ pub mod snapshot_tests;
 
 use std::collections::{BTreeMap, HashMap};
 use std::convert::{TryFrom, TryInto};
+use std::fmt;
 use std::hash::BuildHasherDefault;
 
 use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
@@ -104,6 +105,73 @@ pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
 
 pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
 
+#[derive(Clone)]
+pub struct TimeBudget {
+    started_at: std::time::Instant,
+    budget: std::time::Duration,
+
+    /// When testing the time budget, ensuring we did more than one iteration of the bucket sort can be useful,
+    /// but to avoid flakiness the only option is to stop after a specific number of calls instead of a `Duration`.
+    #[cfg(test)]
+    stop_after: Option<(std::sync::Arc<std::sync::atomic::AtomicUsize>, usize)>,
+}
+
+impl fmt::Debug for TimeBudget {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("TimeBudget")
+            .field("started_at", &self.started_at)
+            .field("budget", &self.budget)
+            .field("left", &(self.budget - self.started_at.elapsed()))
+            .finish()
+    }
+}
+
+impl Default for TimeBudget {
+    fn default() -> Self {
+        Self::new(std::time::Duration::from_millis(150))
+    }
+}
+
+impl TimeBudget {
+    pub fn new(budget: std::time::Duration) -> Self {
+        Self {
+            started_at: std::time::Instant::now(),
+            budget,
+
+            #[cfg(test)]
+            stop_after: None,
+        }
+    }
+
+    pub fn max() -> Self {
+        Self::new(std::time::Duration::from_secs(u64::MAX))
+    }
+
+    #[cfg(test)]
+    pub fn with_stop_after(mut self, stop_after: usize) -> Self {
+        use std::sync::atomic::AtomicUsize;
+        use std::sync::Arc;
+
+        self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after));
+        self
+    }
+
+    pub fn exceeded(&self) -> bool {
+        #[cfg(test)]
+        if let Some((current, stop_after)) = &self.stop_after {
+            let current = current.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+            if current >= *stop_after {
+                return true;
+            } else {
+                // if a stop count has been specified, we ignore the time budget entirely
+                return false;
+            }
+        }
+
+        self.started_at.elapsed() > self.budget
+    }
+}
+
 // Convert an absolute word position into a relative position.
 // Return the field id of the attribute related to the absolute position
 // and the relative position in the attribute.
```
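To make the `TimeBudget` contract concrete, here is a hedged sketch of the consumption pattern the type is designed for; `sort_one_bucket` is a hypothetical stand-in for one iteration of the real bucket sort:

```rust
use std::time::Duration;

// Hypothetical stand-in for one unit of ranking work.
fn sort_one_bucket(out: &mut Vec<u32>, i: u32) {
    out.push(i);
}

// Mirrors how `bucket_sort` polls `TimeBudget::exceeded` later in this PR:
// check between units of work and return what was sorted so far, plus a
// `degraded` flag, instead of failing the request.
fn sort_within_budget(budget: Duration) -> (Vec<u32>, bool) {
    let time_budget = TimeBudget::new(budget);
    let mut sorted = Vec::new();
    for i in 0..1_000 {
        if time_budget.exceeded() {
            return (sorted, true); // exit early with what we had time to sort
        }
        sort_one_bucket(&mut sorted, i);
    }
    (sorted, false)
}
```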
```diff
@@ -17,6 +17,9 @@ pub enum ScoreDetails {
     Sort(Sort),
     Vector(Vector),
     GeoSort(GeoSort),
+
+    /// Returned when we don't have the time to finish applying all the subsequent ranking rules
+    Skipped,
 }
 
 #[derive(Clone, Copy)]
@@ -50,6 +53,7 @@ impl ScoreDetails {
             ScoreDetails::Sort(_) => None,
             ScoreDetails::GeoSort(_) => None,
             ScoreDetails::Vector(_) => None,
+            ScoreDetails::Skipped => Some(Rank { rank: 0, max_rank: 1 }),
         }
     }
 
@@ -97,6 +101,7 @@ impl ScoreDetails {
             ScoreDetails::Vector(vector) => RankOrValue::Score(
                 vector.value_similarity.as_ref().map(|(_, s)| *s as f64).unwrap_or(0.0f64),
             ),
+            ScoreDetails::Skipped => RankOrValue::Rank(Rank { rank: 0, max_rank: 1 }),
         }
     }
 
@@ -256,6 +261,11 @@ impl ScoreDetails {
                     details_map.insert(vector, details);
                     order += 1;
                 }
+                ScoreDetails::Skipped => {
+                    details_map
+                        .insert("skipped".to_string(), serde_json::json!({ "order": order }));
+                    order += 1;
+                }
             }
         }
         details_map
```
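Serialized, a degraded hit's ranking details therefore end in a single `skipped` entry whose `order` records how many ranking rules ran before the cutoff. A sketch of the resulting JSON for one hit; the exact shape of the `words` entry is assumed from Meilisearch's existing score-details output, not from this diff:

```rust
// Illustrative only: a hit where the `words` rule ran and everything after
// it was cut off by the time budget.
let details = serde_json::json!({
    "words": { "order": 0, "matchingWords": 3, "maxMatchingWords": 3, "score": 1.0 },
    "skipped": { "order": 1 },
});
```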
```diff
@@ -10,6 +10,7 @@ struct ScoreWithRatioResult {
     matching_words: MatchingWords,
     candidates: RoaringBitmap,
     document_scores: Vec<(u32, ScoreWithRatio)>,
+    degraded: bool,
 }
 
 type ScoreWithRatio = (Vec<ScoreDetails>, f32);
@@ -49,8 +50,12 @@ fn compare_scores(
                 order => return order,
             }
         }
-        (Some(ScoreValue::Score(_)), Some(_)) => return Ordering::Greater,
-        (Some(_), Some(ScoreValue::Score(_))) => return Ordering::Less,
+        (Some(ScoreValue::Score(x)), Some(_)) => {
+            return if x == 0. { Ordering::Less } else { Ordering::Greater }
+        }
+        (Some(_), Some(ScoreValue::Score(x))) => {
+            return if x == 0. { Ordering::Greater } else { Ordering::Less }
+        }
         // if we have this, we're bad
         (Some(ScoreValue::GeoSort(_)), Some(ScoreValue::Sort(_)))
         | (Some(ScoreValue::Sort(_)), Some(ScoreValue::GeoSort(_))) => {
@@ -72,6 +77,7 @@ impl ScoreWithRatioResult {
             matching_words: results.matching_words,
             candidates: results.candidates,
             document_scores,
+            degraded: results.degraded,
         }
     }
 
@@ -106,6 +112,7 @@ impl ScoreWithRatioResult {
             candidates: left.candidates | right.candidates,
             documents_ids,
             document_scores,
+            degraded: left.degraded | right.degraded,
         }
     }
 }
@@ -131,6 +138,7 @@ impl<'a> Search<'a> {
             index: self.index,
             distribution_shift: self.distribution_shift,
             embedder_name: self.embedder_name.clone(),
+            time_budget: self.time_budget.clone(),
         };
 
         let vector_query = search.vector.take();
```
```diff
@@ -11,7 +11,7 @@ use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::vector::DistributionShift;
 use crate::{
     execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result,
-    SearchContext,
+    SearchContext, TimeBudget,
 };
 
 // Building these factories is not free.
@@ -43,6 +43,8 @@ pub struct Search<'a> {
     index: &'a Index,
     distribution_shift: Option<DistributionShift>,
     embedder_name: Option<String>,
+
+    time_budget: TimeBudget,
 }
 
 impl<'a> Search<'a> {
@@ -64,6 +66,7 @@ impl<'a> Search<'a> {
             index,
             distribution_shift: None,
             embedder_name: None,
+            time_budget: TimeBudget::max(),
         }
     }
 
@@ -143,6 +146,11 @@ impl<'a> Search<'a> {
         self
     }
 
+    pub fn time_budget(&mut self, time_budget: TimeBudget) -> &mut Search<'a> {
+        self.time_budget = time_budget;
+        self
+    }
+
     pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
         if has_vector_search {
             let ctx = SearchContext::new(self.index, self.rtxn);
@@ -169,36 +177,43 @@
         }
 
         let universe = filtered_universe(&ctx, &self.filter)?;
-        let PartialSearchResult { located_query_terms, candidates, documents_ids, document_scores } =
-            match self.vector.as_ref() {
-                Some(vector) => execute_vector_search(
-                    &mut ctx,
-                    vector,
-                    self.scoring_strategy,
-                    universe,
-                    &self.sort_criteria,
-                    self.geo_strategy,
-                    self.offset,
-                    self.limit,
-                    self.distribution_shift,
-                    embedder_name,
-                )?,
-                None => execute_search(
-                    &mut ctx,
-                    self.query.as_deref(),
-                    self.terms_matching_strategy,
-                    self.scoring_strategy,
-                    self.exhaustive_number_hits,
-                    universe,
-                    &self.sort_criteria,
-                    self.geo_strategy,
-                    self.offset,
-                    self.limit,
-                    Some(self.words_limit),
-                    &mut DefaultSearchLogger,
-                    &mut DefaultSearchLogger,
-                )?,
-            };
+        let PartialSearchResult {
+            located_query_terms,
+            candidates,
+            documents_ids,
+            document_scores,
+            degraded,
+        } = match self.vector.as_ref() {
+            Some(vector) => execute_vector_search(
+                &mut ctx,
+                vector,
+                self.scoring_strategy,
+                universe,
+                &self.sort_criteria,
+                self.geo_strategy,
+                self.offset,
+                self.limit,
+                self.distribution_shift,
+                embedder_name,
+                self.time_budget.clone(),
+            )?,
+            None => execute_search(
+                &mut ctx,
+                self.query.as_deref(),
+                self.terms_matching_strategy,
+                self.scoring_strategy,
+                self.exhaustive_number_hits,
+                universe,
+                &self.sort_criteria,
+                self.geo_strategy,
+                self.offset,
+                self.limit,
+                Some(self.words_limit),
+                &mut DefaultSearchLogger,
+                &mut DefaultSearchLogger,
+                self.time_budget.clone(),
+            )?,
+        };
 
         // consume context and located_query_terms to build MatchingWords.
         let matching_words = match located_query_terms {
@@ -206,7 +221,7 @@ impl<'a> Search<'a> {
             None => MatchingWords::default(),
         };
 
-        Ok(SearchResult { matching_words, candidates, document_scores, documents_ids })
+        Ok(SearchResult { matching_words, candidates, document_scores, documents_ids, degraded })
     }
 }
 
@@ -229,6 +244,7 @@ impl fmt::Debug for Search<'_> {
             index: _,
             distribution_shift,
             embedder_name,
+            time_budget,
         } = self;
         f.debug_struct("Search")
             .field("query", query)
@@ -244,6 +260,7 @@ impl fmt::Debug for Search<'_> {
             .field("words_limit", words_limit)
             .field("distribution_shift", distribution_shift)
             .field("embedder_name", embedder_name)
+            .field("time_budget", time_budget)
             .finish()
     }
 }
@@ -254,6 +271,7 @@ pub struct SearchResult {
     pub candidates: RoaringBitmap,
     pub documents_ids: Vec<DocumentId>,
     pub document_scores: Vec<Vec<ScoreDetails>>,
+    pub degraded: bool,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
```
```diff
@@ -5,12 +5,14 @@ use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
 use super::SearchContext;
 use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
-use crate::Result;
+use crate::{Result, TimeBudget};
 
 pub struct BucketSortOutput {
     pub docids: Vec<u32>,
     pub scores: Vec<Vec<ScoreDetails>>,
     pub all_candidates: RoaringBitmap,
+
+    pub degraded: bool,
 }
 
 // TODO: would probably be good to regroup some of these inside of a struct?
@@ -25,6 +27,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     length: usize,
     scoring_strategy: ScoringStrategy,
     logger: &mut dyn SearchLogger<Q>,
+    time_budget: TimeBudget,
 ) -> Result<BucketSortOutput> {
     logger.initial_query(query);
     logger.ranking_rules(&ranking_rules);
@@ -41,6 +44,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
             docids: vec![],
             scores: vec![],
             all_candidates: universe.clone(),
+            degraded: false,
         });
     }
     if ranking_rules.is_empty() {
@@ -74,6 +78,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
                 scores: vec![Default::default(); results.len()],
                 docids: results,
                 all_candidates,
+                degraded: false,
             });
         } else {
             let docids: Vec<u32> = universe.iter().skip(from).take(length).collect();
@@ -81,6 +86,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
                 scores: vec![Default::default(); docids.len()],
                 docids,
                 all_candidates: universe.clone(),
+                degraded: false,
             });
         };
     }
@@ -154,6 +160,28 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     }
 
     while valid_docids.len() < length {
+        if time_budget.exceeded() {
+            loop {
+                let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
+                ranking_rule_scores.push(ScoreDetails::Skipped);
+                maybe_add_to_results!(bucket);
+                ranking_rule_scores.pop();
+
+                if cur_ranking_rule_index == 0 {
+                    break;
+                }
+
+                back!();
+            }
+
+            return Ok(BucketSortOutput {
+                scores: valid_scores,
+                docids: valid_docids,
+                all_candidates,
+                degraded: true,
+            });
+        }
+
         // The universe for this bucket is zero, so we don't need to sort
         // anything, just go back to the parent ranking rule.
         if ranking_rule_universes[cur_ranking_rule_index].is_empty()
@@ -219,7 +247,12 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
         )?;
     }
 
-    Ok(BucketSortOutput { docids: valid_docids, scores: valid_scores, all_candidates })
+    Ok(BucketSortOutput {
+        docids: valid_docids,
+        scores: valid_scores,
+        all_candidates,
+        degraded: false,
+    })
 }
 
 /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
```
```diff
@@ -502,7 +502,7 @@ mod tests {
 
     use super::*;
     use crate::index::tests::TempIndex;
-    use crate::{execute_search, filtered_universe, SearchContext};
+    use crate::{execute_search, filtered_universe, SearchContext, TimeBudget};
 
     impl<'a> MatcherBuilder<'a> {
         fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
@@ -522,6 +522,7 @@ mod tests {
                 Some(10),
                 &mut crate::DefaultSearchLogger,
                 &mut crate::DefaultSearchLogger,
+                TimeBudget::max(),
             )
             .unwrap();
 
```
```diff
@@ -52,7 +52,8 @@ use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::search::new::distinct::apply_distinct_rule;
 use crate::vector::DistributionShift;
 use crate::{
-    AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError,
+    AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget,
+    UserError,
 };
 
 /// A structure used throughout the execution of a search query.
@@ -518,6 +519,7 @@ pub fn execute_vector_search(
     length: usize,
     distribution_shift: Option<DistributionShift>,
     embedder_name: &str,
+    time_budget: TimeBudget,
 ) -> Result<PartialSearchResult> {
     check_sort_criteria(ctx, sort_criteria.as_ref())?;
 
@@ -537,7 +539,7 @@ pub fn execute_vector_search(
     let placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery> =
         &mut placeholder_search_logger;
 
-    let BucketSortOutput { docids, scores, all_candidates } = bucket_sort(
+    let BucketSortOutput { docids, scores, all_candidates, degraded } = bucket_sort(
         ctx,
         ranking_rules,
         &PlaceholderQuery,
@@ -546,6 +548,7 @@ pub fn execute_vector_search(
         length,
         scoring_strategy,
         placeholder_search_logger,
+        time_budget,
     )?;
 
     Ok(PartialSearchResult {
@@ -553,6 +556,7 @@ pub fn execute_vector_search(
         document_scores: scores,
         documents_ids: docids,
         located_query_terms: None,
+        degraded,
     })
 }
 
@@ -572,6 +576,7 @@ pub fn execute_search(
     words_limit: Option<usize>,
     placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
     query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
+    time_budget: TimeBudget,
 ) -> Result<PartialSearchResult> {
     check_sort_criteria(ctx, sort_criteria.as_ref())?;
 
@@ -648,6 +653,7 @@ pub fn execute_search(
             length,
             scoring_strategy,
             query_graph_logger,
+            time_budget,
         )?
     } else {
         let ranking_rules =
@@ -661,10 +667,11 @@ pub fn execute_search(
             length,
             scoring_strategy,
             placeholder_search_logger,
+            time_budget,
         )?
     };
 
-    let BucketSortOutput { docids, scores, mut all_candidates } = bucket_sort_output;
+    let BucketSortOutput { docids, scores, mut all_candidates, degraded } = bucket_sort_output;
     let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
 
     // The candidates is the universe unless the exhaustive number of hits
@@ -682,6 +689,7 @@ pub fn execute_search(
         document_scores: scores,
         documents_ids: docids,
         located_query_terms,
+        degraded,
     })
 }
 
@@ -742,4 +750,6 @@ pub struct PartialSearchResult {
     pub candidates: RoaringBitmap,
     pub documents_ids: Vec<DocumentId>,
     pub document_scores: Vec<Vec<ScoreDetails>>,
+
+    pub degraded: bool,
 }
```
milli/src/search/new/tests/cutoff.rs (new file, 429 lines)
@@ -0,0 +1,429 @@

```rust
//! This module tests the search cutoff and ensures a few things:
//! 1. A basic test works and marks the search as degraded
//! 2. A test that ensures the filters are effectively applied even with a cutoff of 0
//! 3. A test that ensures the cutoff works well with the ranking scores

use std::time::Duration;

use big_s::S;
use maplit::hashset;
use meili_snap::snapshot;

use crate::index::tests::TempIndex;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::{Criterion, Filter, Search, TimeBudget};

fn create_index() -> TempIndex {
    let index = TempIndex::new();

    index
        .update_settings(|s| {
            s.set_primary_key("id".to_owned());
            s.set_searchable_fields(vec!["text".to_owned()]);
            s.set_filterable_fields(hashset! { S("id") });
            s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
        })
        .unwrap();

    // reverse the ID / insertion order so we can better tell apart what was sorted from what merely kept the insertion order
    index
        .add_documents(documents!([
            {
                "id": 4,
                "text": "hella puppo kefir",
            },
            {
                "id": 3,
                "text": "hella puppy kefir",
            },
            {
                "id": 2,
                "text": "hello",
            },
            {
                "id": 1,
                "text": "hello puppy",
            },
            {
                "id": 0,
                "text": "hello puppy kefir",
            },
        ]))
        .unwrap();
    index
}

#[test]
fn basic_degraded_search() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(3);
    search.time_budget(TimeBudget::new(Duration::from_millis(0)));

    let result = search.execute().unwrap();
    assert!(result.degraded);
}

#[test]
fn degraded_search_cannot_skip_filter() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(100);
    search.time_budget(TimeBudget::new(Duration::from_millis(0)));
    let filter_condition = Filter::from_str("id > 2").unwrap().unwrap();
    search.filter(filter_condition);

    let result = search.execute().unwrap();
    assert!(result.degraded);
    snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###"
    RoaringBitmap<[0, 1]>
    [0, 1]
    "###);
}

#[test]
#[allow(clippy::format_collect)] // the test is already quite big
fn degraded_search_and_score_details() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(4);
    search.scoring_strategy(ScoringStrategy::Detailed);
    search.time_budget(TimeBudget::max());

    let result = search.execute().unwrap();
    snapshot!(format!("IDs: {:?}\nScores: {}\nScore Details:\n{:#?}", result.documents_ids, result.document_scores.iter().map(|scores| format!("{:.4} ", ScoreDetails::global_score(scores.iter()))).collect::<String>(), result.document_scores), @r###"
    IDs: [4, 1, 0, 3]
    Scores: 1.0000 0.9167 0.8333 0.6667
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 2,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 2,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 2,
                },
            ),
        ],
    ]
    "###);

    // Do ONE loop iteration. Not much can be deduced; almost every document matched the first bucket of the words ranking rule.
    search.time_budget(TimeBudget::max().with_stop_after(1));

    let result = search.execute().unwrap();
    snapshot!(format!("IDs: {:?}\nScores: {}\nScore Details:\n{:#?}", result.documents_ids, result.document_scores.iter().map(|scores| format!("{:.4} ", ScoreDetails::global_score(scores.iter()))).collect::<String>(), result.document_scores), @r###"
    IDs: [0, 1, 4, 2]
    Scores: 0.6667 0.6667 0.6667 0.0000
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);

    // Do TWO loop iterations. The first document should be entirely sorted.
    search.time_budget(TimeBudget::max().with_stop_after(2));

    let result = search.execute().unwrap();
    snapshot!(format!("IDs: {:?}\nScores: {}\nScore Details:\n{:#?}", result.documents_ids, result.document_scores.iter().map(|scores| format!("{:.4} ", ScoreDetails::global_score(scores.iter()))).collect::<String>(), result.document_scores), @r###"
    IDs: [4, 0, 1, 2]
    Scores: 1.0000 0.6667 0.6667 0.0000
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);

    // Do THREE loop iterations. The second document should be entirely sorted as well.
    search.time_budget(TimeBudget::max().with_stop_after(3));

    let result = search.execute().unwrap();
    snapshot!(format!("IDs: {:?}\nScores: {}\nScore Details:\n{:#?}", result.documents_ids, result.document_scores.iter().map(|scores| format!("{:.4} ", ScoreDetails::global_score(scores.iter()))).collect::<String>(), result.document_scores), @r###"
    IDs: [4, 1, 0, 2]
    Scores: 1.0000 0.9167 0.6667 0.0000
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);

    // Do FOUR loop iterations. The third document should be entirely sorted as well.
    // The words bucket has still not progressed, thus the last document doesn't have any info yet.
    search.time_budget(TimeBudget::max().with_stop_after(4));

    let result = search.execute().unwrap();
    snapshot!(format!("IDs: {:?}\nScores: {}\nScore Details:\n{:#?}", result.documents_ids, result.document_scores.iter().map(|scores| format!("{:.4} ", ScoreDetails::global_score(scores.iter()))).collect::<String>(), result.document_scores), @r###"
    IDs: [4, 1, 0, 2]
    Scores: 1.0000 0.9167 0.8333 0.0000
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 2,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Skipped,
        ],
    ]
    "###);

    // After SIX loop iterations, the words ranking rule gave us a new bucket.
    // Since we reached the limit, we were able to exit early without checking the typo ranking rule.
    search.time_budget(TimeBudget::max().with_stop_after(6));

    let result = search.execute().unwrap();
    snapshot!(format!("IDs: {:?}\nScores: {}\nScore Details:\n{:#?}", result.documents_ids, result.document_scores.iter().map(|scores| format!("{:.4} ", ScoreDetails::global_score(scores.iter()))).collect::<String>(), result.document_scores), @r###"
    IDs: [4, 1, 0, 3]
    Scores: 1.0000 0.9167 0.8333 0.3333
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 2,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 2,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
    ]
    "###);
}
```
```diff
@@ -1,5 +1,6 @@
 pub mod attribute_fid;
 pub mod attribute_position;
+pub mod cutoff;
 pub mod distinct;
 pub mod exactness;
 pub mod geo_sort;
```
```diff
@@ -150,6 +150,7 @@ pub struct Settings<'a, 't, 'i> {
     pagination_max_total_hits: Setting<usize>,
     proximity_precision: Setting<ProximityPrecision>,
     embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>,
+    search_cutoff: Setting<u64>,
 }
 
 impl<'a, 't, 'i> Settings<'a, 't, 'i> {
@@ -183,6 +184,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             pagination_max_total_hits: Setting::NotSet,
             proximity_precision: Setting::NotSet,
             embedder_settings: Setting::NotSet,
+            search_cutoff: Setting::NotSet,
             indexer_config,
         }
     }
@@ -373,6 +375,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         self.embedder_settings = Setting::Reset;
     }
 
+    pub fn set_search_cutoff(&mut self, value: u64) {
+        self.search_cutoff = Setting::Set(value);
+    }
+
+    pub fn reset_search_cutoff(&mut self) {
+        self.search_cutoff = Setting::Reset;
+    }
+
     #[tracing::instrument(
         level = "trace",
         skip(self, progress_callback, should_abort, old_fields_ids_map),
@@ -1026,6 +1036,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         Ok(update)
     }
 
+    fn update_search_cutoff(&mut self) -> Result<bool> {
+        let changed = match self.search_cutoff {
+            Setting::Set(new) => {
+                let old = self.index.search_cutoff(self.wtxn)?;
+                if old == Some(new) {
+                    false
+                } else {
+                    self.index.put_search_cutoff(self.wtxn, new)?;
+                    true
+                }
+            }
+            Setting::Reset => self.index.delete_search_cutoff(self.wtxn)?,
+            Setting::NotSet => false,
+        };
+
+        Ok(changed)
+    }
+
     pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
@@ -1079,6 +1107,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
         let embedding_configs_updated = self.update_embedding_configs()?;
 
+        // never trigger re-indexing
+        self.update_search_cutoff()?;
+
         if stop_words_updated
             || non_separator_tokens_updated
             || separator_tokens_updated
@@ -2035,6 +2066,7 @@ mod tests {
                 pagination_max_total_hits,
                 proximity_precision,
                 embedder_settings,
+                search_cutoff,
             } = settings;
             assert!(matches!(searchable_fields, Setting::NotSet));
            assert!(matches!(displayed_fields, Setting::NotSet));
@@ -2058,6 +2090,7 @@ mod tests {
             assert!(matches!(pagination_max_total_hits, Setting::NotSet));
             assert!(matches!(proximity_precision, Setting::NotSet));
             assert!(matches!(embedder_settings, Setting::NotSet));
+            assert!(matches!(search_cutoff, Setting::NotSet));
         })
         .unwrap();
     }
```
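
Tying the settings side together, a hedged sketch of persisting and resetting the cutoff for an index; `IndexerConfig::default()` and the no-op callbacks follow the crate's usual settings-update flow, and the 50 value (milliseconds) is arbitrary:

```rust
use milli::update::{IndexerConfig, Settings};

let config = IndexerConfig::default();

// Persist a 50ms search cutoff for this index.
let mut wtxn = index.write_txn()?;
let mut settings = Settings::new(&mut wtxn, &index, &config);
settings.set_search_cutoff(50);
settings.execute(|_| (), || false)?;
wtxn.commit()?;

// Reset it: the stored value is deleted and searches fall back to the
// 150ms default mentioned in the PR description.
let mut wtxn = index.write_txn()?;
let mut settings = Settings::new(&mut wtxn, &index, &config);
settings.reset_search_cutoff();
settings.execute(|_| (), || false)?;
wtxn.commit()?;
```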