mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 04:17:10 +02:00
Merge #4666
4666: Add a score threshold search parameter r=ManyTheFish a=dureuill # Pull Request ## Related issue Fixes https://github.com/meilisearch/meilisearch/issues/4609 ## What does this PR do? - See [usage](https://meilisearch.notion.site/Filter-by-score-usage-224a183ce7b24ca99b6a9a8da755668a?pvs=25#95b76ded400342ba9ab3d67c734836f0) and [the known limitation](https://meilisearch.notion.site/Filter-by-score-usage-224a183ce7b24ca99b6a9a8da755668a?pvs=25#e4e32195bf0e4195b5daecdbb7a97a17) Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
commit
fc584f1db3
18 changed files with 804 additions and 32 deletions
|
@ -169,6 +169,7 @@ impl<'a> Search<'a> {
|
|||
index: self.index,
|
||||
semantic: self.semantic.clone(),
|
||||
time_budget: self.time_budget.clone(),
|
||||
ranking_score_threshold: self.ranking_score_threshold,
|
||||
};
|
||||
|
||||
let semantic = search.semantic.take();
|
||||
|
|
|
@ -50,6 +50,7 @@ pub struct Search<'a> {
|
|||
index: &'a Index,
|
||||
semantic: Option<SemanticSearch>,
|
||||
time_budget: TimeBudget,
|
||||
ranking_score_threshold: Option<f64>,
|
||||
}
|
||||
|
||||
impl<'a> Search<'a> {
|
||||
|
@ -70,6 +71,7 @@ impl<'a> Search<'a> {
|
|||
index,
|
||||
semantic: None,
|
||||
time_budget: TimeBudget::max(),
|
||||
ranking_score_threshold: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -146,6 +148,11 @@ impl<'a> Search<'a> {
|
|||
self
|
||||
}
|
||||
|
||||
pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Search<'a> {
|
||||
self.ranking_score_threshold = Some(ranking_score_threshold);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
|
||||
if has_vector_search {
|
||||
let ctx = SearchContext::new(self.index, self.rtxn)?;
|
||||
|
@ -184,6 +191,7 @@ impl<'a> Search<'a> {
|
|||
embedder_name,
|
||||
embedder,
|
||||
self.time_budget.clone(),
|
||||
self.ranking_score_threshold,
|
||||
)?
|
||||
}
|
||||
_ => execute_search(
|
||||
|
@ -201,6 +209,7 @@ impl<'a> Search<'a> {
|
|||
&mut DefaultSearchLogger,
|
||||
&mut DefaultSearchLogger,
|
||||
self.time_budget.clone(),
|
||||
self.ranking_score_threshold,
|
||||
)?,
|
||||
};
|
||||
|
||||
|
@ -239,6 +248,7 @@ impl fmt::Debug for Search<'_> {
|
|||
index: _,
|
||||
semantic,
|
||||
time_budget,
|
||||
ranking_score_threshold,
|
||||
} = self;
|
||||
f.debug_struct("Search")
|
||||
.field("query", query)
|
||||
|
@ -257,6 +267,7 @@ impl fmt::Debug for Search<'_> {
|
|||
&semantic.as_ref().map(|semantic| &semantic.embedder_name),
|
||||
)
|
||||
.field("time_budget", time_budget)
|
||||
.field("ranking_score_threshold", ranking_score_threshold)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,6 +28,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
|||
scoring_strategy: ScoringStrategy,
|
||||
logger: &mut dyn SearchLogger<Q>,
|
||||
time_budget: TimeBudget,
|
||||
ranking_score_threshold: Option<f64>,
|
||||
) -> Result<BucketSortOutput> {
|
||||
logger.initial_query(query);
|
||||
logger.ranking_rules(&ranking_rules);
|
||||
|
@ -164,7 +165,19 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
|||
loop {
|
||||
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
|
||||
ranking_rule_scores.push(ScoreDetails::Skipped);
|
||||
|
||||
// remove candidates from the universe without adding them to result if their score is below the threshold
|
||||
if let Some(ranking_score_threshold) = ranking_score_threshold {
|
||||
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
|
||||
if current_score < ranking_score_threshold {
|
||||
all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
|
||||
back!();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
maybe_add_to_results!(bucket);
|
||||
|
||||
ranking_rule_scores.pop();
|
||||
|
||||
if cur_ranking_rule_index == 0 {
|
||||
|
@ -220,6 +233,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
|||
debug_assert!(
|
||||
ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
|
||||
);
|
||||
|
||||
// remove candidates from the universe without adding them to result if their score is below the threshold
|
||||
if let Some(ranking_score_threshold) = ranking_score_threshold {
|
||||
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
|
||||
if current_score < ranking_score_threshold {
|
||||
all_candidates -=
|
||||
next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
|
||||
back!();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
|
||||
|
||||
if cur_ranking_rule_index == ranking_rules_len - 1
|
||||
|
|
|
@ -523,6 +523,7 @@ mod tests {
|
|||
&mut crate::DefaultSearchLogger,
|
||||
&mut crate::DefaultSearchLogger,
|
||||
TimeBudget::max(),
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
|
|
@ -573,6 +573,7 @@ pub fn execute_vector_search(
|
|||
embedder_name: &str,
|
||||
embedder: &Embedder,
|
||||
time_budget: TimeBudget,
|
||||
ranking_score_threshold: Option<f64>,
|
||||
) -> Result<PartialSearchResult> {
|
||||
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
||||
|
||||
|
@ -602,6 +603,7 @@ pub fn execute_vector_search(
|
|||
scoring_strategy,
|
||||
placeholder_search_logger,
|
||||
time_budget,
|
||||
ranking_score_threshold,
|
||||
)?;
|
||||
|
||||
Ok(PartialSearchResult {
|
||||
|
@ -631,6 +633,7 @@ pub fn execute_search(
|
|||
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
|
||||
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
time_budget: TimeBudget,
|
||||
ranking_score_threshold: Option<f64>,
|
||||
) -> Result<PartialSearchResult> {
|
||||
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
||||
|
||||
|
@ -719,6 +722,7 @@ pub fn execute_search(
|
|||
scoring_strategy,
|
||||
query_graph_logger,
|
||||
time_budget,
|
||||
ranking_score_threshold,
|
||||
)?
|
||||
} else {
|
||||
let ranking_rules =
|
||||
|
@ -733,6 +737,7 @@ pub fn execute_search(
|
|||
scoring_strategy,
|
||||
placeholder_search_logger,
|
||||
time_budget,
|
||||
ranking_score_threshold,
|
||||
)?
|
||||
};
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ pub struct Similar<'a> {
|
|||
index: &'a Index,
|
||||
embedder_name: String,
|
||||
embedder: Arc<Embedder>,
|
||||
ranking_score_threshold: Option<f64>,
|
||||
}
|
||||
|
||||
impl<'a> Similar<'a> {
|
||||
|
@ -29,7 +30,17 @@ impl<'a> Similar<'a> {
|
|||
embedder_name: String,
|
||||
embedder: Arc<Embedder>,
|
||||
) -> Self {
|
||||
Self { id, filter: None, offset, limit, rtxn, index, embedder_name, embedder }
|
||||
Self {
|
||||
id,
|
||||
filter: None,
|
||||
offset,
|
||||
limit,
|
||||
rtxn,
|
||||
index,
|
||||
embedder_name,
|
||||
embedder,
|
||||
ranking_score_threshold: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn filter(&mut self, filter: Filter<'a>) -> &mut Self {
|
||||
|
@ -37,8 +48,18 @@ impl<'a> Similar<'a> {
|
|||
self
|
||||
}
|
||||
|
||||
pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Self {
|
||||
self.ranking_score_threshold = Some(ranking_score_threshold);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn execute(&self) -> Result<SearchResult> {
|
||||
let universe = filtered_universe(self.index, self.rtxn, &self.filter)?;
|
||||
let mut universe = filtered_universe(self.index, self.rtxn, &self.filter)?;
|
||||
|
||||
// we never want to receive the docid
|
||||
universe.remove(self.id);
|
||||
|
||||
let universe = universe;
|
||||
|
||||
let embedder_index =
|
||||
self.index
|
||||
|
@ -77,6 +98,8 @@ impl<'a> Similar<'a> {
|
|||
let mut documents_seen = RoaringBitmap::new();
|
||||
documents_seen.insert(self.id);
|
||||
|
||||
let mut candidates = universe;
|
||||
|
||||
for (docid, distance) in results
|
||||
.into_iter()
|
||||
// skip documents we've already seen & mark that we saw the current document
|
||||
|
@ -85,8 +108,6 @@ impl<'a> Similar<'a> {
|
|||
// take **after** filter and skip so that we get exactly limit elements if available
|
||||
.take(self.limit)
|
||||
{
|
||||
documents_ids.push(docid);
|
||||
|
||||
let score = 1.0 - distance;
|
||||
let score = self
|
||||
.embedder
|
||||
|
@ -94,14 +115,28 @@ impl<'a> Similar<'a> {
|
|||
.map(|distribution| distribution.shift(score))
|
||||
.unwrap_or(score);
|
||||
|
||||
let score = ScoreDetails::Vector(score_details::Vector { similarity: Some(score) });
|
||||
let score_details =
|
||||
vec![ScoreDetails::Vector(score_details::Vector { similarity: Some(score) })];
|
||||
|
||||
document_scores.push(vec![score]);
|
||||
let score = ScoreDetails::global_score(score_details.iter());
|
||||
|
||||
if let Some(ranking_score_threshold) = &self.ranking_score_threshold {
|
||||
if score < *ranking_score_threshold {
|
||||
// this document is no longer a candidate
|
||||
candidates.remove(docid);
|
||||
// any document after this one is no longer a candidate either, so restrict the set to documents already seen.
|
||||
candidates &= documents_seen;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
documents_ids.push(docid);
|
||||
document_scores.push(score_details);
|
||||
}
|
||||
|
||||
Ok(SearchResult {
|
||||
matching_words: Default::default(),
|
||||
candidates: universe,
|
||||
candidates,
|
||||
documents_ids,
|
||||
document_scores,
|
||||
degraded: false,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue