Merge remote-tracking branch 'origin/main' into tmp-release-v1.15.1

Clément Renault 2025-06-12 10:21:07 +02:00
commit 9bda9a9a64
No known key found for this signature in database
GPG key ID: F250A4C4E3AE5F5F
58 changed files with 2312 additions and 1756 deletions

View file

@@ -1,11 +1,13 @@
use std::cmp::Ordering;
+use heed::RoTxn;
use itertools::Itertools;
use roaring::RoaringBitmap;
use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
+use crate::search::new::{distinct_fid, distinct_single_docid};
use crate::search::SemanticSearch;
-use crate::{MatchingWords, Result, Search, SearchResult};
+use crate::{Index, MatchingWords, Result, Search, SearchResult};
struct ScoreWithRatioResult {
matching_words: MatchingWords,
@@ -91,7 +93,10 @@ impl ScoreWithRatioResult {
keyword_results: Self,
from: usize,
length: usize,
-) -> (SearchResult, u32) {
+distinct: Option<&str>,
+index: &Index,
+rtxn: &RoTxn<'_>,
+) -> Result<(SearchResult, u32)> {
#[derive(Clone, Copy)]
enum ResultSource {
Semantic,
@@ -106,8 +111,9 @@ impl ScoreWithRatioResult {
vector_results.document_scores.len() + keyword_results.document_scores.len(),
);
-let mut documents_seen = RoaringBitmap::new();
-for ((docid, (main_score, _sub_score)), source) in vector_results
+let distinct_fid = distinct_fid(distinct, index, rtxn)?;
+let mut excluded_documents = RoaringBitmap::new();
+for res in vector_results
.document_scores
.into_iter()
.zip(std::iter::repeat(ResultSource::Semantic))
@@ -121,13 +127,33 @@ impl ScoreWithRatioResult {
compare_scores(left, right).is_ge()
},
)
-// remove documents we already saw
-.filter(|((docid, _), _)| documents_seen.insert(*docid))
+// remove documents we already saw and apply distinct rule
+.filter_map(|item @ ((docid, _), _)| {
+if !excluded_documents.insert(docid) {
+// the document was already added, or is indistinct from an already-added document.
+return None;
+}
+if let Some(distinct_fid) = distinct_fid {
+if let Err(error) = distinct_single_docid(
+index,
+rtxn,
+distinct_fid,
+docid,
+&mut excluded_documents,
+) {
+return Some(Err(error));
+}
+}
+Some(Ok(item))
+})
// start skipping **after** the filter
.skip(from)
// take **after** skipping
.take(length)
{
+let ((docid, (main_score, _sub_score)), source) = res?;
if let ResultSource::Semantic = source {
semantic_hit_count += 1;
}
@@ -136,10 +162,24 @@ impl ScoreWithRatioResult {
document_scores.push(main_score);
}
-(
+// compute the set of candidates from both sets
+let candidates = vector_results.candidates | keyword_results.candidates;
+let must_remove_redundant_candidates = distinct_fid.is_some();
+let candidates = if must_remove_redundant_candidates {
+// patch-up the candidates to remove the indistinct documents, then add back the actual hits
+let mut candidates = candidates - excluded_documents;
+for docid in &documents_ids {
+candidates.insert(*docid);
+}
+candidates
+} else {
+candidates
+};
+Ok((
SearchResult {
matching_words: keyword_results.matching_words,
-candidates: vector_results.candidates | keyword_results.candidates,
+candidates,
documents_ids,
document_scores,
degraded: vector_results.degraded | keyword_results.degraded,
@@ -147,7 +187,7 @@ impl ScoreWithRatioResult {
| keyword_results.used_negative_operator,
},
semantic_hit_count,
-)
+))
}
}
@@ -226,8 +266,15 @@ impl Search<'_> {
let keyword_results = ScoreWithRatioResult::new(keyword_results, 1.0 - semantic_ratio);
let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio);
-let (merge_results, semantic_hit_count) =
-ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit);
+let (merge_results, semantic_hit_count) = ScoreWithRatioResult::merge(
+vector_results,
+keyword_results,
+self.offset,
+self.limit,
+search.distinct.as_deref(),
+search.index,
+search.rtxn,
+)?;
assert!(merge_results.documents_ids.len() <= self.limit);
Ok((merge_results, Some(semantic_hit_count)))
}
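
Note on the merge change above: the new iterator chain deduplicates hits while growing an exclusion set with every document that shares a distinct value with an accepted hit, and it threads store errors through filter_map as Option<Result<_>> so ? can surface them per item. A minimal, self-contained sketch of that pattern follows; the HashSet, DocId, and indistinct_ids stand-ins are illustrative assumptions, not milli's actual types (milli uses RoaringBitmap and distinct_single_docid):

    use std::collections::HashSet;

    type DocId = u32;

    // Stand-in for a distinct lookup: returns the other docids sharing this
    // document's distinct value, or an error from the (hypothetical) store.
    fn indistinct_ids(docid: DocId) -> Result<Vec<DocId>, String> {
        Ok(vec![docid + 100]) // toy rule: each docid aliases docid + 100
    }

    fn merge_distinct(
        ranked: Vec<(DocId, f32)>,
        from: usize,
        length: usize,
    ) -> Result<Vec<(DocId, f32)>, String> {
        let mut excluded: HashSet<DocId> = HashSet::new();
        ranked
            .into_iter()
            // remove documents we already saw and apply the distinct rule
            .filter_map(|item @ (docid, _)| {
                if !excluded.insert(docid) {
                    return None; // already added, or indistinct from an added hit
                }
                match indistinct_ids(docid) {
                    // exclude every document sharing this distinct value
                    Ok(ids) => excluded.extend(ids),
                    Err(e) => return Some(Err(e)),
                }
                Some(Ok(item))
            })
            // paginate only after filtering, as in the patch above
            .skip(from)
            .take(length)
            .collect()
    }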

View file

@@ -4,7 +4,9 @@ use super::logger::SearchLogger;
use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
use super::SearchContext;
use crate::score_details::{ScoreDetails, ScoringStrategy};
-use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
+use crate::search::new::distinct::{
+apply_distinct_rule, distinct_fid, distinct_single_docid, DistinctOutput,
+};
use crate::{Result, TimeBudget};
pub struct BucketSortOutput {
@@ -35,16 +37,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
logger.ranking_rules(&ranking_rules);
logger.initial_universe(universe);
-let distinct_field = match distinct {
-Some(distinct) => Some(distinct),
-None => ctx.index.distinct_field(ctx.txn)?,
-};
-let distinct_fid = if let Some(field) = distinct_field {
-ctx.index.fields_ids_map(ctx.txn)?.id(field)
-} else {
-None
-};
+let distinct_fid = distinct_fid(distinct, ctx.index, ctx.txn)?;
if universe.len() < from as u64 {
return Ok(BucketSortOutput {

View file

@@ -9,7 +9,7 @@ use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec,
};
use crate::heed_codec::BytesRefCodec;
-use crate::{Index, Result, SearchContext};
+use crate::{FieldId, Index, Result, SearchContext};
pub struct DistinctOutput {
pub remaining: RoaringBitmap,
@@ -121,3 +121,18 @@ pub fn facet_string_values<'a>(
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}
+pub fn distinct_fid(
+query_distinct_field: Option<&str>,
+index: &Index,
+rtxn: &RoTxn<'_>,
+) -> Result<Option<FieldId>> {
+let distinct_field = match query_distinct_field {
+Some(distinct) => Some(distinct),
+None => index.distinct_field(rtxn)?,
+};
+let distinct_fid =
+if let Some(field) = distinct_field { index.fields_ids_map(rtxn)?.id(field) } else { None };
+Ok(distinct_fid)
+}
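
The distinct_fid helper introduced here gives bucket sort and the hybrid merge one shared answer to "which field id is distinct?": the query-level distinct wins over the index-level setting, and the surviving name is resolved through the fields-ids map. A hedged sketch of that precedence with plain std types (the map and names below are illustrative stand-ins, not milli's API, and the sketch drops the Result that the real lookups need):

    use std::collections::HashMap;

    // A plain map stands in for the index's fields-ids map.
    fn resolve_distinct_fid(
        query_distinct: Option<&str>,
        index_distinct: Option<&str>,
        fields_ids: &HashMap<String, u16>,
    ) -> Option<u16> {
        // the query-level field takes precedence over the index setting
        query_distinct.or(index_distinct).and_then(|field| fields_ids.get(field).copied())
    }

    fn main() {
        let fields: HashMap<String, u16> = [("sku".to_string(), 3)].into();
        assert_eq!(resolve_distinct_fid(Some("sku"), None, &fields), Some(3));
        assert_eq!(resolve_distinct_fid(None, Some("sku"), &fields), Some(3));
        assert_eq!(resolve_distinct_fid(None, None, &fields), None);
    }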

View file

@@ -28,6 +28,7 @@ use std::time::Duration;
use bucket_sort::{bucket_sort, BucketSortOutput};
use charabia::{Language, TokenizerBuilder};
use db_cache::DatabaseCache;
+pub use distinct::{distinct_fid, distinct_single_docid};
use exact_attribute::ExactAttribute;
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
use heed::RoTxn;
@@ -51,6 +52,7 @@ pub use self::geo_sort::{Parameter as GeoSortParameter, Strategy as GeoSortStrategy};
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use self::vector_sort::VectorSort;
+use crate::attribute_patterns::{match_pattern, PatternMatch};
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::index::PrefixSearch;
use crate::localized_attributes_rules::LocalizedFieldIds;
@@ -119,17 +121,37 @@ impl<'ctx> SearchContext<'ctx> {
let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?;
let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?;
-let mut wildcard = false;
+let mut universal_wildcard = false;
let mut restricted_fids = RestrictedFids::default();
for field_name in attributes_to_search_on {
if field_name == "*" {
-wildcard = true;
+universal_wildcard = true;
// we cannot early exit as we want to return an error in case of unknown fields
continue;
}
let searchable_weight =
searchable_fields_weights.iter().find(|(name, _, _)| name == field_name);
+// The field is not searchable but may contain a wildcard pattern
+if searchable_weight.is_none() && field_name.contains("*") {
+let matching_searchable_weights: Vec<_> = searchable_fields_weights
+.iter()
+.filter(|(name, _, _)| match_pattern(field_name, name) == PatternMatch::Match)
+.collect();
+if !matching_searchable_weights.is_empty() {
+for (_name, fid, weight) in matching_searchable_weights {
+if exact_attributes_ids.contains(fid) {
+restricted_fids.exact.push((*fid, *weight));
+} else {
+restricted_fids.tolerant.push((*fid, *weight));
+}
+}
+continue;
+}
+}
let (fid, weight) = match searchable_weight {
// The Field id exist and the field is searchable
Some((_name, fid, weight)) => (*fid, *weight),
@@ -159,7 +181,7 @@ impl<'ctx> SearchContext<'ctx> {
};
}
-if wildcard {
+if universal_wildcard {
self.restricted_fids = None;
} else {
self.restricted_fids = Some(restricted_fids);
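
For the attributes-to-search-on change above: a field name that is not itself searchable but contains * is now expanded against the searchable fields via match_pattern, while a bare "*" is treated as a universal wildcard. The sketch below illustrates the expansion idea only; the simple trailing-* prefix matcher is an assumption, since the real semantics live in milli's crate::attribute_patterns::match_pattern:

    // Simplified stand-in for match_pattern: exact match, or prefix match
    // when the pattern ends with `*`. The real matcher is more general.
    fn pattern_matches(pattern: &str, name: &str) -> bool {
        match pattern.strip_suffix('*') {
            Some(prefix) => name.starts_with(prefix),
            None => pattern == name,
        }
    }

    fn main() {
        let searchable = ["title", "overview.short", "overview.long"];
        // "overview.*" is not a searchable field itself, but matches two of them
        let restricted: Vec<&str> = searchable
            .iter()
            .copied()
            .filter(|name| pattern_matches("overview.*", name))
            .collect();
        assert_eq!(restricted, ["overview.short", "overview.long"]);
    }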

View file

@@ -72,7 +72,7 @@ fn test_2gram_simple() {
let index = create_index();
index
.update_settings(|s| {
-s.set_autorize_typos(false);
+s.set_authorize_typos(false);
})
.unwrap();
@@ -103,7 +103,7 @@ fn test_3gram_simple() {
let index = create_index();
index
.update_settings(|s| {
-s.set_autorize_typos(false);
+s.set_authorize_typos(false);
})
.unwrap();
@@ -153,7 +153,7 @@ fn test_no_disable_ngrams() {
let index = create_index();
index
.update_settings(|s| {
-s.set_autorize_typos(false);
+s.set_authorize_typos(false);
})
.unwrap();
@@ -179,7 +179,7 @@ fn test_2gram_prefix() {
let index = create_index();
index
.update_settings(|s| {
-s.set_autorize_typos(false);
+s.set_authorize_typos(false);
})
.unwrap();
@@ -208,7 +208,7 @@ fn test_3gram_prefix() {
let index = create_index();
index
.update_settings(|s| {
-s.set_autorize_typos(false);
+s.set_authorize_typos(false);
})
.unwrap();
@@ -260,7 +260,7 @@ fn test_disable_split_words() {
let index = create_index();
index
.update_settings(|s| {
-s.set_autorize_typos(false);
+s.set_authorize_typos(false);
})
.unwrap();

View file

@@ -151,7 +151,7 @@ fn test_no_typo() {
let index = create_index();
index
.update_settings(|s| {
-s.set_autorize_typos(false);
+s.set_authorize_typos(false);
})
.unwrap();