Refactor geo_max_bucket_size injection

This commit is contained in:
hdt3213 2025-04-15 20:05:44 +08:00
parent 7c1c4f9c26
commit fd7fbfa9eb
6 changed files with 51 additions and 50 deletions

View File

@ -164,7 +164,7 @@ impl Search<'_> {
sort_criteria: self.sort_criteria.clone(),
distinct: self.distinct.clone(),
searchable_attributes: self.searchable_attributes,
geo_strategy: self.geo_strategy,
geo_param: self.geo_param,
terms_matching_strategy: self.terms_matching_strategy,
scoring_strategy: ScoringStrategy::Detailed,
words_limit: self.words_limit,

View File

@ -45,7 +45,7 @@ pub struct Search<'a> {
sort_criteria: Option<Vec<AscDesc>>,
distinct: Option<String>,
searchable_attributes: Option<&'a [String]>,
geo_strategy: new::GeoSortStrategy,
geo_param: new::GeoSortParameter,
terms_matching_strategy: TermsMatchingStrategy,
scoring_strategy: ScoringStrategy,
words_limit: usize,
@ -68,7 +68,7 @@ impl<'a> Search<'a> {
sort_criteria: None,
distinct: None,
searchable_attributes: None,
geo_strategy: new::GeoSortStrategy::default(),
geo_param: new::GeoSortParameter::default(),
terms_matching_strategy: TermsMatchingStrategy::default(),
scoring_strategy: Default::default(),
exhaustive_number_hits: false,
@ -145,7 +145,13 @@ impl<'a> Search<'a> {
#[cfg(test)]
pub fn geo_sort_strategy(&mut self, strategy: new::GeoSortStrategy) -> &mut Search<'a> {
self.geo_strategy = strategy;
self.geo_param.strategy = strategy;
self
}
#[cfg(test)]
pub fn geo_max_bucket_size(&mut self, max_size: u64) -> &mut Search<'a> {
self.geo_param.max_bucket_size = max_size;
self
}
@ -232,7 +238,7 @@ impl<'a> Search<'a> {
universe,
&self.sort_criteria,
&self.distinct,
self.geo_strategy,
self.geo_param,
self.offset,
self.limit,
embedder_name,
@ -251,7 +257,7 @@ impl<'a> Search<'a> {
universe,
&self.sort_criteria,
&self.distinct,
self.geo_strategy,
self.geo_param,
self.offset,
self.limit,
Some(self.words_limit),
@ -290,7 +296,7 @@ impl fmt::Debug for Search<'_> {
sort_criteria,
distinct,
searchable_attributes,
geo_strategy: _,
geo_param: _,
terms_matching_strategy,
scoring_strategy,
words_limit,

View File

@ -39,6 +39,22 @@ fn facet_number_values<'a>(
Ok(iter)
}
#[derive(Debug, Clone, Copy)]
pub struct Parameter {
// Define the strategy used by the geo sort
pub strategy: Strategy,
// Limit the number of docs in a single bucket to avoid unexpectedly large overhead
pub max_bucket_size: u64,
// Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal
pub distance_error_margin: f64,
}
impl Default for Parameter {
fn default() -> Self {
Self { strategy: Strategy::default(), max_bucket_size: 1000, distance_error_margin: 1.0 }
}
}
/// Define the strategy used by the geo sort.
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
/// the point where we move from using the iterative strategy to the rtree.
@ -71,26 +87,6 @@ impl Strategy {
}
}
#[cfg(not(test))]
fn default_max_bucket_size() -> u64 {
1000
}
#[cfg(test)]
static DEFAULT_MAX_BUCKET_SIZE: std::sync::Mutex<u64> = std::sync::Mutex::new(1000);
#[cfg(test)]
pub fn set_default_max_bucket_size(n: u64) {
let mut size = DEFAULT_MAX_BUCKET_SIZE.lock().unwrap();
*size = n;
}
#[cfg(test)]
fn default_max_bucket_size() -> u64 {
let max_size = *(DEFAULT_MAX_BUCKET_SIZE.lock().unwrap());
max_size
}
pub struct GeoSort<Q: RankingRuleQueryTrait> {
query: Option<Q>,
@ -111,22 +107,23 @@ pub struct GeoSort<Q: RankingRuleQueryTrait> {
impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
pub fn new(
strategy: Strategy,
parameter: &Parameter,
geo_faceted_docids: RoaringBitmap,
point: [f64; 2],
ascending: bool,
) -> Result<Self> {
let Parameter { strategy, max_bucket_size, distance_error_margin } = parameter;
Ok(Self {
query: None,
strategy,
strategy: *strategy,
ascending,
point,
geo_candidates: geo_faceted_docids,
field_ids: None,
rtree: None,
cached_sorted_docids: VecDeque::new(),
max_bucket_size: default_max_bucket_size(),
distance_error_margin: 1.0,
max_bucket_size: *max_bucket_size,
distance_error_margin: *distance_error_margin,
})
}

View File

@ -513,7 +513,7 @@ mod tests {
universe,
&None,
&None,
crate::search::new::GeoSortStrategy::default(),
crate::search::new::GeoSortParameter::default(),
0,
100,
Some(10),

View File

@ -45,6 +45,7 @@ use sort::Sort;
use self::distinct::facet_string_values;
use self::geo_sort::GeoSort;
pub use self::geo_sort::Parameter as GeoSortParameter;
pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
@ -274,7 +275,7 @@ fn resolve_negative_phrases(
fn get_ranking_rules_for_placeholder_search<'ctx>(
ctx: &SearchContext<'ctx>,
sort_criteria: &Option<Vec<AscDesc>>,
geo_strategy: geo_sort::Strategy,
geo_param: geo_sort::Parameter,
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
let mut sort = false;
let mut sorted_fields = HashSet::new();
@ -299,7 +300,7 @@ fn get_ranking_rules_for_placeholder_search<'ctx>(
&mut ranking_rules,
&mut sorted_fields,
&mut geo_sorted,
geo_strategy,
&geo_param,
)?;
sort = true;
}
@ -326,7 +327,7 @@ fn get_ranking_rules_for_placeholder_search<'ctx>(
fn get_ranking_rules_for_vector<'ctx>(
ctx: &SearchContext<'ctx>,
sort_criteria: &Option<Vec<AscDesc>>,
geo_strategy: geo_sort::Strategy,
geo_param: geo_sort::Parameter,
limit_plus_offset: usize,
target: &[f32],
embedder_name: &str,
@ -375,7 +376,7 @@ fn get_ranking_rules_for_vector<'ctx>(
&mut ranking_rules,
&mut sorted_fields,
&mut geo_sorted,
geo_strategy,
&geo_param,
)?;
sort = true;
}
@ -403,7 +404,7 @@ fn get_ranking_rules_for_vector<'ctx>(
fn get_ranking_rules_for_query_graph_search<'ctx>(
ctx: &SearchContext<'ctx>,
sort_criteria: &Option<Vec<AscDesc>>,
geo_strategy: geo_sort::Strategy,
geo_param: geo_sort::Parameter,
terms_matching_strategy: TermsMatchingStrategy,
) -> Result<Vec<BoxRankingRule<'ctx, QueryGraph>>> {
// query graph search
@ -477,7 +478,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
&mut ranking_rules,
&mut sorted_fields,
&mut geo_sorted,
geo_strategy,
&geo_param,
)?;
sort = true;
}
@ -514,7 +515,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
ranking_rules: &mut Vec<BoxRankingRule<'ctx, Query>>,
sorted_fields: &mut HashSet<String>,
geo_sorted: &mut bool,
geo_strategy: geo_sort::Strategy,
geo_param: &geo_sort::Parameter,
) -> Result<()> {
let sort_criteria = sort_criteria.clone().unwrap_or_default();
ranking_rules.reserve(sort_criteria.len());
@ -540,7 +541,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
}
let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
ranking_rules.push(Box::new(GeoSort::new(
geo_strategy,
geo_param,
geo_faceted_docids,
point,
true,
@ -552,7 +553,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
}
let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
ranking_rules.push(Box::new(GeoSort::new(
geo_strategy,
geo_param,
geo_faceted_docids,
point,
false,
@ -584,7 +585,7 @@ pub fn execute_vector_search(
universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy,
geo_param: geo_sort::Parameter,
from: usize,
length: usize,
embedder_name: &str,
@ -600,7 +601,7 @@ pub fn execute_vector_search(
let ranking_rules = get_ranking_rules_for_vector(
ctx,
sort_criteria,
geo_strategy,
geo_param,
from + length,
vector,
embedder_name,
@ -647,7 +648,7 @@ pub fn execute_search(
mut universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy,
geo_param: geo_sort::Parameter,
from: usize,
length: usize,
words_limit: Option<usize>,
@ -761,7 +762,7 @@ pub fn execute_search(
let ranking_rules = get_ranking_rules_for_query_graph_search(
ctx,
sort_criteria,
geo_strategy,
geo_param,
terms_matching_strategy,
)?;
@ -783,7 +784,7 @@ pub fn execute_search(
)?
} else {
let ranking_rules =
get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?;
get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_param)?;
bucket_sort(
ctx,
ranking_rules,

View File

@ -157,10 +157,10 @@ fn test_geo_sort_reached_max_bucket_size() {
]))
.unwrap();
crate::search::new::geo_sort::set_default_max_bucket_size(2);
let rtxn = index.read_txn().unwrap();
let mut s = Search::new(&rtxn, &index);
s.geo_max_bucket_size(2);
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.sort_criteria(vec![
AscDesc::Asc(Member::Geo([0., 0.])),
@ -200,9 +200,6 @@ fn test_geo_sort_reached_max_bucket_size() {
}
let no_geo_ids = rtree_ids[10..].iter().collect_vec();
insta::assert_snapshot!(format!("{no_geo_ids:?}"), @r#"["1", "4", "3", "2", "5"]"#);
// recover settings
crate::search::new::geo_sort::set_default_max_bucket_size(1000);
}
#[test]