Fix errors, clippy warnings, and add review comments

This commit is contained in:
Loïc Lecrenier 2023-04-29 11:40:00 +02:00
parent 48f5bb1693
commit 59b12fca87
3 changed files with 59 additions and 49 deletions

View File

@ -105,7 +105,7 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
/// Refill the internal buffer of cached docids based on the strategy. /// Refill the internal buffer of cached docids based on the strategy.
/// Drop the rtree if we don't need it anymore. /// Drop the rtree if we don't need it anymore.
fn fill_buffer<'ctx>(&mut self, ctx: &mut SearchContext<'ctx>) -> Result<()> { fn fill_buffer(&mut self, ctx: &mut SearchContext) -> Result<()> {
debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng"); debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
debug_assert!(self.cached_sorted_docids.is_empty()); debug_assert!(self.cached_sorted_docids.is_empty());
@ -133,7 +133,13 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
// and only keep the latest candidates. // and only keep the latest candidates.
for point in rtree.nearest_neighbor_iter(&point) { for point in rtree.nearest_neighbor_iter(&point) {
if self.geo_candidates.contains(point.data.0) { if self.geo_candidates.contains(point.data.0) {
self.cached_sorted_docids.pop_front(); // REVIEW COMMENT: that doesn't look right, because we only keep the furthest point in the cache.
// Then the cache will be exhausted after the first bucket and we'll need to repopulate it again immediately.
// I think it's okay if we keep every document id in the cache instead. It's a high memory usage,
// but we already have the whole rtree in memory, which is bigger than a vector of all document ids.
//
// self.cached_sorted_docids.pop_front();
//
self.cached_sorted_docids.push_back(point.data.0); self.cached_sorted_docids.push_back(point.data.0);
} }
} }
@ -163,7 +169,10 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
)) ))
}) })
.collect::<Result<Vec<(u32, [f64; 2])>>>()?; .collect::<Result<Vec<(u32, [f64; 2])>>>()?;
documents.sort_by_key(|(_, p)| distance_between_two_points(&self.point, &p) as usize); // REVIEW COMMENT: the haversine distance function can be quite expensive, I think, so it's probably faster
// to use `sort_by_cached_key` instead of `sort_by_key`.
documents
.sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);
self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id)); self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id));
}; };
@ -193,7 +202,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
self.query = Some(query.clone()); self.query = Some(query.clone());
self.geo_candidates &= universe; self.geo_candidates &= universe;
if self.geo_candidates.len() == 0 { if self.geo_candidates.is_empty() {
return Ok(()); return Ok(());
} }
@ -203,6 +212,10 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
self.field_ids = Some([lat, lng]); self.field_ids = Some([lat, lng]);
if self.strategy.use_rtree(self.geo_candidates.len() as usize) { if self.strategy.use_rtree(self.geo_candidates.len() as usize) {
// REVIEW COMMENT: I would prefer to always keep the rtree in memory so that we don't have to deserialize it
// every time the geosort ranking rule starts iterating.
// So we'd initialize it in `::new` and never drop it.
//
self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree")); self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));
} }
@ -210,6 +223,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
Ok(()) Ok(())
} }
#[allow(clippy::only_used_in_recursion)]
fn next_bucket( fn next_bucket(
&mut self, &mut self,
ctx: &mut SearchContext<'ctx>, ctx: &mut SearchContext<'ctx>,

View File

@ -401,8 +401,12 @@ pub fn execute_search(
let graph = QueryGraph::from_query(ctx, &query_terms)?; let graph = QueryGraph::from_query(ctx, &query_terms)?;
located_query_terms = Some(query_terms); located_query_terms = Some(query_terms);
let ranking_rules = let ranking_rules = get_ranking_rules_for_query_graph_search(
get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; ctx,
sort_criteria,
geo_strategy,
terms_matching_strategy,
)?;
universe = universe =
resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?; resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?;

View File

@ -1,14 +1,9 @@
/*! /*!
This module tests the `geo_sort` ranking rule: This module tests the `geo_sort` ranking rule:
1. an error is returned if the sort ranking rule exists but no fields-to-sort were given at search time REVIEW COMMENT:
2. an error is returned if the fields-to-sort are not sortable - nice tests :)
3. it is possible to add multiple fields-to-sort at search time - add anything that seems not obvious about the behaviour of the geosort ranking rule here
4. custom sort ranking rules can be added to the settings, they interact with the generic `sort` ranking rule as expected
5. numbers appear before strings
6. documents with either: (1) no value, (2) null, or (3) an object for the field-to-sort appear at the end of the bucket
7. boolean values are translated to strings
8. if a field contains an array, it is sorted by the best value in the array according to the sort rule
*/ */
use big_s::S; use big_s::S;
@ -40,21 +35,21 @@ fn execute_iterative_and_rtree_returns_the_same<'a>(
) -> Vec<usize> { ) -> Vec<usize> {
search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(2)); search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(2));
let SearchResult { documents_ids, .. } = search.execute().unwrap(); let SearchResult { documents_ids, .. } = search.execute().unwrap();
let iterative_ids_bucketed = collect_field_values(&index, rtxn, "id", &documents_ids); let iterative_ids_bucketed = collect_field_values(index, rtxn, "id", &documents_ids);
search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(1000)); search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(1000));
let SearchResult { documents_ids, .. } = search.execute().unwrap(); let SearchResult { documents_ids, .. } = search.execute().unwrap();
let iterative_ids = collect_field_values(&index, rtxn, "id", &documents_ids); let iterative_ids = collect_field_values(index, rtxn, "id", &documents_ids);
assert_eq!(iterative_ids_bucketed, iterative_ids, "iterative bucket"); assert_eq!(iterative_ids_bucketed, iterative_ids, "iterative bucket");
search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(2)); search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(2));
let SearchResult { documents_ids, .. } = search.execute().unwrap(); let SearchResult { documents_ids, .. } = search.execute().unwrap();
let rtree_ids_bucketed = collect_field_values(&index, rtxn, "id", &documents_ids); let rtree_ids_bucketed = collect_field_values(index, rtxn, "id", &documents_ids);
search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(1000)); search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(1000));
let SearchResult { documents_ids, .. } = search.execute().unwrap(); let SearchResult { documents_ids, .. } = search.execute().unwrap();
let rtree_ids = collect_field_values(&index, rtxn, "id", &documents_ids); let rtree_ids = collect_field_values(index, rtxn, "id", &documents_ids);
assert_eq!(rtree_ids_bucketed, rtree_ids, "rtree bucket"); assert_eq!(rtree_ids_bucketed, rtree_ids, "rtree bucket");
@ -67,15 +62,24 @@ fn execute_iterative_and_rtree_returns_the_same<'a>(
fn test_geo_sort() { fn test_geo_sort() {
let index = create_index(); let index = create_index();
// REVIEW COMMENT:
// I prefer to make the external ids correspond to the internal ids so that
// we can check whether the ranking rules are actually doing work instead of
// returning documents in order of their internal ids.
//
index index
.add_documents(documents!([ .add_documents(documents!([
{ "id": 2, "_geo": { "lat": 2, "lng": -1 } }, { "id": 0, "_geo": { "lat": 2, "lng": -1 } },
{ "id": 3, "_geo": { "lat": -2, "lng": -2 } }, { "id": 1, "_geo": { "lat": -2, "lng": -2 } },
{ "id": 5, "_geo": { "lat": 6, "lng": -5 } }, { "id": 2, "_geo": { "lat": 6, "lng": -5 } },
{ "id": 4, "_geo": { "lat": 3, "lng": 5 } }, { "id": 3, "_geo": { "lat": 3, "lng": 5 } },
{ "id": 0, "_geo": { "lat": 0, "lng": 0 } }, { "id": 4, "_geo": { "lat": 0, "lng": 0 } },
{ "id": 1, "_geo": { "lat": 1, "lng": 1 } }, { "id": 5, "_geo": { "lat": 1, "lng": 1 } },
{ "id": 6 }, { "id": 8 }, { "id": 7 }, { "id": 10 }, { "id": 9 }, { "id": 6 },
{ "id": 7 },
{ "id": 8 },
{ "id": 9 },
{ "id": 10 },
])) ]))
.unwrap(); .unwrap();
@ -88,66 +92,54 @@ fn test_geo_sort() {
s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); s.geo_sort_strategy(GeoSortStrategy::Dynamic(100));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); s.geo_sort_strategy(GeoSortStrategy::Dynamic(3));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###);
// --- desc // --- desc
s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]); s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]);
s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); s.geo_sort_strategy(GeoSortStrategy::Dynamic(100));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); s.geo_sort_strategy(GeoSortStrategy::Dynamic(3));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###);
s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3));
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
let ids = collect_field_values(&index, &txn, "id", &documents_ids); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]");
insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###);
} }
#[test] #[test]