diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs index b841dfe9c..9e1da4479 100644 --- a/milli/src/search/new/geo_sort.rs +++ b/milli/src/search/new/geo_sort.rs @@ -105,7 +105,7 @@ impl GeoSort { /// Refill the internal buffer of cached docids based on the strategy. /// Drop the rtree if we don't need it anymore. - fn fill_buffer<'ctx>(&mut self, ctx: &mut SearchContext<'ctx>) -> Result<()> { + fn fill_buffer(&mut self, ctx: &mut SearchContext) -> Result<()> { debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng"); debug_assert!(self.cached_sorted_docids.is_empty()); @@ -133,7 +133,13 @@ impl GeoSort { // and only keep the latest candidates. for point in rtree.nearest_neighbor_iter(&point) { if self.geo_candidates.contains(point.data.0) { - self.cached_sorted_docids.pop_front(); + // REVIEW COMMENT: that doesn't look right, because we only keep the furthest point in the cache. + // Then the cache will be exhausted after the first bucket and we'll need to repopulate it again immediately. + // I think it's okay if we keep every document id in the cache instead. It's a high memory usage, + // but we already have the whole rtree in memory, which is bigger than a vector of all document ids. + // + // self.cached_sorted_docids.pop_front(); + // self.cached_sorted_docids.push_back(point.data.0); } } @@ -163,7 +169,10 @@ impl GeoSort { )) }) .collect::>>()?; - documents.sort_by_key(|(_, p)| distance_between_two_points(&self.point, &p) as usize); + // REVIEW COMMENT: the haversine distance function can be quite expensive, I think, so it's probably faster + // to use `sort_by_cached_key` instead of `sort_by_key`. + documents + .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize); self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id)); }; @@ -193,7 +202,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { self.query = Some(query.clone()); self.geo_candidates &= universe; - if self.geo_candidates.len() == 0 { + if self.geo_candidates.is_empty() { return Ok(()); } @@ -203,6 +212,10 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { self.field_ids = Some([lat, lng]); if self.strategy.use_rtree(self.geo_candidates.len() as usize) { + // REVIEW COMMENT: I would prefer to always keep the rtree in memory so that we don't have to deserialize it + // every time the geosort ranking rule starts iterating. + // So we'd initialize it in `::new` and never drop it. + // self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree")); } @@ -210,6 +223,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { Ok(()) } + #[allow(clippy::only_used_in_recursion)] fn next_bucket( &mut self, ctx: &mut SearchContext<'ctx>, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index eb006fbf3..2faf20a1d 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -401,8 +401,12 @@ pub fn execute_search( let graph = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(query_terms); - let ranking_rules = - get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; + let ranking_rules = get_ranking_rules_for_query_graph_search( + ctx, + sort_criteria, + geo_strategy, + terms_matching_strategy, + )?; universe = resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?; diff --git a/milli/src/search/new/tests/geo_sort.rs b/milli/src/search/new/tests/geo_sort.rs index e49fd7c99..3072007db 100644 --- a/milli/src/search/new/tests/geo_sort.rs +++ b/milli/src/search/new/tests/geo_sort.rs @@ -1,14 +1,9 @@ /*! This module tests the `geo_sort` ranking rule: -1. an error is returned if the sort ranking rule exists but no fields-to-sort were given at search time -2. an error is returned if the fields-to-sort are not sortable -3. it is possible to add multiple fields-to-sort at search time -4. custom sort ranking rules can be added to the settings, they interact with the generic `sort` ranking rule as expected -5. numbers appear before strings -6. documents with either: (1) no value, (2) null, or (3) an object for the field-to-sort appear at the end of the bucket -7. boolean values are translated to strings -8. if a field contains an array, it is sorted by the best value in the array according to the sort rule +REVIEW COMMENT: + - nice tests :) + - add anything that seems not obvious about the behaviour of the geosort ranking rule here */ use big_s::S; @@ -40,21 +35,21 @@ fn execute_iterative_and_rtree_returns_the_same<'a>( ) -> Vec { search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(2)); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let iterative_ids_bucketed = collect_field_values(&index, rtxn, "id", &documents_ids); + let iterative_ids_bucketed = collect_field_values(index, rtxn, "id", &documents_ids); search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(1000)); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let iterative_ids = collect_field_values(&index, rtxn, "id", &documents_ids); + let iterative_ids = collect_field_values(index, rtxn, "id", &documents_ids); assert_eq!(iterative_ids_bucketed, iterative_ids, "iterative bucket"); search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(2)); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let rtree_ids_bucketed = collect_field_values(&index, rtxn, "id", &documents_ids); + let rtree_ids_bucketed = collect_field_values(index, rtxn, "id", &documents_ids); search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(1000)); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let rtree_ids = collect_field_values(&index, rtxn, "id", &documents_ids); + let rtree_ids = collect_field_values(index, rtxn, "id", &documents_ids); assert_eq!(rtree_ids_bucketed, rtree_ids, "rtree bucket"); @@ -67,15 +62,24 @@ fn execute_iterative_and_rtree_returns_the_same<'a>( fn test_geo_sort() { let index = create_index(); + // REVIEW COMMENT: + // I prefer to make the external ids correspond to the internal ids so that + // we can check whether the ranking rules are actually doing work instead of + // returning documents in order of their internal ids. + // index .add_documents(documents!([ - { "id": 2, "_geo": { "lat": 2, "lng": -1 } }, - { "id": 3, "_geo": { "lat": -2, "lng": -2 } }, - { "id": 5, "_geo": { "lat": 6, "lng": -5 } }, - { "id": 4, "_geo": { "lat": 3, "lng": 5 } }, - { "id": 0, "_geo": { "lat": 0, "lng": 0 } }, - { "id": 1, "_geo": { "lat": 1, "lng": 1 } }, - { "id": 6 }, { "id": 8 }, { "id": 7 }, { "id": 10 }, { "id": 9 }, + { "id": 0, "_geo": { "lat": 2, "lng": -1 } }, + { "id": 1, "_geo": { "lat": -2, "lng": -2 } }, + { "id": 2, "_geo": { "lat": 6, "lng": -5 } }, + { "id": 3, "_geo": { "lat": 3, "lng": 5 } }, + { "id": 4, "_geo": { "lat": 0, "lng": 0 } }, + { "id": 5, "_geo": { "lat": 1, "lng": 1 } }, + { "id": 6 }, + { "id": 7 }, + { "id": 8 }, + { "id": 9 }, + { "id": 10 }, ])) .unwrap(); @@ -88,66 +92,54 @@ fn test_geo_sort() { s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["0", "1", "2", "3", "4", "5", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 3, 2, 6, 7, 8, 9, 10]"); // --- desc s.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]); s.geo_sort_strategy(GeoSortStrategy::Dynamic(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::Dynamic(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(100)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); s.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(3)); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - let ids = collect_field_values(&index, &txn, "id", &documents_ids); - insta::assert_snapshot!(format!("{ids:?}"), @r###"["5", "4", "3", "2", "1", "0", "6", "8", "7", "10", "9"]"###); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 5, 4, 6, 7, 8, 9, 10]"); } #[test]