From 8875d24a4858a8445ad80d2f91ce7aa4c7c6077f Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 2 May 2023 11:39:35 +0200
Subject: [PATCH] deserialize the rtree only when it's needed, and keep it in
 memory once it has been deserialized

---
 milli/src/search/new/geo_sort.rs | 38 +++++++++++-----------------------------
 1 file changed, 9 insertions(+), 29 deletions(-)

diff --git a/milli/src/search/new/geo_sort.rs b/milli/src/search/new/geo_sort.rs
index 9e1da4479..98eacd48a 100644
--- a/milli/src/search/new/geo_sort.rs
+++ b/milli/src/search/new/geo_sort.rs
@@ -111,12 +111,13 @@ impl GeoSort {

         // if we had an rtree and the strategy doesn't require one anymore we can drop it
         let use_rtree = self.strategy.use_rtree(self.geo_candidates.len() as usize);
-        if !use_rtree && self.rtree.is_some() {
-            self.rtree = None;
+        if use_rtree && self.rtree.is_none() {
+            self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));
         }

         let cache_size = self.strategy.cache_size();
-        if let Some(ref mut rtree) = self.rtree {
+        if use_rtree {
+            let rtree = self.rtree.as_ref().unwrap();
             let point = lat_lng_to_xyz(&self.point);

             if self.ascending {
@@ -169,18 +170,12 @@ impl GeoSort {
                     ))
                 })
                 .collect::<Result<Vec<(u32, [f64; 2])>>>()?;
-            // REVIEW COMMENT: the haversine distance function can be quite expensive, I think, so it's probably faster
-            // to use `sort_by_cached_key` instead of `sort_by_key`.
+            // computing the distance between two points is expensive, thus we cache the result
             documents
                 .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);

             self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id));
         };
-        if self.cached_sorted_docids.is_empty() && matches!(self.strategy, Strategy::AlwaysRtree(_))
-        {
-            // this shouldn't be possible
-            self.rtree = None;
-        }
         Ok(())
     }
 }
@@ -210,15 +205,6 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort {
         let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
         let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
         self.field_ids = Some([lat, lng]);
-
-        if self.strategy.use_rtree(self.geo_candidates.len() as usize) {
-            // REVIEW COMMENT: I would prefer to always keep the rtree in memory so that we don't have to deserialize it
-            // every time the geosort ranking rule starts iterating.
-            // So we'd initialize it in `::new` and never drop it.
-            //
-            self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));
-        }
-
         self.fill_buffer(ctx)?;
         Ok(())
     }
@@ -256,20 +242,14 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort {
         }

         // if we got out of this loop it means we've exhausted our cache.
-
-        if self.rtree.is_none() {
-            // with no rtree it means all geo candidates have been returned. We can return all the non geo-faceted documents
-            Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }))
-        } else {
-            // else, we need to refill our bucket and run the function again
-            self.fill_buffer(ctx)?;
-            self.next_bucket(ctx, logger, universe)
-        }
+        // we need to refill it and run the function again.
+        self.fill_buffer(ctx)?;
+        self.next_bucket(ctx, logger, universe)
     }

     fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
+        // we do not reset the rtree here, as it could be used in the next iteration
         self.query = None;
-        self.rtree = None;
         self.cached_sorted_docids.clear();
     }
 }
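
For readers who want the pattern in isolation: below is a minimal, standalone sketch of the lazy-load-and-cache behaviour this patch gives the rtree (deserialize on first use, then keep the value in memory). It does not use the milli API; `LazyRtree`, `get`, and the `Vec<u32>` stand-in are hypothetical names chosen for illustration only.

// Hypothetical sketch (not milli code): deserialize an expensive value only when it is
// first needed, then keep it in memory so every later call reuses it.
struct LazyRtree {
    // `None` until the first use; `Some(..)` once the value has been deserialized.
    rtree: Option<Vec<u32>>, // stand-in for the real rtree type
}

impl LazyRtree {
    fn new() -> Self {
        LazyRtree { rtree: None }
    }

    // Deserialize on the first call only, mirroring the new
    // `if use_rtree && self.rtree.is_none()` branch in the patch.
    fn get(&mut self) -> &Vec<u32> {
        self.rtree.get_or_insert_with(|| {
            // stand-in for the expensive `ctx.index.geo_rtree(ctx.txn)` call
            println!("deserializing the rtree");
            vec![1, 2, 3]
        })
    }
}

fn main() {
    let mut lazy = LazyRtree::new();
    let first = lazy.get().len(); // pays the deserialization cost once
    let again = lazy.get().len(); // served from memory, nothing is printed
    assert_eq!(first, again);
}

The patch goes one step further than this sketch: `end_iteration` no longer clears the cached rtree, so the deserialization cost is paid at most once for the lifetime of the ranking rule rather than once per iteration.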