MeiliSearch/milli/src/search/new/geo_sort.rs

276 lines
9.6 KiB
Rust
Raw Normal View History

2023-04-13 13:45:34 +02:00
use std::collections::VecDeque;
use std::iter::FromIterator;
use heed::types::{ByteSlice, Unit};
use heed::{RoPrefix, RoTxn};
use roaring::RoaringBitmap;
use rstar::RTree;
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
use crate::{
distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext,
SearchLogger,
};
const FID_SIZE: usize = 2;
const DOCID_SIZE: usize = 4;
#[allow(clippy::drop_non_drop)]
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}
/// Return an iterator over each number value in the given field of the given document.
fn facet_number_values<'a>(
docid: u32,
field_id: u16,
index: &Index,
txn: &'a RoTxn,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {
let key = facet_values_prefix_key(field_id, docid);
let iter = index
.field_id_docid_facet_f64s
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type();
Ok(iter)
}
/// Define the strategy used by the geo sort.
/// The paramater represents the cache size, and, in the case of the Dynamic strategy,
/// the point where we move from using the iterative strategy to the rtree.
#[derive(Debug, Clone, Copy)]
pub enum Strategy {
AlwaysIterative(usize),
AlwaysRtree(usize),
Dynamic(usize),
}
impl Default for Strategy {
fn default() -> Self {
Strategy::Dynamic(1000)
}
}
impl Strategy {
pub fn use_rtree(&self, candidates: usize) -> bool {
match self {
Strategy::AlwaysIterative(_) => false,
Strategy::AlwaysRtree(_) => true,
Strategy::Dynamic(i) => candidates >= *i,
}
}
pub fn cache_size(&self) -> usize {
match self {
Strategy::AlwaysIterative(i) | Strategy::AlwaysRtree(i) | Strategy::Dynamic(i) => *i,
}
}
}
pub struct GeoSort<Q: RankingRuleQueryTrait> {
query: Option<Q>,
strategy: Strategy,
ascending: bool,
point: [f64; 2],
field_ids: Option<[u16; 2]>,
rtree: Option<RTree<GeoPoint>>,
cached_sorted_docids: VecDeque<u32>,
geo_candidates: RoaringBitmap,
}
impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
pub fn new(
strategy: Strategy,
geo_faceted_docids: RoaringBitmap,
point: [f64; 2],
ascending: bool,
) -> Result<Self> {
Ok(Self {
query: None,
strategy,
ascending,
point,
geo_candidates: geo_faceted_docids,
field_ids: None,
rtree: None,
cached_sorted_docids: VecDeque::new(),
})
}
/// Refill the internal buffer of cached docids based on the strategy.
/// Drop the rtree if we don't need it anymore.
fn fill_buffer(&mut self, ctx: &mut SearchContext) -> Result<()> {
2023-04-13 13:45:34 +02:00
debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
debug_assert!(self.cached_sorted_docids.is_empty());
// if we had an rtree and the strategy doesn't require one anymore we can drop it
let use_rtree = self.strategy.use_rtree(self.geo_candidates.len() as usize);
if !use_rtree && self.rtree.is_some() {
self.rtree = None;
}
let cache_size = self.strategy.cache_size();
if let Some(ref mut rtree) = self.rtree {
let point = lat_lng_to_xyz(&self.point);
if self.ascending {
for point in rtree.nearest_neighbor_iter(&point) {
if self.geo_candidates.contains(point.data.0) {
self.cached_sorted_docids.push_back(point.data.0);
if self.cached_sorted_docids.len() >= cache_size {
break;
}
}
}
} else {
// in the case of the desc geo sort we have to scan the whole database
// and only keep the latest candidates.
for point in rtree.nearest_neighbor_iter(&point) {
if self.geo_candidates.contains(point.data.0) {
// REVIEW COMMENT: that doesn't look right, because we only keep the furthest point in the cache.
// Then the cache will be exhausted after the first bucket and we'll need to repopulate it again immediately.
// I think it's okay if we keep every document id in the cache instead. It's a high memory usage,
// but we already have the whole rtree in memory, which is bigger than a vector of all document ids.
//
// self.cached_sorted_docids.pop_front();
//
2023-04-13 13:45:34 +02:00
self.cached_sorted_docids.push_back(point.data.0);
}
}
}
} else {
// the iterative version
let [lat, lng] = self.field_ids.unwrap();
let mut documents = self
.geo_candidates
.iter()
.map(|id| -> Result<_> {
Ok((
id,
[
facet_number_values(id, lat, ctx.index, ctx.txn)?
.next()
.expect("A geo faceted document doesn't contain any lat")?
.0
.2,
facet_number_values(id, lng, ctx.index, ctx.txn)?
.next()
.expect("A geo faceted document doesn't contain any lng")?
.0
.2,
],
))
})
.collect::<Result<Vec<(u32, [f64; 2])>>>()?;
// REVIEW COMMENT: the haversine distance function can be quite expensive, I think, so it's probably faster
// to use `sort_by_cached_key` instead of `sort_by_key`.
documents
.sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);
2023-04-13 13:45:34 +02:00
self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id));
};
if self.cached_sorted_docids.is_empty() && matches!(self.strategy, Strategy::AlwaysRtree(_))
{
// this shouldn't be possible
self.rtree = None;
}
Ok(())
}
}
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
fn id(&self) -> String {
"geo_sort".to_owned()
}
fn start_iteration(
&mut self,
ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<Q>,
universe: &RoaringBitmap,
query: &Q,
) -> Result<()> {
assert!(self.query.is_none());
self.query = Some(query.clone());
self.geo_candidates &= universe;
if self.geo_candidates.is_empty() {
2023-04-13 13:45:34 +02:00
return Ok(());
}
let fid_map = ctx.index.fields_ids_map(ctx.txn)?;
let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
self.field_ids = Some([lat, lng]);
if self.strategy.use_rtree(self.geo_candidates.len() as usize) {
// REVIEW COMMENT: I would prefer to always keep the rtree in memory so that we don't have to deserialize it
// every time the geosort ranking rule starts iterating.
// So we'd initialize it in `::new` and never drop it.
//
2023-04-13 13:45:34 +02:00
self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));
}
self.fill_buffer(ctx)?;
Ok(())
}
#[allow(clippy::only_used_in_recursion)]
2023-04-13 13:45:34 +02:00
fn next_bucket(
&mut self,
ctx: &mut SearchContext<'ctx>,
logger: &mut dyn SearchLogger<Q>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<Q>>> {
assert!(universe.len() > 1);
let query = self.query.as_ref().unwrap().clone();
self.geo_candidates &= universe;
if self.geo_candidates.is_empty() {
return Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }));
}
let ascending = self.ascending;
let next = |cache: &mut VecDeque<_>| {
if ascending {
cache.pop_front()
} else {
cache.pop_back()
}
};
while let Some(id) = next(&mut self.cached_sorted_docids) {
if self.geo_candidates.contains(id) {
return Ok(Some(RankingRuleOutput {
query,
candidates: RoaringBitmap::from_iter([id]),
}));
}
}
// if we got out of this loop it means we've exhausted our cache.
if self.rtree.is_none() {
// with no rtree it means all geo candidates have been returned. We can return all the non geo-faceted documents
Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }))
} else {
// else, we need to refill our bucket and run the function again
self.fill_buffer(ctx)?;
self.next_bucket(ctx, logger, universe)
}
}
fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
self.query = None;
self.rtree = None;
self.cached_sorted_docids.clear();
}
}