MeiliSearch/milli/src/search/new/geo_sort.rs

use std::collections::VecDeque;
use std::iter::FromIterator;

use heed::types::{ByteSlice, Unit};
use heed::{RoPrefix, RoTxn};
use roaring::RoaringBitmap;
use rstar::RTree;

use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
use crate::{
    distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext,
    SearchLogger,
};

const FID_SIZE: usize = 2;
const DOCID_SIZE: usize = 4;

/// Build the byte prefix made of a field id followed by a document id.
///
/// Both integers are written in big-endian order: the first `FID_SIZE` bytes hold
/// `field_id` and the following `DOCID_SIZE` bytes hold `docid`.
fn facet_values_prefix_key(field_id: u16, docid: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
    // Destructuring the fixed-size arrays lets the compiler check the lengths
    // against `FID_SIZE + DOCID_SIZE`, without any macro or lint suppression.
    let [f0, f1] = field_id.to_be_bytes();
    let [d0, d1, d2, d3] = docid.to_be_bytes();
    [f0, f1, d0, d1, d2, d3]
}

/// Iterate over the number values stored for `field_id` in the document `docid`.
///
/// The entries are read from `field_id_docid_facet_f64s` using a raw byte prefix made of
/// the field id followed by the document id. The returned iterator decodes its keys
/// with `FieldDocIdFacetCodec<OrderedF64Codec>`.
fn facet_number_values<'a>(
    docid: u32,
    field_id: u16,
    index: &Index,
    txn: &'a RoTxn,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {
    let prefix = facet_values_prefix_key(field_id, docid);

    // The prefix is looked up as raw bytes, hence the temporary `ByteSlice` key codec.
    let raw_entries =
        index.field_id_docid_facet_f64s.remap_key_type::<ByteSlice>().prefix_iter(txn, &prefix)?;

    // Switch back to the typed key codec expected by the caller.
    Ok(raw_entries.remap_key_type())
}

/// Selects how the geo sort fills its cache of sorted documents.
///
/// Every variant carries a single `usize` parameter that is used as the cache size.
/// For [`Strategy::Dynamic`] the same value is also the number of candidates from
/// which the rtree is used instead of the iterative strategy.
#[derive(Debug, Clone, Copy)]
pub enum Strategy {
    AlwaysIterative(usize),
    AlwaysRtree(usize),
    Dynamic(usize),
}

impl Default for Strategy {
    /// The default is the dynamic strategy with a parameter of 1000.
    fn default() -> Self {
        Self::Dynamic(1000)
    }
}

impl Strategy {
    /// Tell whether the rtree must be used for the given number of candidates.
    pub fn use_rtree(&self, candidates: usize) -> bool {
        match *self {
            Self::AlwaysRtree(_) => true,
            Self::AlwaysIterative(_) => false,
            // the dynamic strategy switches to the rtree once the threshold is reached
            Self::Dynamic(threshold) => threshold <= candidates,
        }
    }

    /// Return the parameter carried by the variant, which is the cache size.
    pub fn cache_size(&self) -> usize {
        let (Self::AlwaysIterative(size) | Self::AlwaysRtree(size) | Self::Dynamic(size)) = *self;
        size
    }
}

/// Ranking rule state used to order documents by their distance to a reference point.
///
/// Document ids are buffered in `cached_sorted_docids` by `fill_buffer` and handed out
/// one per bucket by `next_bucket`.
pub struct GeoSort<Q: RankingRuleQueryTrait> {
    /// Query received in `start_iteration`; reset to `None` in `end_iteration`.
    query: Option<Q>,

    /// Chooses between the rtree and the iterative refill, and gives the cache size.
    strategy: Strategy,
    /// When `true` the cache is consumed from the front, otherwise from the back.
    ascending: bool,
    /// Reference point the distances are computed from.
    // NOTE(review): presumably `[lat, lng]`, matching the order of `field_ids` — confirm with callers.
    point: [f64; 2],
    /// Field ids of `_geo.lat` and `_geo.lng`, in that order; set by `start_iteration`.
    field_ids: Option<[u16; 2]>,
    /// Geo rtree of the index, loaded the first time the rtree strategy is used.
    rtree: Option<RTree<GeoPoint>>,

    /// Document ids buffered by `fill_buffer`, ordered from the nearest to the furthest.
    cached_sorted_docids: VecDeque<u32>,
    /// Geo-faceted documents, intersected with every universe given to the rule.
    geo_candidates: RoaringBitmap,
}

impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
    /// Create a geo sort over `geo_faceted_docids`, relative to `point`.
    ///
    /// `ascending` selects whether the nearest (`true`) or the furthest (`false`)
    /// documents come first. The query, the field ids and the rtree start unset and
    /// the cache starts empty.
    pub fn new(
        strategy: Strategy,
        geo_faceted_docids: RoaringBitmap,
        point: [f64; 2],
        ascending: bool,
    ) -> Result<Self> {
        let rule = Self {
            strategy,
            point,
            ascending,
            geo_candidates: geo_faceted_docids,
            cached_sorted_docids: VecDeque::new(),
            query: None,
            field_ids: None,
            rtree: None,
        };

        Ok(rule)
    }

    /// Refill the internal buffer of cached docids based on the strategy.
    ///
    /// The strategy decides, from the current number of geo candidates, whether the
    /// rtree or the iterative version is used. Once loaded, the rtree is kept in
    /// `self.rtree` so that later refills can reuse it.
    fn fill_buffer(&mut self, ctx: &mut SearchContext) -> Result<()> {
        debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
        debug_assert!(self.cached_sorted_docids.is_empty());

        let candidates_count = self.geo_candidates.len() as usize;
        if self.strategy.use_rtree(candidates_count) {
            self.fill_buffer_from_rtree(ctx)
        } else {
            self.fill_buffer_iteratively(ctx)
        }
    }

    /// Fill the cache by walking the rtree from the nearest to the furthest point.
    fn fill_buffer_from_rtree(&mut self, ctx: &mut SearchContext) -> Result<()> {
        // the rtree is only fetched from the index the first time it is needed
        if self.rtree.is_none() {
            self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));
        }
        let rtree = self.rtree.as_ref().unwrap();

        let origin = lat_lng_to_xyz(&self.point);
        let limit = self.strategy.cache_size();

        for neighbor in rtree.nearest_neighbor_iter(&origin) {
            let docid = neighbor.data.0;
            if !self.geo_candidates.contains(docid) {
                continue;
            }

            self.cached_sorted_docids.push_back(docid);

            // In ascending order we can stop as soon as the cache is full.
            // In descending order the furthest documents come last, so the whole rtree
            // is scanned and every candidate is kept: keeping only the last one would
            // exhaust the cache after a single bucket and force an immediate refill.
            if self.ascending && self.cached_sorted_docids.len() >= limit {
                break;
            }
        }

        Ok(())
    }

    /// Fill the cache by reading the coordinates of every candidate and sorting them
    /// by their distance to the reference point.
    fn fill_buffer_iteratively(&mut self, ctx: &mut SearchContext) -> Result<()> {
        let [lat_fid, lng_fid] = self.field_ids.unwrap();

        let mut located: Vec<(u32, [f64; 2])> = Vec::new();
        for docid in self.geo_candidates.iter() {
            let lat = facet_number_values(docid, lat_fid, ctx.index, ctx.txn)?
                .next()
                .expect("A geo faceted document doesn't contain any lat")?
                .0
                 .2;
            let lng = facet_number_values(docid, lng_fid, ctx.index, ctx.txn)?
                .next()
                .expect("A geo faceted document doesn't contain any lng")?
                .0
                 .2;
            located.push((docid, [lat, lng]));
        }

        // computing the distance between two points is expensive thus we cache the result
        let origin = self.point;
        located.sort_by_cached_key(|(_, coords)| {
            distance_between_two_points(&origin, coords) as usize
        });

        self.cached_sorted_docids.extend(located.into_iter().map(|(docid, _)| docid));

        Ok(())
    }
}

impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
    fn id(&self) -> String {
        "geo_sort".to_owned()
    }

    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<Q>,
        universe: &RoaringBitmap,
        query: &Q,
    ) -> Result<()> {
        assert!(self.query.is_none());

        self.query = Some(query.clone());
        self.geo_candidates &= universe;

        if self.geo_candidates.is_empty() {
            return Ok(());
        }

        let fid_map = ctx.index.fields_ids_map(ctx.txn)?;
        let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
        let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
        self.field_ids = Some([lat, lng]);
        self.fill_buffer(ctx)?;
        Ok(())
    }

    #[allow(clippy::only_used_in_recursion)]
    fn next_bucket(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        logger: &mut dyn SearchLogger<Q>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<Q>>> {
        assert!(universe.len() > 1);
        let query = self.query.as_ref().unwrap().clone();
        self.geo_candidates &= universe;

        if self.geo_candidates.is_empty() {
            return Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }));
        }

        let ascending = self.ascending;
        let next = |cache: &mut VecDeque<_>| {
            if ascending {
                cache.pop_front()
            } else {
                cache.pop_back()
            }
        };
        while let Some(id) = next(&mut self.cached_sorted_docids) {
            if self.geo_candidates.contains(id) {
                return Ok(Some(RankingRuleOutput {
                    query,
                    candidates: RoaringBitmap::from_iter([id]),
                }));
            }
        }

        // if we got out of this loop it means we've exhausted our cache.
        // we need to refill it and run the function again.
        self.fill_buffer(ctx)?;
        self.next_bucket(ctx, logger, universe)
    }

    fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
        // we do not reset the rtree here, it could be used in a next iteration
        self.query = None;
        self.cached_sorted_docids.clear();
    }
}
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`use std::collections::VecDeque;`
			`use std::iter::FromIterator;`

			`use heed::types::{ByteSlice, Unit};`
			`use heed::{RoPrefix, RoTxn};`
			`use roaring::RoaringBitmap;`
			`use rstar::RTree;`

			`use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};`
			`use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};`
			`use crate::{`
			`distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext,`
			`SearchLogger,`
			`};`

			`const FID_SIZE: usize = 2;`
			`const DOCID_SIZE: usize = 4;`

			`#[allow(clippy::drop_non_drop)]`
			`fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {`
			`concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())`
			`}`

			`/// Return an iterator over each number value in the given field of the given document.`
			`fn facet_number_values<'a>(`
			`docid: u32,`
			`field_id: u16,`
			`index: &Index,`
			`txn: &'a RoTxn,`
			`) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {`
			`let key = facet_values_prefix_key(field_id, docid);`

			`let iter = index`
			`.field_id_docid_facet_f64s`
			`.remap_key_type::<ByteSlice>()`
			`.prefix_iter(txn, &key)?`
			`.remap_key_type();`

			`Ok(iter)`
			`}`

			`/// Define the strategy used by the geo sort.`
			`/// The parameter represents the cache size, and, in the case of the Dynamic strategy,`
			`/// the point where we move from using the iterative strategy to the rtree.`
			`#[derive(Debug, Clone, Copy)]`
			`pub enum Strategy {`
			`AlwaysIterative(usize),`
			`AlwaysRtree(usize),`
			`Dynamic(usize),`
			`}`

			`impl Default for Strategy {`
			`fn default() -> Self {`
			`Strategy::Dynamic(1000)`
			`}`
			`}`

			`impl Strategy {`
			`pub fn use_rtree(&self, candidates: usize) -> bool {`
			`match self {`
			`Strategy::AlwaysIterative(_) => false,`
			`Strategy::AlwaysRtree(_) => true,`
			`Strategy::Dynamic(i) => candidates >= *i,`
			`}`
			`}`

			`pub fn cache_size(&self) -> usize {`
			`match self {`
			`Strategy::AlwaysIterative(i) \| Strategy::AlwaysRtree(i) \| Strategy::Dynamic(i) => *i,`
			`}`
			`}`
			`}`

			`pub struct GeoSort<Q: RankingRuleQueryTrait> {`
			`query: Option<Q>,`

			`strategy: Strategy,`
			`ascending: bool,`
			`point: [f64; 2],`
			`field_ids: Option<[u16; 2]>,`
			`rtree: Option<RTree<GeoPoint>>,`

			`cached_sorted_docids: VecDeque<u32>,`
			`geo_candidates: RoaringBitmap,`
			`}`

			`impl<Q: RankingRuleQueryTrait> GeoSort<Q> {`
			`pub fn new(`
			`strategy: Strategy,`
			`geo_faceted_docids: RoaringBitmap,`
			`point: [f64; 2],`
			`ascending: bool,`
			`) -> Result<Self> {`
			`Ok(Self {`
			`query: None,`
			`strategy,`
			`ascending,`
			`point,`
			`geo_candidates: geo_faceted_docids,`
			`field_ids: None,`
			`rtree: None,`
			`cached_sorted_docids: VecDeque::new(),`
			`})`
			`}`

			`/// Refill the internal buffer of cached docids based on the strategy.`
			`/// Drop the rtree if we don't need it anymore.`
Fix errors, clippy warnings, and add review comments 2023-04-29 11:40:00 +02:00			`fn fill_buffer(&mut self, ctx: &mut SearchContext) -> Result<()> {`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");`
			`debug_assert!(self.cached_sorted_docids.is_empty());`

			`// if we had an rtree and the strategy doesn't require one anymore we can drop it`
			`let use_rtree = self.strategy.use_rtree(self.geo_candidates.len() as usize);`
deserialize the rtree only when its needed, and keep it in memory once it has been deserialized 2023-05-02 11:39:35 +02:00			`if use_rtree && self.rtree.is_none() {`
			`self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`}`

			`let cache_size = self.strategy.cache_size();`
deserialize the rtree only when its needed, and keep it in memory once it has been deserialized 2023-05-02 11:39:35 +02:00			`if use_rtree {`
			`let rtree = self.rtree.as_ref().unwrap();`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`let point = lat_lng_to_xyz(&self.point);`

			`if self.ascending {`
			`for point in rtree.nearest_neighbor_iter(&point) {`
			`if self.geo_candidates.contains(point.data.0) {`
			`self.cached_sorted_docids.push_back(point.data.0);`
			`if self.cached_sorted_docids.len() >= cache_size {`
			`break;`
			`}`
			`}`
			`}`
			`} else {`
			`// in the case of the desc geo sort we have to scan the whole database`
			`// and only keep the latest candidates.`
			`for point in rtree.nearest_neighbor_iter(&point) {`
			`if self.geo_candidates.contains(point.data.0) {`
Fix errors, clippy warnings, and add review comments 2023-04-29 11:40:00 +02:00			`// REVIEW COMMENT: that doesn't look right, because we only keep the furthest point in the cache.`
			`// Then the cache will be exhausted after the first bucket and we'll need to repopulate it again immediately.`
			`// I think it's okay if we keep every document id in the cache instead. It's a high memory usage,`
			`// but we already have the whole rtree in memory, which is bigger than a vector of all document ids.`
			`//`
			`// self.cached_sorted_docids.pop_front();`
			`//`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`self.cached_sorted_docids.push_back(point.data.0);`
			`}`
			`}`
			`}`
			`} else {`
			`// the iterative version`
			`let [lat, lng] = self.field_ids.unwrap();`

			`let mut documents = self`
			`.geo_candidates`
			`.iter()`
			`.map(\|id\| -> Result<_> {`
			`Ok((`
			`id,`
			`[`
			`facet_number_values(id, lat, ctx.index, ctx.txn)?`
			`.next()`
			`.expect("A geo faceted document doesn't contain any lat")?`
			`.0`
			`.2,`
			`facet_number_values(id, lng, ctx.index, ctx.txn)?`
			`.next()`
			`.expect("A geo faceted document doesn't contain any lng")?`
			`.0`
			`.2,`
			`],`
			`))`
			`})`
			`.collect::<Result<Vec<(u32, [f64; 2])>>>()?;`
deserialize the rtree only when its needed, and keep it in memory once it has been deserialized 2023-05-02 11:39:35 +02:00			`// computing the distance between two points is expensive thus we cache the result`
Fix errors, clippy warnings, and add review comments 2023-04-29 11:40:00 +02:00			`documents`
			`.sort_by_cached_key(\|(_, p)\| distance_between_two_points(&self.point, p) as usize);`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`self.cached_sorted_docids.extend(documents.into_iter().map(\|(doc_id, _)\| doc_id));`
			`};`

			`Ok(())`
			`}`
			`}`

			`impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {`
			`fn id(&self) -> String {`
			`"geo_sort".to_owned()`
			`}`

			`fn start_iteration(`
			`&mut self,`
			`ctx: &mut SearchContext<'ctx>,`
			`_logger: &mut dyn SearchLogger<Q>,`
			`universe: &RoaringBitmap,`
			`query: &Q,`
			`) -> Result<()> {`
			`assert!(self.query.is_none());`

			`self.query = Some(query.clone());`
			`self.geo_candidates &= universe;`

Fix errors, clippy warnings, and add review comments 2023-04-29 11:40:00 +02:00			`if self.geo_candidates.is_empty() {`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`return Ok(());`
			`}`

			`let fid_map = ctx.index.fields_ids_map(ctx.txn)?;`
			`let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");`
			`let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");`
			`self.field_ids = Some([lat, lng]);`
			`self.fill_buffer(ctx)?;`
			`Ok(())`
			`}`

Fix errors, clippy warnings, and add review comments 2023-04-29 11:40:00 +02:00			`#[allow(clippy::only_used_in_recursion)]`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`fn next_bucket(`
			`&mut self,`
			`ctx: &mut SearchContext<'ctx>,`
			`logger: &mut dyn SearchLogger<Q>,`
			`universe: &RoaringBitmap,`
			`) -> Result<Option<RankingRuleOutput<Q>>> {`
			`assert!(universe.len() > 1);`
			`let query = self.query.as_ref().unwrap().clone();`
			`self.geo_candidates &= universe;`

			`if self.geo_candidates.is_empty() {`
			`return Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }));`
			`}`

			`let ascending = self.ascending;`
			`let next = \|cache: &mut VecDeque<_>\| {`
			`if ascending {`
			`cache.pop_front()`
			`} else {`
			`cache.pop_back()`
			`}`
			`};`
			`while let Some(id) = next(&mut self.cached_sorted_docids) {`
			`if self.geo_candidates.contains(id) {`
			`return Ok(Some(RankingRuleOutput {`
			`query,`
			`candidates: RoaringBitmap::from_iter([id]),`
			`}));`
			`}`
			`}`

			`// if we got out of this loop it means we've exhausted our cache.`
deserialize the rtree only when its needed, and keep it in memory once it has been deserialized 2023-05-02 11:39:35 +02:00			`// we need to refill it and run the function again.`
			`self.fill_buffer(ctx)?;`
			`self.next_bucket(ctx, logger, universe)`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`}`

			`fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {`
deserialize the rtree only when its needed, and keep it in memory once it has been deserialized 2023-05-02 11:39:35 +02:00			`// we do not reset the rtree here, it could be used in a next iteration`
Implements the geo-sort ranking rule 2023-04-13 13:45:34 +02:00			`self.query = None;`
			`self.cached_sorted_docids.clear();`
			`}`
			`}`