mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Move geosort code out of search
This commit is contained in:
parent
63827bbee0
commit
e35d58b531
10 changed files with 257 additions and 243 deletions
|
@ -1,14 +1,70 @@
|
|||
use std::collections::VecDeque;
|
||||
|
||||
use heed::RoTxn;
|
||||
use crate::{
|
||||
distance_between_two_points,
|
||||
heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec},
|
||||
lat_lng_to_xyz,
|
||||
search::new::{facet_string_values, facet_values_prefix_key},
|
||||
GeoPoint, Index,
|
||||
};
|
||||
use heed::{
|
||||
types::{Bytes, Unit},
|
||||
RoPrefix, RoTxn,
|
||||
};
|
||||
use roaring::RoaringBitmap;
|
||||
use rstar::RTree;
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use crate::{
|
||||
distance_between_two_points, lat_lng_to_xyz,
|
||||
search::new::geo_sort::{geo_value, opposite_of},
|
||||
GeoPoint, GeoSortStrategy, Index,
|
||||
};
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GeoSortParameter {
|
||||
// Define the strategy used by the geo sort
|
||||
pub strategy: GeoSortStrategy,
|
||||
// Limit the number of docs in a single bucket to avoid unexpectedly large overhead
|
||||
pub max_bucket_size: u64,
|
||||
// Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal
|
||||
pub distance_error_margin: f64,
|
||||
}
|
||||
|
||||
impl Default for GeoSortParameter {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
strategy: GeoSortStrategy::default(),
|
||||
max_bucket_size: 1000,
|
||||
distance_error_margin: 1.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
/// Define the strategy used by the geo sort.
|
||||
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
|
||||
/// the point where we move from using the iterative strategy to the rtree.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum GeoSortStrategy {
|
||||
AlwaysIterative(usize),
|
||||
AlwaysRtree(usize),
|
||||
Dynamic(usize),
|
||||
}
|
||||
|
||||
impl Default for GeoSortStrategy {
|
||||
fn default() -> Self {
|
||||
GeoSortStrategy::Dynamic(1000)
|
||||
}
|
||||
}
|
||||
|
||||
impl GeoSortStrategy {
|
||||
pub fn use_rtree(&self, candidates: usize) -> bool {
|
||||
match self {
|
||||
GeoSortStrategy::AlwaysIterative(_) => false,
|
||||
GeoSortStrategy::AlwaysRtree(_) => true,
|
||||
GeoSortStrategy::Dynamic(i) => candidates >= *i,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cache_size(&self) -> usize {
|
||||
match self {
|
||||
GeoSortStrategy::AlwaysIterative(i)
|
||||
| GeoSortStrategy::AlwaysRtree(i)
|
||||
| GeoSortStrategy::Dynamic(i) => *i,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Make it take a mut reference to cache
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
|
@ -74,7 +130,8 @@ pub fn fill_cache(
|
|||
.map(|id| -> crate::Result<_> { Ok((id, geo_value(id, lat, lng, index, txn)?)) })
|
||||
.collect::<crate::Result<Vec<(u32, [f64; 2])>>>()?;
|
||||
// computing the distance between two points is expensive thus we cache the result
|
||||
documents.sort_by_cached_key(|(_, p)| distance_between_two_points(&target_point, p) as usize);
|
||||
documents
|
||||
.sort_by_cached_key(|(_, p)| distance_between_two_points(&target_point, p) as usize);
|
||||
cached_sorted_docids.extend(documents);
|
||||
};
|
||||
|
||||
|
@ -86,19 +143,13 @@ pub fn next_bucket(
|
|||
index: &Index,
|
||||
txn: &RoTxn<heed::AnyTls>,
|
||||
universe: &RoaringBitmap,
|
||||
strategy: GeoSortStrategy,
|
||||
ascending: bool,
|
||||
target_point: [f64; 2],
|
||||
field_ids: &Option<[u16; 2]>,
|
||||
rtree: &mut Option<RTree<GeoPoint>>,
|
||||
|
||||
cached_sorted_docids: &mut VecDeque<(u32, [f64; 2])>,
|
||||
geo_candidates: &RoaringBitmap,
|
||||
|
||||
// Limit the number of docs in a single bucket to avoid unexpectedly large overhead
|
||||
max_bucket_size: u64,
|
||||
// Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal
|
||||
distance_error_margin: f64,
|
||||
parameter: GeoSortParameter,
|
||||
) -> crate::Result<Option<(RoaringBitmap, Option<[f64; 2]>)>> {
|
||||
let mut geo_candidates = geo_candidates & universe;
|
||||
|
||||
|
@ -130,7 +181,7 @@ pub fn next_bucket(
|
|||
if geo_candidates.contains(id) {
|
||||
let distance = distance_between_two_points(&target_point, &point);
|
||||
if let Some((point0, bucket_distance)) = current_distance.as_ref() {
|
||||
if (bucket_distance - distance).abs() > distance_error_margin {
|
||||
if (bucket_distance - distance).abs() > parameter.distance_error_margin {
|
||||
// different distance, point belongs to next bucket
|
||||
put_back(cached_sorted_docids, (id, point));
|
||||
return Ok(Some((current_bucket, Some(point0.to_owned()))));
|
||||
|
@ -140,7 +191,7 @@ pub fn next_bucket(
|
|||
// remove from candidates to prevent it from being added to the cache again
|
||||
geo_candidates.remove(id);
|
||||
// current bucket size reaches limit, force return
|
||||
if current_bucket.len() == max_bucket_size {
|
||||
if current_bucket.len() == parameter.max_bucket_size {
|
||||
return Ok(Some((current_bucket, Some(point0.to_owned()))));
|
||||
}
|
||||
}
|
||||
|
@ -150,7 +201,7 @@ pub fn next_bucket(
|
|||
current_bucket.insert(id);
|
||||
geo_candidates.remove(id);
|
||||
// current bucket size reaches limit, force return
|
||||
if current_bucket.len() == max_bucket_size {
|
||||
if current_bucket.len() == parameter.max_bucket_size {
|
||||
return Ok(Some((current_bucket, Some(point.to_owned()))));
|
||||
}
|
||||
}
|
||||
|
@ -160,7 +211,7 @@ pub fn next_bucket(
|
|||
fill_cache(
|
||||
index,
|
||||
txn,
|
||||
strategy,
|
||||
parameter.strategy,
|
||||
ascending,
|
||||
target_point,
|
||||
field_ids,
|
||||
|
@ -180,3 +231,65 @@ pub fn next_bucket(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return an iterator over each number value in the given field of the given document.
|
||||
fn facet_number_values<'a>(
|
||||
docid: u32,
|
||||
field_id: u16,
|
||||
index: &Index,
|
||||
txn: &'a RoTxn<'a>,
|
||||
) -> crate::Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {
|
||||
let key = facet_values_prefix_key(field_id, docid);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_f64s
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_key_type();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
/// Extracts the lat and long values from a single document.
|
||||
///
|
||||
/// If it is not able to find it in the facet number index it will extract it
|
||||
/// from the facet string index and parse it as f64 (as the geo extraction behaves).
|
||||
pub(crate) fn geo_value(
|
||||
docid: u32,
|
||||
field_lat: u16,
|
||||
field_lng: u16,
|
||||
index: &Index,
|
||||
rtxn: &RoTxn<'_>,
|
||||
) -> crate::Result<[f64; 2]> {
|
||||
let extract_geo = |geo_field: u16| -> crate::Result<f64> {
|
||||
match facet_number_values(docid, geo_field, index, rtxn)?.next() {
|
||||
Some(Ok(((_, _, geo), ()))) => Ok(geo),
|
||||
Some(Err(e)) => Err(e.into()),
|
||||
None => match facet_string_values(docid, geo_field, index, rtxn)?.next() {
|
||||
Some(Ok((_, geo))) => {
|
||||
Ok(geo.parse::<f64>().expect("cannot parse geo field as f64"))
|
||||
}
|
||||
Some(Err(e)) => Err(e.into()),
|
||||
None => panic!("A geo faceted document doesn't contain any lat or lng"),
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let lat = extract_geo(field_lat)?;
|
||||
let lng = extract_geo(field_lng)?;
|
||||
|
||||
Ok([lat, lng])
|
||||
}
|
||||
|
||||
/// Compute the antipodal coordinate of `coord`
|
||||
pub(crate) fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] {
|
||||
coord[0] *= -1.;
|
||||
// in the case of x,0 we want to return x,180
|
||||
if coord[1] > 0. {
|
||||
coord[1] -= 180.;
|
||||
} else {
|
||||
coord[1] += 180.;
|
||||
}
|
||||
|
||||
coord
|
||||
}
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
mod builder;
|
||||
mod enriched;
|
||||
pub mod geo_sort;
|
||||
mod primary_key;
|
||||
mod reader;
|
||||
mod serde_impl;
|
||||
pub mod geo_sort;
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::io;
|
||||
|
@ -20,6 +20,7 @@ pub use primary_key::{
|
|||
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub use self::geo_sort::{GeoSortParameter, GeoSortStrategy};
|
||||
use crate::error::{FieldIdMapMissingEntry, InternalError};
|
||||
use crate::{FieldId, Object, Result};
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue