2021-08-23 18:41:48 +02:00
|
|
|
use std::fs::File;
|
2023-09-28 16:26:01 +02:00
|
|
|
use std::io::{self, BufReader};
|
2021-08-23 18:41:48 +02:00
|
|
|
|
|
|
|
use concat_arrays::concat_arrays;
|
2022-05-02 19:19:50 +02:00
|
|
|
use serde_json::Value;
|
2021-08-23 18:41:48 +02:00
|
|
|
|
|
|
|
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
2022-05-02 19:19:50 +02:00
|
|
|
use crate::error::GeoError;
|
2023-10-19 15:55:48 +02:00
|
|
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
2022-07-11 16:36:23 +02:00
|
|
|
use crate::update::index_documents::extract_finite_float_from_value;
|
2022-05-04 14:11:03 +02:00
|
|
|
use crate::{FieldId, InternalError, Result};
|
2021-08-23 18:41:48 +02:00
|
|
|
|
|
|
|
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
|
|
|
|
///
|
|
|
|
/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
|
2022-09-30 18:47:06 +02:00
|
|
|
#[logging_timer::time]
|
2022-02-16 15:28:48 +01:00
|
|
|
pub fn extract_geo_points<R: io::Read + io::Seek>(
|
2023-10-19 15:55:48 +02:00
|
|
|
// TODO grenad::Reader<Obkv<FieldId, Obkv<DelAdd, JsonValue>>>
|
2022-02-16 15:28:48 +01:00
|
|
|
obkv_documents: grenad::Reader<R>,
|
2021-08-23 18:41:48 +02:00
|
|
|
indexer: GrenadParameters,
|
2021-09-02 15:57:40 +02:00
|
|
|
primary_key_id: FieldId,
|
2022-03-23 17:28:41 +01:00
|
|
|
(lat_fid, lng_fid): (FieldId, FieldId),
|
2023-09-28 16:26:01 +02:00
|
|
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
2023-07-10 18:41:54 +02:00
|
|
|
puffin::profile_function!();
|
|
|
|
|
2022-02-16 15:28:48 +01:00
|
|
|
let mut writer = create_writer(
|
|
|
|
indexer.chunk_compression_type,
|
|
|
|
indexer.chunk_compression_level,
|
|
|
|
tempfile::tempfile()?,
|
|
|
|
);
|
2021-08-23 18:41:48 +02:00
|
|
|
|
2022-02-16 15:28:48 +01:00
|
|
|
let mut cursor = obkv_documents.into_cursor()?;
|
|
|
|
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
2021-08-23 18:41:48 +02:00
|
|
|
let obkv = obkv::KvReader::new(value);
|
2023-10-19 15:55:48 +02:00
|
|
|
// since we only need the primary key when we throw an error
|
|
|
|
// we create this getter to lazily get it when needed
|
2022-06-21 11:16:59 +02:00
|
|
|
let document_id = || -> Value {
|
|
|
|
let document_id = obkv.get(primary_key_id).unwrap();
|
|
|
|
serde_json::from_slice(document_id).unwrap()
|
2022-05-02 19:19:50 +02:00
|
|
|
};
|
|
|
|
|
2023-10-19 15:55:48 +02:00
|
|
|
// HELP we will receive two DelAdds here, one for the lat and one for the lng
|
|
|
|
// what happens if there is a missing Del or Add for one of them?
|
2022-05-02 19:19:50 +02:00
|
|
|
|
2023-10-19 15:55:48 +02:00
|
|
|
// first we get the two fields
|
|
|
|
match (obkv.get(lat_fid), obkv.get(lng_fid)) {
|
|
|
|
(Some(lat), Some(lng)) => {
|
|
|
|
let deladd_lat_obkv = KvReaderDelAdd::new(lat);
|
|
|
|
let deladd_lng_obkv = KvReaderDelAdd::new(lng);
|
2022-05-02 19:19:50 +02:00
|
|
|
|
2023-10-19 15:55:48 +02:00
|
|
|
// then we extract the values
|
|
|
|
let del_lat_lng = deladd_lat_obkv
|
|
|
|
.get(DelAdd::Deletion)
|
|
|
|
.zip(deladd_lng_obkv.get(DelAdd::Deletion))
|
|
|
|
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
|
|
|
|
.transpose()?;
|
|
|
|
let add_lat_lng = deladd_lat_obkv
|
|
|
|
.get(DelAdd::Addition)
|
|
|
|
.zip(deladd_lng_obkv.get(DelAdd::Addition))
|
|
|
|
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
|
|
|
|
.transpose()?;
|
2022-03-23 17:28:41 +01:00
|
|
|
|
2023-10-19 15:55:48 +02:00
|
|
|
let mut obkv = KvWriterDelAdd::memory();
|
|
|
|
if let Some([lat, lng]) = del_lat_lng {
|
|
|
|
#[allow(clippy::drop_non_drop)]
|
|
|
|
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
|
|
|
obkv.insert(DelAdd::Deletion, bytes)?;
|
|
|
|
}
|
|
|
|
if let Some([lat, lng]) = add_lat_lng {
|
|
|
|
#[allow(clippy::drop_non_drop)]
|
|
|
|
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
|
|
|
obkv.insert(DelAdd::Addition, bytes)?;
|
|
|
|
}
|
|
|
|
let bytes = obkv.into_inner()?;
|
|
|
|
writer.insert(docid_bytes, bytes)?;
|
|
|
|
}
|
|
|
|
(None, Some(_)) => {
|
|
|
|
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
|
|
|
|
}
|
|
|
|
(Some(_), None) => {
|
|
|
|
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
|
|
|
|
}
|
|
|
|
(None, None) => (),
|
2022-05-16 15:55:18 +02:00
|
|
|
}
|
2021-08-23 18:41:48 +02:00
|
|
|
}
|
|
|
|
|
2022-10-13 22:02:54 +02:00
|
|
|
writer_into_reader(writer)
|
2021-08-23 18:41:48 +02:00
|
|
|
}
|
2023-10-19 15:55:48 +02:00
|
|
|
|
|
|
|
/// Extract the finite floats lat and lng from two bytes slices.
|
|
|
|
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
|
|
|
|
let lat = extract_finite_float_from_value(
|
|
|
|
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
|
|
|
)
|
|
|
|
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
|
|
|
|
|
|
|
|
let lng = extract_finite_float_from_value(
|
|
|
|
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
|
|
|
|
)
|
|
|
|
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
|
|
|
|
|
|
|
Ok([lat, lng])
|
|
|
|
}
|