From 8d9c2c4425de329fb593ec2b40fed545b15e03fa Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 23 Aug 2021 16:32:11 +0200 Subject: [PATCH 01/42] create a new db with getters and setters --- milli/Cargo.toml | 1 + milli/src/index.rs | 30 ++++++++++++++++++++++++++++-- milli/src/lib.rs | 1 + 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index c6fe3ea95..d2767afd4 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,6 +27,7 @@ once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" roaring = "0.6.6" +rstar = { version = "0.9.1", features = ["serde"] } serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } slice-group-by = "0.2.6" diff --git a/milli/src/index.rs b/milli/src/index.rs index f3a2a3e05..d2b4598d3 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -8,6 +8,7 @@ use heed::flags::Flags; use heed::types::*; use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use roaring::RoaringBitmap; +use rstar::RTree; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; @@ -18,8 +19,8 @@ use crate::heed_codec::facet::{ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, - FieldIdWordCountCodec, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, - StrLevelPositionCodec, StrStrU8Codec, BEU32, + FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, + Search, StrLevelPositionCodec, StrStrU8Codec, BEU32, }; pub mod main_key { @@ -31,6 +32,7 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const GEO_RTREE_KEY: &str = "geo"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -294,6 +296,30 @@ impl Index { .unwrap_or_default()) } + /* geo rtree */ + + pub(crate) fn put_geo_rtree>( + &self, + wtxn: &mut RwTxn, + rtree: &RTree, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode>>(wtxn, main_key::GEO_RTREE_KEY, rtree) + } + + pub(crate) fn delete_geo_rtree(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::GEO_RTREE_KEY) + } + + pub fn geo_rtree<'t>(&self, rtxn: &'t RoTxn) -> Result>> { + match self + .main + .get::<_, Str, SerdeBincode>>(rtxn, main_key::GEO_RTREE_KEY)? + { + Some(rtree) => Ok(Some(rtree)), + None => Ok(None), + } + } + /* field distribution */ /// Writes the field distribution which associates every field name with diff --git a/milli/src/lib.rs b/milli/src/lib.rs index af811fe08..2a55b6f3a 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -51,6 +51,7 @@ pub type DocumentId = u32; pub type FieldId = u16; pub type Position = u32; pub type FieldDistribution = BTreeMap; +pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>; /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( From 44d6b6ae9e119b51c402bba3300c076e9b952bf4 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 23 Aug 2021 18:41:48 +0200 Subject: [PATCH 02/42] Index the geo points --- milli/src/index.rs | 2 +- .../extract/extract_geo_points.rs | 46 +++++++++++++++++++ .../src/update/index_documents/extract/mod.rs | 15 +++++- milli/src/update/index_documents/mod.rs | 3 ++ .../src/update/index_documents/typed_chunk.rs | 22 ++++++++- 5 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_geo_points.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index d2b4598d3..70aefa9be 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -298,7 +298,7 @@ impl Index { /* geo rtree */ - pub(crate) fn put_geo_rtree>( + pub(crate) fn put_geo_rtree( &self, wtxn: &mut RwTxn, rtree: &RTree, diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs new file mode 100644 index 000000000..9f6e43199 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -0,0 +1,46 @@ +use std::fs::File; +use std::io; + +use concat_arrays::concat_arrays; +use log::warn; +use serde_json::Value; + +use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; +use crate::{FieldId, InternalError, Result}; + +/// Extracts the geographical coordinates contained in each document under the `_geo` field. +/// +/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude) +pub fn extract_geo_points( + mut obkv_documents: grenad::Reader, + indexer: GrenadParameters, + geo_field_id: Option, // faire un grenad vide +) -> Result> { + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) + })?; + + // we never encountered any documents with a `_geo` field. We can skip entirely this step + if geo_field_id.is_none() { + return Ok(writer_into_reader(writer)?); + } + let geo_field_id = geo_field_id.unwrap(); + + while let Some((docid_bytes, value)) = obkv_documents.next()? { + let obkv = obkv::KvReader::new(value); + let point = obkv.get(geo_field_id).unwrap(); // TODO: TAMO where should we handle this error? + let point: Value = serde_json::from_slice(point).map_err(InternalError::SerdeJson)?; + + if let Some((lat, long)) = point["lat"].as_f64().zip(point["long"].as_f64()) { + // this will create an array of 16 bytes (two 8 bytes floats) + let bytes: [u8; 16] = concat_arrays![lat.to_le_bytes(), long.to_le_bytes()]; + writer.insert(docid_bytes, bytes)?; + } else { + // TAMO: improve the warn + warn!("Malformed `_geo` field"); + continue; + } + } + + Ok(writer_into_reader(writer)?) +} diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index bb49e3e51..90a279815 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -3,6 +3,7 @@ mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; +mod extract_geo_points; mod extract_word_docids; mod extract_word_level_position_docids; mod extract_word_pair_proximity_docids; @@ -19,6 +20,7 @@ use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; +use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; use self::extract_word_level_position_docids::extract_word_level_position_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; @@ -37,6 +39,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender>, searchable_fields: Option>, faceted_fields: HashSet, + geo_field_id: Option, stop_words: Option>, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks @@ -54,7 +57,7 @@ pub(crate) fn data_from_obkv_documents( .collect(); let ( - docid_word_positions_chunks, + (docid_word_positions_chunks), (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), ) = result?; @@ -118,6 +121,16 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-number-docids", ); + spawn_extraction_task( + documents_chunk, + indexer.clone(), + lmdb_writer_sx.clone(), + move |documents, indexer| extract_geo_points(documents, indexer, geo_field_id), + merge_cbo_roaring_bitmaps, + TypedChunk::GeoPoints, + "geo-points", + ); + Ok(()) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7800ae55a..44b108076 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -233,6 +233,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + // get the fid of the `_geo` field. + let geo_field_id = self.index.fields_ids_map(self.wtxn)?.id("_geo"); let stop_words = self.index.stop_words(self.wtxn)?; // let stop_words = stop_words.as_ref(); @@ -261,6 +263,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { lmdb_writer_sx.clone(), searchable_fields, faceted_fields, + geo_field_id, stop_words, ) }); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 5f28034fe..dcefee153 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -6,11 +6,12 @@ use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, + self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, + CursorClonableMmap, }; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::into_clonable_grenad; -use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Index, Result}; +use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, GeoPoint, Index, Result}; pub(crate) enum TypedChunk { DocidWordPositions(grenad::Reader), @@ -24,6 +25,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), + GeoPoints(grenad::Reader), } /// Write typed chunk in the corresponding LMDB database of the provided index. @@ -177,6 +179,22 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::GeoPoints(mut geo_points) => { + // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function + let mut rtree = index.geo_rtree(&index.read_txn()?)?.unwrap_or_default(); + while let Some((key, value)) = geo_points.next()? { + // convert the key back to a u32 (4 bytes) + let (key, _) = helpers::try_split_array_at::(key).unwrap(); + let key = u32::from_le_bytes(key); + + // convert the latitude and longitude back to a f64 (8 bytes) + let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); + let (long, _) = helpers::try_split_array_at::(tail).unwrap(); + let point = [f64::from_le_bytes(lat), f64::from_le_bytes(long)]; + rtree.insert(GeoPoint::new(point, key)); + } + index.put_geo_rtree(wtxn, &rtree)?; + } } Ok((RoaringBitmap::new(), is_merged_database)) From d344489c124287a0fb8a601f5983afe0e350cc67 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 14:58:36 +0200 Subject: [PATCH 03/42] implement the deletion of geo points --- milli/src/update/delete_documents.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 874eed6ee..0c29b744d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -380,6 +380,19 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { + let points_to_remove: Vec<_> = rtree + .iter() + .filter(|&point| self.documents_ids.contains(point.data)) + .cloned() + .collect(); + points_to_remove.iter().for_each(|point| { + rtree.remove(&point); + }); + + self.index.put_geo_rtree(self.wtxn, &rtree)?; + } + // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_field_id_number_docids( self.wtxn, From 3b9f1db061e12e53dc76ff8a1a3d86ec796be523 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 15:32:41 +0200 Subject: [PATCH 04/42] implement the clear of the rtree --- milli/src/update/clear_documents.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 789970a8e..ef91991e8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -48,6 +48,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; + self.index.delete_geo_rtree(self.wtxn)?; // We clean all the faceted documents ids. let empty = RoaringBitmap::default(); @@ -93,7 +94,7 @@ mod tests { let content = &br#"[ { "id": 0, "name": "kevin", "age": 20 }, { "id": 1, "name": "kevina" }, - { "id": 2, "name": "benoit", "country": "France" } + { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); @@ -107,13 +108,14 @@ mod tests { let rtxn = index.read_txn().unwrap(); - assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 4); + assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 5); assert!(index.words_fst(&rtxn).unwrap().is_empty()); assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); + assert!(index.geo_rtree(&rtxn).unwrap().is_none()); assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); From b4b6ba6d8285d95267ee496f0e10db12eca1bb64 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 16:00:25 +0200 Subject: [PATCH 05/42] =?UTF-8?q?rename=20all=20the=20=E2=80=99long?= =?UTF-8?q?=E2=80=99=20into=20=E2=80=99lng=E2=80=99=20like=20written=20in?= =?UTF-8?q?=20the=20specification?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/update/index_documents/extract/extract_geo_points.rs | 4 ++-- milli/src/update/index_documents/typed_chunk.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 9f6e43199..0a73e5ed4 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -31,9 +31,9 @@ pub fn extract_geo_points( let point = obkv.get(geo_field_id).unwrap(); // TODO: TAMO where should we handle this error? let point: Value = serde_json::from_slice(point).map_err(InternalError::SerdeJson)?; - if let Some((lat, long)) = point["lat"].as_f64().zip(point["long"].as_f64()) { + if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) { // this will create an array of 16 bytes (two 8 bytes floats) - let bytes: [u8; 16] = concat_arrays![lat.to_le_bytes(), long.to_le_bytes()]; + let bytes: [u8; 16] = concat_arrays![lat.to_le_bytes(), lng.to_le_bytes()]; writer.insert(docid_bytes, bytes)?; } else { // TAMO: improve the warn diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index dcefee153..0dfeabece 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -189,8 +189,8 @@ pub(crate) fn write_typed_chunk_into_index( // convert the latitude and longitude back to a f64 (8 bytes) let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); - let (long, _) = helpers::try_split_array_at::(tail).unwrap(); - let point = [f64::from_le_bytes(lat), f64::from_le_bytes(long)]; + let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); + let point = [f64::from_le_bytes(lat), f64::from_le_bytes(lng)]; rtree.insert(GeoPoint::new(point, key)); } index.put_geo_rtree(wtxn, &rtree)?; From 70ab2c37c5115dee987fd3be46b8dc808a06022f Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 16:59:38 +0200 Subject: [PATCH 06/42] remove multiple bugs --- .../extract/extract_geo_points.rs | 15 ++++++++------ .../src/update/index_documents/extract/mod.rs | 20 +++++++++---------- .../src/update/index_documents/typed_chunk.rs | 6 +++--- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 0a73e5ed4..1849d5f5d 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -21,19 +21,22 @@ pub fn extract_geo_points( })?; // we never encountered any documents with a `_geo` field. We can skip entirely this step - if geo_field_id.is_none() { - return Ok(writer_into_reader(writer)?); - } - let geo_field_id = geo_field_id.unwrap(); + let geo_field_id = match geo_field_id { + Some(geo) => geo, + None => return Ok(writer_into_reader(writer)?), + }; while let Some((docid_bytes, value)) = obkv_documents.next()? { let obkv = obkv::KvReader::new(value); - let point = obkv.get(geo_field_id).unwrap(); // TODO: TAMO where should we handle this error? + let point = match obkv.get(geo_field_id) { + Some(point) => point, + None => continue, + }; let point: Value = serde_json::from_slice(point).map_err(InternalError::SerdeJson)?; if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) { // this will create an array of 16 bytes (two 8 bytes floats) - let bytes: [u8; 16] = concat_arrays![lat.to_le_bytes(), lng.to_le_bytes()]; + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else { // TAMO: improve the warn diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 90a279815..736060b15 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -51,13 +51,14 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), &searchable_fields, &faceted_fields, + geo_field_id, &stop_words, ) }) .collect(); let ( - (docid_word_positions_chunks), + docid_word_positions_chunks, (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), ) = result?; @@ -121,16 +122,6 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-number-docids", ); - spawn_extraction_task( - documents_chunk, - indexer.clone(), - lmdb_writer_sx.clone(), - move |documents, indexer| extract_geo_points(documents, indexer, geo_field_id), - merge_cbo_roaring_bitmaps, - TypedChunk::GeoPoints, - "geo-points", - ); - Ok(()) } @@ -181,6 +172,7 @@ fn extract_documents_data( lmdb_writer_sx: Sender>, searchable_fields: &Option>, faceted_fields: &HashSet, + geo_field_id: Option, stop_words: &Option>, ) -> Result<( grenad::Reader, @@ -190,6 +182,12 @@ fn extract_documents_data( let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); + let (documents_chunk_cloned, lmdb_writer_sx_cloned) = (documents_chunk.clone(), lmdb_writer_sx.clone()); + rayon::spawn(move || { + let geo_points = extract_geo_points(documents_chunk_cloned, indexer, geo_field_id).unwrap(); + lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))).unwrap(); + }); + let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 0dfeabece..9605fea7d 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -181,16 +181,16 @@ pub(crate) fn write_typed_chunk_into_index( } TypedChunk::GeoPoints(mut geo_points) => { // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function - let mut rtree = index.geo_rtree(&index.read_txn()?)?.unwrap_or_default(); + let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); while let Some((key, value)) = geo_points.next()? { // convert the key back to a u32 (4 bytes) let (key, _) = helpers::try_split_array_at::(key).unwrap(); - let key = u32::from_le_bytes(key); + let key = u32::from_be_bytes(key); // convert the latitude and longitude back to a f64 (8 bytes) let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); - let point = [f64::from_le_bytes(lat), f64::from_le_bytes(lng)]; + let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; rtree.insert(GeoPoint::new(point, key)); } index.put_geo_rtree(wtxn, &rtree)?; From a21c8547904af27bcf9dd0d057f8981842a56a50 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 11:43:17 +0200 Subject: [PATCH 07/42] handle errors --- milli/src/update/index_documents/extract/mod.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 736060b15..aefc0ff92 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -182,10 +182,13 @@ fn extract_documents_data( let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); - let (documents_chunk_cloned, lmdb_writer_sx_cloned) = (documents_chunk.clone(), lmdb_writer_sx.clone()); + let documents_chunk_cloned = documents_chunk.clone(); + let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { - let geo_points = extract_geo_points(documents_chunk_cloned, indexer, geo_field_id).unwrap(); - lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))).unwrap(); + let _ = match extract_geo_points(documents_chunk_cloned, indexer, geo_field_id) { + Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), + Err(error) => lmdb_writer_sx_cloned.send(Err(error)), + }; }); let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = From 216a8aa3b26d23d84034ce92af9101442c8ddf23 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 13:27:32 +0200 Subject: [PATCH 08/42] add a tests for the indexation of the geosearch --- milli/src/update/delete_documents.rs | 70 +++++++++++++++++++++++++ milli/src/update/index_documents/mod.rs | 10 ++-- 2 files changed, 75 insertions(+), 5 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 0c29b744d..84fc3215f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -670,4 +670,74 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn delete_documents_with_geo_points() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("id")); + builder.execute(|_, _| ()).unwrap(); + + let content = &r#"[ + {"id":"1","city":"Lille", "_geo": { "lat": 50.629973371633746, "lng": 3.0569447399419570 } }, + {"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.641586120121050, "lng": 3.1106593480348670 } }, + {"id":"3","city":"Hellemmes", "_geo": { "lat": 50.631220965518080, "lng": 3.1106399673339933 } }, + {"id":"4","city":"Villeneuve-d'Ascq", "_geo": { "lat": 50.622468098014565, "lng": 3.1476425513437140 } }, + {"id":"5","city":"Hem", "_geo": { "lat": 50.655250871381355, "lng": 3.1897297266244130 } }, + {"id":"6","city":"Roubaix", "_geo": { "lat": 50.692473451896710, "lng": 3.1763326737747650 } }, + {"id":"7","city":"Tourcoing", "_geo": { "lat": 50.726397466736480, "lng": 3.1541653659578670 } }, + {"id":"8","city":"Mouscron", "_geo": { "lat": 50.745325554908610, "lng": 3.2206407854429853 } }, + {"id":"9","city":"Tournai", "_geo": { "lat": 50.605342528602630, "lng": 3.3758586941351414 } }, + {"id":"10","city":"Ghent", "_geo": { "lat": 51.053777403679035, "lng": 3.6957733119926930 } }, + {"id":"11","city":"Brussels", "_geo": { "lat": 50.846640974544690, "lng": 4.3370663564281840 } }, + {"id":"12","city":"Charleroi", "_geo": { "lat": 50.409570138889480, "lng": 4.4347354315085520 } }, + {"id":"13","city":"Mons", "_geo": { "lat": 50.450294178855420, "lng": 3.9623722870904690 } }, + {"id":"14","city":"Valenciennes", "_geo": { "lat": 50.351817774473545, "lng": 3.5326283646928800 } }, + {"id":"15","city":"Arras", "_geo": { "lat": 50.284487528579950, "lng": 2.7637515844478160 } }, + {"id":"16","city":"Cambrai", "_geo": { "lat": 50.179340577906700, "lng": 3.2189409952502930 } }, + {"id":"17","city":"Bapaume", "_geo": { "lat": 50.111276127236400, "lng": 2.8547894666083120 } }, + {"id":"18","city":"Amiens", "_geo": { "lat": 49.931472529669996, "lng": 2.2710499758317080 } }, + {"id":"19","city":"Compiègne", "_geo": { "lat": 49.444980887725656, "lng": 2.7913841281529015 } }, + {"id":"20","city":"Paris", "_geo": { "lat": 48.902100060895480, "lng": 2.3708400867406930 } } + ]"#[..]; + let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; + + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + + let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); + let ids_to_delete: Vec = external_ids_to_delete + .iter() + .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) + .collect(); + + // Delete some documents. + let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + external_ids_to_delete.iter().for_each(|id| drop(builder.delete_external_id(id))); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let rtree = index.geo_rtree(&rtxn).unwrap().unwrap(); + + let all_geo_ids = rtree.iter().map(|point| point.data).collect::>(); + let all_geo_documents = index.documents(&rtxn, all_geo_ids.iter().copied()).unwrap(); + + for (id, _) in all_geo_documents.iter() { + assert!(!ids_to_delete.contains(&id), "The document {} was supposed to be deleted", id); + } + + assert_eq!( + all_geo_ids.len(), + all_geo_documents.len(), + "We deleted documents that were not supposed to be deleted" + ); + } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 44b108076..ba550afb9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -877,12 +877,12 @@ mod tests { // First we send 3 documents with an id for only one of them. let mut wtxn = index.write_txn().unwrap(); let documents = &r#"[ - { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5 }, + { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 }, { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 }, { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" }, { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, - { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams" } + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); @@ -918,7 +918,7 @@ mod tests { { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, - { "objectId": 30, "title": "Hamlet" } + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); @@ -935,7 +935,7 @@ mod tests { assert!(external_documents_ids.get("30").is_none()); let content = &br#"[ - { "objectId": 30, "title": "Hamlet" } + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); @@ -945,7 +945,7 @@ mod tests { assert!(external_documents_ids.get("30").is_some()); let content = &br#"[ - { "objectId": 30, "title": "Hamlet" } + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); From 6d70978edc29f25bfa213c459db55cdab0ea2d29 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 15:51:54 +0200 Subject: [PATCH 09/42] update the facet filter grammar --- milli/src/search/facet/grammar.pest | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest index 2096517d3..10783b632 100644 --- a/milli/src/search/facet/grammar.pest +++ b/milli/src/search/facet/grammar.pest @@ -16,10 +16,11 @@ neq = {key ~ "!=" ~ value} eq = {key ~ "=" ~ value} greater = {key ~ ">" ~ value} less = {key ~ "<" ~ value} +geo_radius = {"_geoRadius(" ~ value ~ "," ~ value ~ "," ~ value ~ ")"} prgm = {SOI ~ expr ~ EOI} expr = _{ ( term ~ (operation ~ term)* ) } -term = { ("(" ~ expr ~ ")") | condition | not } +term = { ("(" ~ expr ~ ")") | condition | not | geo_radius } operation = _{ and | or } and = {"AND"} or = {"OR"} From 4b459768a0360d7c0759f2c068d47feddae65879 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 16:38:29 +0200 Subject: [PATCH 10/42] create the _geoRadius filter --- milli/src/search/facet/filter_condition.rs | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index a92797e90..1480fc95a 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -32,6 +32,8 @@ pub enum Operator { LowerThan(f64), LowerThanOrEqual(f64), Between(f64, f64), + GeoLowerThan([f64; 2], f64), + GeoGreaterThan([f64; 2], f64), } impl Operator { @@ -46,6 +48,8 @@ impl Operator { LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), + GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), } } } @@ -131,6 +135,7 @@ impl FilterCondition { Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), Rule::less => Ok(Self::lower_than(fim, ff, pair)?), Rule::between => Ok(Self::between(fim, ff, pair)?), + Rule::geo_radius => Ok(Self::geo_radius(fim, pair)?), Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), @@ -156,6 +161,23 @@ impl FilterCondition { } } + fn geo_radius(fields_ids_map: &FieldsIdsMap, item: Pair) -> Result { + let mut items = item.into_inner(); + let fid = match fields_ids_map.id("_geo") { + Some(fid) => fid, + None => return Ok(Empty), + }; + let (lat_result, _) = pest_parse(items.next().unwrap()); + let (lng_result, _) = pest_parse(items.next().unwrap()); + let lat = lat_result.map_err(UserError::InvalidFilter)?; + let lng = lng_result.map_err(UserError::InvalidFilter)?; + let point = [lat, lng]; + let (distance_result, _) = pest_parse(items.next().unwrap()); + let distance = distance_result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, GeoLowerThan(point, distance))) + } + fn between( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, @@ -440,6 +462,32 @@ impl FilterCondition { LowerThan(val) => (Included(f64::MIN), Excluded(*val)), LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), + GeoLowerThan(point, distance) => { + let mut result = RoaringBitmap::new(); + let rtree = match index.geo_rtree(rtxn)? { + Some(rtree) => rtree, + None => return Ok(result), + }; + + let iter = rtree + .nearest_neighbor_iter_with_distance_2(point) + .take_while(|(_, dist)| dist <= distance); + iter.for_each(|(point, _)| drop(result.insert(point.data))); + + return Ok(result); + } + GeoGreaterThan(point, distance) => { + let result = Self::evaluate_operator( + rtxn, + index, + numbers_db, + strings_db, + field_id, + &GeoLowerThan(point.clone(), *distance), + )?; + let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; + return Ok(geo_faceted_doc_ids - result); + } }; // Ask for the biggest value that can exist for this specific field, if it exists From ea2f2ecf96dbfd03e419a7207e8c17202eeac03d Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 17:49:50 +0200 Subject: [PATCH 11/42] create a new database containing all the documents that were geo-faceted --- milli/src/index.rs | 38 ++++++++++++++++++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 29 +++++++++++++- .../src/update/index_documents/typed_chunk.rs | 4 ++ 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 70aefa9be..f2ddba699 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -32,7 +32,8 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; - pub const GEO_RTREE_KEY: &str = "geo"; + pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; + pub const GEO_RTREE_KEY: &str = "geo-rtree"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -320,6 +321,41 @@ impl Index { } } + /* geo faceted */ + + /// Writes the documents ids that are faceted with a _geo field + pub(crate) fn put_geo_faceted_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, + docids, + ) + } + + /// Delete the documents ids that are faceted with a _geo field + pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, + &RoaringBitmap::new(), + ) + } + + /// Retrieve all the documents ids that faceted with a _geo field + pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { + match self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)? + { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* field distribution */ /// Writes the field distribution which associates every field name with diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ef91991e8..e937cb65f 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -49,6 +49,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; + self.index.delete_geo_faceted_documents_ids(self.wtxn)?; // We clean all the faceted documents ids. let empty = RoaringBitmap::default(); @@ -116,6 +117,7 @@ mod tests { assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.geo_rtree(&rtxn).unwrap().is_none()); + assert!(index.geo_faceted_documents_ids(&rtxn).unwrap().is_empty()); assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 84fc3215f..cfd777d11 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -381,6 +381,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { + let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; + let points_to_remove: Vec<_> = rtree .iter() .filter(|&point| self.documents_ids.contains(point.data)) @@ -388,9 +390,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { .collect(); points_to_remove.iter().for_each(|point| { rtree.remove(&point); + geo_faceted_doc_ids.remove(point.data); }); self.index.put_geo_rtree(self.wtxn, &rtree)?; + self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; } // We delete the documents ids that are under the facet field id values. @@ -555,6 +559,8 @@ where #[cfg(test)] mod tests { + use std::collections::HashSet; + use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; @@ -726,11 +732,30 @@ mod tests { let rtxn = index.read_txn().unwrap(); let rtree = index.geo_rtree(&rtxn).unwrap().unwrap(); + let geo_faceted_doc_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); let all_geo_ids = rtree.iter().map(|point| point.data).collect::>(); - let all_geo_documents = index.documents(&rtxn, all_geo_ids.iter().copied()).unwrap(); + let all_geo_documents = index + .documents(&rtxn, all_geo_ids.iter().copied()) + .unwrap() + .iter() + .map(|(id, _)| *id) + .collect::>(); - for (id, _) in all_geo_documents.iter() { + let all_geo_faceted_ids = geo_faceted_doc_ids.iter().collect::>(); + let all_geo_faceted_documents = index + .documents(&rtxn, all_geo_faceted_ids.iter().copied()) + .unwrap() + .iter() + .map(|(id, _)| *id) + .collect::>(); + + assert_eq!( + all_geo_documents, all_geo_faceted_documents, + "There is an inconsistency between the geo_faceted database and the rtree" + ); + + for id in all_geo_documents.iter() { assert!(!ids_to_delete.contains(&id), "The document {} was supposed to be deleted", id); } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 9605fea7d..b09bee213 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -182,6 +182,8 @@ pub(crate) fn write_typed_chunk_into_index( TypedChunk::GeoPoints(mut geo_points) => { // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); + let mut doc_ids = index.geo_faceted_documents_ids(wtxn)?; + while let Some((key, value)) = geo_points.next()? { // convert the key back to a u32 (4 bytes) let (key, _) = helpers::try_split_array_at::(key).unwrap(); @@ -192,8 +194,10 @@ pub(crate) fn write_typed_chunk_into_index( let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; rtree.insert(GeoPoint::new(point, key)); + doc_ids.insert(key); } index.put_geo_rtree(wtxn, &rtree)?; + index.put_geo_faceted_documents_ids(wtxn, &doc_ids)?; } } From 4fd0116a0d4a27c95091c46c29db24df6490e5e4 Mon Sep 17 00:00:00 2001 From: cvermand <33010418+bidoubiwa@users.noreply.github.com> Date: Thu, 26 Aug 2021 19:12:30 +0200 Subject: [PATCH 12/42] Stringify objects on dashboard to avoid [Object object] --- http-ui/public/script.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/http-ui/public/script.js b/http-ui/public/script.js index b621cd453..e4de86672 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -60,7 +60,13 @@ $('#query, #filters').on('input', function () { const content = document.createElement('div'); content.classList.add("content"); - content.innerHTML = element[prop]; + + // Stringify Objects and Arrays to avoid [Object object] + if (typeof element[prop] === 'object' && element[prop] !== null) { + content.innerHTML = JSON.stringify(element[prop]); + } else { + content.innerHTML = element[prop]; + } field.appendChild(attribute); field.appendChild(content); From f73273d71c3342801e52395d7ab52b337bb2d89c Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Aug 2021 15:47:11 +0200 Subject: [PATCH 13/42] only call the extractor if needed --- .../extract/extract_geo_points.rs | 8 +------- .../src/update/index_documents/extract/mod.rs | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 1849d5f5d..88ae7c177 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -14,18 +14,12 @@ use crate::{FieldId, InternalError, Result}; pub fn extract_geo_points( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, - geo_field_id: Option, // faire un grenad vide + geo_field_id: FieldId, ) -> Result> { let mut writer = tempfile::tempfile().and_then(|file| { create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) })?; - // we never encountered any documents with a `_geo` field. We can skip entirely this step - let geo_field_id = match geo_field_id { - Some(geo) => geo, - None => return Ok(writer_into_reader(writer)?), - }; - while let Some((docid_bytes, value)) = obkv_documents.next()? { let obkv = obkv::KvReader::new(value); let point = match obkv.get(geo_field_id) { diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index aefc0ff92..4cb21c8e4 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -182,14 +182,16 @@ fn extract_documents_data( let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); - let documents_chunk_cloned = documents_chunk.clone(); - let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); - rayon::spawn(move || { - let _ = match extract_geo_points(documents_chunk_cloned, indexer, geo_field_id) { - Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), - Err(error) => lmdb_writer_sx_cloned.send(Err(error)), - }; - }); + if let Some(geo_field_id) = geo_field_id { + let documents_chunk_cloned = documents_chunk.clone(); + let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let _ = match extract_geo_points(documents_chunk_cloned, indexer, geo_field_id) { + Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), + Err(error) => lmdb_writer_sx_cloned.send(Err(error)), + }; + }); + } let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( From 5bb175fc90eb2d9cb3c8bc5c39b7deb8dc1bf5ed Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Aug 2021 15:47:33 +0200 Subject: [PATCH 14/42] only index _geo if it's set as sortable OR filterable and only allow the filters if geo was set to filterable --- milli/src/search/facet/filter_condition.rs | 6 ++++++ milli/src/update/delete_documents.rs | 2 ++ milli/src/update/index_documents/mod.rs | 8 +++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 1480fc95a..66a2ffac7 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -463,6 +463,9 @@ impl FilterCondition { LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), GeoLowerThan(point, distance) => { + if index.filterable_fields(rtxn)?.contains("_geo") { + Err(UserError::AttributeLimitReached)?; // TODO: TAMO use a real error + } let mut result = RoaringBitmap::new(); let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, @@ -477,6 +480,9 @@ impl FilterCondition { return Ok(result); } GeoGreaterThan(point, distance) => { + if index.filterable_fields(rtxn)?.contains("_geo") { + Err(UserError::AttributeLimitReached)?; // TODO: TAMO use a real error + } let result = Self::evaluate_operator( rtxn, index, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index cfd777d11..639770bd6 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -687,6 +687,8 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_primary_key(S("id")); + builder.set_filterable_fields(hashset!(S("_geo"))); + builder.set_sortable_fields(hashset!(S("_geo"))); builder.execute(|_, _| ()).unwrap(); let content = &r#"[ diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba550afb9..d4fd3570e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -234,7 +234,13 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; // get the fid of the `_geo` field. - let geo_field_id = self.index.fields_ids_map(self.wtxn)?.id("_geo"); + let geo_field_id = if let Some(gfid) = self.index.fields_ids_map(self.wtxn)?.id("_geo") { + (self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid) + || self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid)) + .then(|| gfid) + } else { + None + }; let stop_words = self.index.stop_words(self.wtxn)?; // let stop_words = stop_words.as_ref(); From 13c78e5aa2dc9413f02c2df3d28d6c700f88fb93 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Aug 2021 18:22:52 +0200 Subject: [PATCH 15/42] Implement the _geoPoint in the sortable --- milli/src/criterion.rs | 78 +++++++++++++++++++++++---- milli/src/search/criteria/asc_desc.rs | 75 +++++++++++++++++++------- milli/src/search/criteria/mod.rs | 10 ++-- milli/src/search/mod.rs | 14 ++--- 4 files changed, 138 insertions(+), 39 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index d91d4a7e1..2bca6948b 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -58,24 +58,84 @@ impl FromStr for Criterion { Err(error) => { Err(UserError::InvalidCriterionName { name: error.to_string() }.into()) } + Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { + Err(UserError::AttributeLimitReached)? // TODO: TAMO: use a real error + } + Err(error) => Err(error.into()), }, } } } -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +pub enum Member { + Field(String), + Geo([f64; 2]), +} + +impl FromStr for Member { + type Err = UserError; + + fn from_str(text: &str) -> Result { + if text.starts_with("_geoPoint(") { + let point = + text.strip_prefix("_geoPoint(") + .and_then(|point| point.strip_suffix(")")) + .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; + let point = point + .split(',') + .map(|el| el.parse()) + .collect::, _>>() + .map_err(|_| UserError::InvalidCriterionName { name: text.to_string() })?; + Ok(Member::Geo([point[0], point[1]])) + } else { + Ok(Member::Field(text.to_string())) + } + } +} + +impl fmt::Display for Member { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Member::Field(name) => write!(f, "{}", name), + Member::Geo([lat, lng]) => write!(f, "_geoPoint({}, {})", lat, lng), + } + } +} + +impl Member { + pub fn field(&self) -> Option<&str> { + match self { + Member::Field(field) => Some(field), + Member::Geo(_) => None, + } + } + + pub fn geo_point(&self) -> Option<&[f64; 2]> { + match self { + Member::Geo(point) => Some(point), + Member::Field(_) => None, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] pub enum AscDesc { - Asc(String), - Desc(String), + Asc(Member), + Desc(Member), } impl AscDesc { - pub fn field(&self) -> &str { + pub fn member(&self) -> &Member { match self { - AscDesc::Asc(field) => field, - AscDesc::Desc(field) => field, + AscDesc::Asc(member) => member, + AscDesc::Desc(member) => member, } } + + pub fn field(&self) -> Option<&str> { + self.member().field() + } } impl FromStr for AscDesc { @@ -85,9 +145,9 @@ impl FromStr for AscDesc { /// string and let the caller create his own error fn from_str(text: &str) -> Result { match text.rsplit_once(':') { - Some((field_name, "asc")) => Ok(AscDesc::Asc(field_name.to_string())), - Some((field_name, "desc")) => Ok(AscDesc::Desc(field_name.to_string())), - _ => Err(UserError::InvalidAscDescSyntax { name: text.to_string() }), + Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)), + Some((left, "desc")) => Ok(AscDesc::Desc(left.parse()?)), + _ => Err(UserError::InvalidCriterionName { name: text.to_string() }), } } } diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6d50c1bb5..b0951f655 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -4,12 +4,14 @@ use itertools::Itertools; use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; +use rstar::RTree; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::criterion::Member; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::{FacetNumberIter, FacetStringIter}; use crate::search::query_tree::Operation; -use crate::{FieldId, Index, Result}; +use crate::{FieldId, GeoPoint, Index, Result}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. @@ -18,10 +20,11 @@ const CANDIDATES_THRESHOLD: u64 = 1000; pub struct AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn<'t>, - field_name: String, + member: Member, field_id: Option, is_ascending: bool, query_tree: Option, + rtree: Option>, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, bucket_candidates: RoaringBitmap, @@ -34,29 +37,29 @@ impl<'t> AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_name: String, + member: Member, ) -> Result { - Self::new(index, rtxn, parent, field_name, true) + Self::new(index, rtxn, parent, member, true) } pub fn desc( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_name: String, + member: Member, ) -> Result { - Self::new(index, rtxn, parent, field_name, false) + Self::new(index, rtxn, parent, member, false) } fn new( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_name: String, + member: Member, is_ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = fields_ids_map.id(&field_name); + let field_id = member.field().and_then(|field| fields_ids_map.id(&field)); let faceted_candidates = match field_id { Some(field_id) => { let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; @@ -65,14 +68,16 @@ impl<'t> AscDesc<'t> { } None => RoaringBitmap::default(), }; + let rtree = index.geo_rtree(rtxn)?; Ok(AscDesc { index, rtxn, - field_name, + member, field_id, is_ascending, query_tree: None, + rtree, candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), faceted_candidates, @@ -92,7 +97,7 @@ impl<'t> Criterion for AscDesc<'t> { debug!( "Facet {}({}) iteration", if self.is_ascending { "Asc" } else { "Desc" }, - self.field_name + self.member ); match self.candidates.next().transpose()? { @@ -135,15 +140,31 @@ impl<'t> Criterion for AscDesc<'t> { } self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = match self.field_id { - Some(field_id) => facet_ordered( - self.index, - self.rtxn, - field_id, - self.is_ascending, - candidates & &self.faceted_candidates, - )?, - None => Box::new(std::iter::empty()), + + match &self.member { + Member::Field(field_name) => { + self.candidates = match self.field_id { + Some(field_id) => facet_ordered( + self.index, + self.rtxn, + field_id, + self.is_ascending, + candidates & &self.faceted_candidates, + )?, + None => Box::new(std::iter::empty()), + } + } + Member::Geo(point) => { + self.candidates = match &self.rtree { + Some(rtree) => { + // TODO: TAMO how to remove that? + let rtree = Box::new(rtree.clone()); + let rtree = Box::leak(rtree); + geo_point(rtree, candidates, point.clone())? + } + None => Box::new(std::iter::empty()), + } + } }; } None => return Ok(None), @@ -163,6 +184,22 @@ impl<'t> Criterion for AscDesc<'t> { } } +fn geo_point<'t>( + rtree: &'t RTree, + candidates: RoaringBitmap, + point: [f64; 2], +) -> Result> + 't>> { + Ok(Box::new( + rtree + .nearest_neighbor_iter_with_distance_2(&point) + .filter_map(move |(point, _distance)| { + candidates.contains(point.data).then(|| point.data) + }) + .map(|id| std::iter::once(id).collect::()) + .map(Ok), + )) +} + /// Returns an iterator over groups of the given candidates in ascending or descending order. /// /// It will either use an iterative or a recursive method on the whole facet database depending diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 2a883de67..92c0d284a 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -12,7 +12,7 @@ use self::r#final::Final; use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; -use crate::criterion::AscDesc as AscDescName; +use crate::criterion::{AscDesc as AscDescName, Member}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; @@ -294,13 +294,13 @@ impl<'t> CriteriaBuilder<'t> { &self.index, &self.rtxn, criterion, - field.to_string(), + field.clone(), )?), AscDescName::Desc(field) => Box::new(AscDesc::desc( &self.index, &self.rtxn, criterion, - field.to_string(), + field.clone(), )?), }; } @@ -312,10 +312,10 @@ impl<'t> CriteriaBuilder<'t> { Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => { - Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) + Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, Member::Field(field))?) } Name::Desc(field) => { - Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) + Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, Member::Field(field))?) } }; } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 207f46f8a..f752f5822 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -148,13 +148,15 @@ impl<'a> Search<'a> { if let Some(sort_criteria) = &self.sort_criteria { let sortable_fields = self.index.sortable_fields(self.rtxn)?; for asc_desc in sort_criteria { - let field = asc_desc.field(); - if !sortable_fields.contains(field) { - return Err(UserError::InvalidSortableAttribute { - field: field.to_string(), - valid_fields: sortable_fields, + // we are not supposed to find any geoPoint in the criterion + if let Some(field) = asc_desc.field() { + if !sortable_fields.contains(field) { + return Err(UserError::InvalidSortableAttribute { + field: field.to_string(), + valid_fields: sortable_fields, + } + .into()); } - .into()); } } } From 4820ac71a66de224979a03260c20bbb03796cf56 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 31 Aug 2021 18:48:40 +0200 Subject: [PATCH 16/42] allow spaces in a geoRadius --- milli/src/criterion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 2bca6948b..35268461e 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -84,7 +84,7 @@ impl FromStr for Member { .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; let point = point .split(',') - .map(|el| el.parse()) + .map(|el| el.trim()).parse() .collect::, _>>() .map_err(|_| UserError::InvalidCriterionName { name: text.to_string() })?; Ok(Member::Geo([point[0], point[1]])) From 7483614b75b8ea57e3960f5ac94c62a6acb7a8b8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 31 Aug 2021 18:49:06 +0200 Subject: [PATCH 17/42] [HTTP-UI] add the sorters --- http-ui/src/main.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 108ec0549..89f3dcab2 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -695,6 +695,7 @@ async fn main() -> anyhow::Result<()> { struct QueryBody { query: Option, filters: Option, + sorters: Option, facet_filters: Option, String>>>, facet_distribution: Option, limit: Option, @@ -754,6 +755,10 @@ async fn main() -> anyhow::Result<()> { search.limit(limit); } + if let Some(sort) = query.sorters { + search.sort_criteria(vec![sort.parse().unwrap()]); + } + let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); From dc84ecc40b16eaf3ced19b9b8282d4723525e431 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 10:56:19 +0200 Subject: [PATCH 18/42] fix a bug --- milli/src/criterion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 35268461e..ea3214c8e 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -84,7 +84,7 @@ impl FromStr for Member { .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; let point = point .split(',') - .map(|el| el.trim()).parse() + .map(|el| el.trim().parse()) .collect::, _>>() .map_err(|_| UserError::InvalidCriterionName { name: text.to_string() })?; Ok(Member::Geo([point[0], point[1]])) From a8a1f5bd556aa28c7e1e0ea304a2919a1deb6161 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 15:14:23 +0200 Subject: [PATCH 19/42] move the geosearch criteria out of asc_desc.rs --- milli/src/search/criteria/asc_desc.rs | 75 +++++------------ milli/src/search/criteria/geo.rs | 115 ++++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 40 +++++---- 3 files changed, 160 insertions(+), 70 deletions(-) create mode 100644 milli/src/search/criteria/geo.rs diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index b0951f655..6d50c1bb5 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -4,14 +4,12 @@ use itertools::Itertools; use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; -use rstar::RTree; use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::criterion::Member; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::{FacetNumberIter, FacetStringIter}; use crate::search::query_tree::Operation; -use crate::{FieldId, GeoPoint, Index, Result}; +use crate::{FieldId, Index, Result}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. @@ -20,11 +18,10 @@ const CANDIDATES_THRESHOLD: u64 = 1000; pub struct AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn<'t>, - member: Member, + field_name: String, field_id: Option, is_ascending: bool, query_tree: Option, - rtree: Option>, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, bucket_candidates: RoaringBitmap, @@ -37,29 +34,29 @@ impl<'t> AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - member: Member, + field_name: String, ) -> Result { - Self::new(index, rtxn, parent, member, true) + Self::new(index, rtxn, parent, field_name, true) } pub fn desc( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - member: Member, + field_name: String, ) -> Result { - Self::new(index, rtxn, parent, member, false) + Self::new(index, rtxn, parent, field_name, false) } fn new( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - member: Member, + field_name: String, is_ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = member.field().and_then(|field| fields_ids_map.id(&field)); + let field_id = fields_ids_map.id(&field_name); let faceted_candidates = match field_id { Some(field_id) => { let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; @@ -68,16 +65,14 @@ impl<'t> AscDesc<'t> { } None => RoaringBitmap::default(), }; - let rtree = index.geo_rtree(rtxn)?; Ok(AscDesc { index, rtxn, - member, + field_name, field_id, is_ascending, query_tree: None, - rtree, candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), faceted_candidates, @@ -97,7 +92,7 @@ impl<'t> Criterion for AscDesc<'t> { debug!( "Facet {}({}) iteration", if self.is_ascending { "Asc" } else { "Desc" }, - self.member + self.field_name ); match self.candidates.next().transpose()? { @@ -140,31 +135,15 @@ impl<'t> Criterion for AscDesc<'t> { } self.allowed_candidates = &candidates - params.excluded_candidates; - - match &self.member { - Member::Field(field_name) => { - self.candidates = match self.field_id { - Some(field_id) => facet_ordered( - self.index, - self.rtxn, - field_id, - self.is_ascending, - candidates & &self.faceted_candidates, - )?, - None => Box::new(std::iter::empty()), - } - } - Member::Geo(point) => { - self.candidates = match &self.rtree { - Some(rtree) => { - // TODO: TAMO how to remove that? - let rtree = Box::new(rtree.clone()); - let rtree = Box::leak(rtree); - geo_point(rtree, candidates, point.clone())? - } - None => Box::new(std::iter::empty()), - } - } + self.candidates = match self.field_id { + Some(field_id) => facet_ordered( + self.index, + self.rtxn, + field_id, + self.is_ascending, + candidates & &self.faceted_candidates, + )?, + None => Box::new(std::iter::empty()), }; } None => return Ok(None), @@ -184,22 +163,6 @@ impl<'t> Criterion for AscDesc<'t> { } } -fn geo_point<'t>( - rtree: &'t RTree, - candidates: RoaringBitmap, - point: [f64; 2], -) -> Result> + 't>> { - Ok(Box::new( - rtree - .nearest_neighbor_iter_with_distance_2(&point) - .filter_map(move |(point, _distance)| { - candidates.contains(point.data).then(|| point.data) - }) - .map(|id| std::iter::once(id).collect::()) - .map(Ok), - )) -} - /// Returns an iterator over groups of the given candidates in ascending or descending order. /// /// It will either use an iterative or a recursive method on the whole facet database depending diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs new file mode 100644 index 000000000..740bcb3a8 --- /dev/null +++ b/milli/src/search/criteria/geo.rs @@ -0,0 +1,115 @@ +use roaring::RoaringBitmap; +use rstar::RTree; + +use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::{GeoPoint, Index, Result}; + +pub struct Geo<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + candidates: Box>, + allowed_candidates: RoaringBitmap, + bucket_candidates: RoaringBitmap, + rtree: Option>, + point: [f64; 2], +} + +impl<'t> Geo<'t> { + pub fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + point: [f64; 2], + ) -> Result { + let candidates = Box::new(std::iter::empty()); + let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?; + let bucket_candidates = RoaringBitmap::new(); + let rtree = index.geo_rtree(rtxn)?; + + Ok(Self { index, rtxn, parent, candidates, allowed_candidates, bucket_candidates, rtree, point }) + } +} + +impl<'t> Criterion for Geo<'t> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + // if there is no rtree we have nothing to returns + let rtree = match self.rtree.as_ref() { + Some(rtree) => rtree, + None => return Ok(None), + }; + + loop { + match self.candidates.next() { + Some(mut candidates) => { + candidates -= params.excluded_candidates; + self.allowed_candidates -= &candidates; + return Ok(Some(CriterionResult { + query_tree: None, + candidates: Some(candidates), + filtered_candidates: None, + bucket_candidates: Some(self.bucket_candidates.clone()), + })); + } + None => { + match self.parent.next(params)? { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + resolve_query_tree(&context, qt, params.wdcache)? + } + // TODO: TAMO: why are we doing this? + (None, None) => self.index.documents_ids(self.rtxn)?, + }; + + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + + match bucket_candidates { + // why not are we keeping elements from the previous bucket? + Some(bucket_candidates) => { + self.bucket_candidates |= bucket_candidates + } + None => self.bucket_candidates |= &candidates, + } + + if candidates.is_empty() { + continue; + } + let rtree = Box::new(rtree.clone()); + let rtree = Box::leak(rtree); + + self.allowed_candidates = &candidates - params.excluded_candidates; + self.candidates = geo_point(rtree, self.allowed_candidates.clone(), self.point)?; + } + None => return Ok(None), + } + } + } + } + } +} + +fn geo_point<'t>( + rtree: &'t RTree, + candidates: RoaringBitmap, + point: [f64; 2], +) -> Result + 't>> { + Ok(Box::new( + rtree + .nearest_neighbor_iter_with_distance_2(&point) + .filter_map(move |(point, _distance)| { + candidates.contains(point.data).then(|| point.data) + }) + .map(|id| std::iter::once(id).collect::()) + )) +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 92c0d284a..185761632 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -13,10 +13,12 @@ use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::criterion::{AscDesc as AscDescName, Member}; +use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; mod asc_desc; +mod geo; mod attribute; mod exactness; pub mod r#final; @@ -290,18 +292,28 @@ impl<'t> CriteriaBuilder<'t> { Some(ref sort_criteria) => { for asc_desc in sort_criteria { criterion = match asc_desc { - AscDescName::Asc(field) => Box::new(AscDesc::asc( - &self.index, - &self.rtxn, - criterion, - field.clone(), - )?), - AscDescName::Desc(field) => Box::new(AscDesc::desc( - &self.index, - &self.rtxn, - criterion, - field.clone(), - )?), + AscDescName::Asc(Member::Field(field)) => { + Box::new(AscDesc::asc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?) + } + AscDescName::Desc(Member::Field(field)) => { + Box::new(AscDesc::desc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?) + } + AscDescName::Asc(Member::Geo(point)) => { + Box::new(Geo::new(&self.index, &self.rtxn, criterion, point.clone())?) + } + AscDescName::Desc(Member::Geo(_point)) => { + panic!("You can't desc geosort"); // TODO: TAMO: remove this + } }; } criterion @@ -312,10 +324,10 @@ impl<'t> CriteriaBuilder<'t> { Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => { - Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, Member::Field(field))?) + Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) } Name::Desc(field) => { - Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, Member::Field(field))?) + Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) } }; } From aca707413c29d3e0c9d00e4549ae7c27a6735a43 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 17:02:51 +0200 Subject: [PATCH 20/42] remove the memory leak --- milli/src/search/criteria/geo.rs | 100 +++++++++++++++---------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index 740bcb3a8..bf3ae7aba 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -28,7 +28,16 @@ impl<'t> Geo<'t> { let bucket_candidates = RoaringBitmap::new(); let rtree = index.geo_rtree(rtxn)?; - Ok(Self { index, rtxn, parent, candidates, allowed_candidates, bucket_candidates, rtree, point }) + Ok(Self { + index, + rtxn, + parent, + candidates, + allowed_candidates, + bucket_candidates, + rtree, + point, + }) } } @@ -52,64 +61,55 @@ impl<'t> Criterion for Geo<'t> { bucket_candidates: Some(self.bucket_candidates.clone()), })); } - None => { - match self.parent.next(params)? { - Some(CriterionResult { - query_tree, - candidates, - filtered_candidates, - bucket_candidates, - }) => { - let mut candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - resolve_query_tree(&context, qt, params.wdcache)? - } - // TODO: TAMO: why are we doing this? - (None, None) => self.index.documents_ids(self.rtxn)?, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + resolve_query_tree(&context, qt, params.wdcache)? } + (None, None) => self.index.documents_ids(self.rtxn)?, + }; - match bucket_candidates { - // why not are we keeping elements from the previous bucket? - Some(bucket_candidates) => { - self.bucket_candidates |= bucket_candidates - } - None => self.bucket_candidates |= &candidates, - } - - if candidates.is_empty() { - continue; - } - let rtree = Box::new(rtree.clone()); - let rtree = Box::leak(rtree); - - self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = geo_point(rtree, self.allowed_candidates.clone(), self.point)?; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; } - None => return Ok(None), + + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + if candidates.is_empty() { + continue; + } + self.allowed_candidates = &candidates - params.excluded_candidates; + self.candidates = + geo_point(rtree, self.allowed_candidates.clone(), self.point); } - } + None => return Ok(None), + }, } } } } -fn geo_point<'t>( - rtree: &'t RTree, +fn geo_point( + rtree: &RTree, candidates: RoaringBitmap, point: [f64; 2], -) -> Result + 't>> { - Ok(Box::new( - rtree - .nearest_neighbor_iter_with_distance_2(&point) - .filter_map(move |(point, _distance)| { - candidates.contains(point.data).then(|| point.data) - }) - .map(|id| std::iter::once(id).collect::()) - )) +) -> Box> { + let results = rtree + .nearest_neighbor_iter_with_distance_2(&point) + .filter_map(move |(point, _distance)| candidates.contains(point.data).then(|| point.data)) + .map(|id| std::iter::once(id).collect::()) + .collect::>(); + + Box::new(results.into_iter()) } From b1bf7d4f405b418ed5c071e12a600b212b808c1d Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 17:03:42 +0200 Subject: [PATCH 21/42] reformat --- milli/src/search/criteria/mod.rs | 39 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 185761632..782fedcc8 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -18,10 +18,10 @@ use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; mod asc_desc; -mod geo; mod attribute; mod exactness; pub mod r#final; +mod geo; mod initial; mod proximity; mod typo; @@ -292,25 +292,24 @@ impl<'t> CriteriaBuilder<'t> { Some(ref sort_criteria) => { for asc_desc in sort_criteria { criterion = match asc_desc { - AscDescName::Asc(Member::Field(field)) => { - Box::new(AscDesc::asc( - &self.index, - &self.rtxn, - criterion, - field.to_string(), - )?) - } - AscDescName::Desc(Member::Field(field)) => { - Box::new(AscDesc::desc( - &self.index, - &self.rtxn, - criterion, - field.to_string(), - )?) - } - AscDescName::Asc(Member::Geo(point)) => { - Box::new(Geo::new(&self.index, &self.rtxn, criterion, point.clone())?) - } + AscDescName::Asc(Member::Field(field)) => Box::new(AscDesc::asc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?), + AscDescName::Desc(Member::Field(field)) => Box::new(AscDesc::desc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?), + AscDescName::Asc(Member::Geo(point)) => Box::new(Geo::new( + &self.index, + &self.rtxn, + criterion, + point.clone(), + )?), AscDescName::Desc(Member::Geo(_point)) => { panic!("You can't desc geosort"); // TODO: TAMO: remove this } From f0b74637dc2e41f8521946dc379def1866520b67 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 17:43:18 +0200 Subject: [PATCH 22/42] fix all the tests --- milli/src/lib.rs | 2 +- milli/tests/search/mod.rs | 6 +++--- milli/tests/search/query_criteria.rs | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 2a55b6f3a..a3cede1fd 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -21,7 +21,7 @@ use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; use serde_json::{Map, Value}; -pub use self::criterion::{default_criteria, AscDesc, Criterion}; +pub use self::criterion::{default_criteria, AscDesc, Criterion, Member}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c34434c4e..b4dfb3080 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -6,7 +6,7 @@ use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::update::{Settings, UpdateBuilder, UpdateFormat}; -use milli::{AscDesc, Criterion, DocumentId, Index}; +use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -99,11 +99,11 @@ pub fn expected_order( new_groups .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); } - Criterion::Sort if sort_by == [AscDesc::Asc(S("tag"))] => { + Criterion::Sort if sort_by == [AscDesc::Asc(Member::Field(S("tag")))] => { group.sort_by_key(|d| d.sort_by_rank); new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); } - Criterion::Sort if sort_by == [AscDesc::Desc(S("tag"))] => { + Criterion::Sort if sort_by == [AscDesc::Desc(Member::Field(S("tag")))] => { group.sort_by_key(|d| Reverse(d.sort_by_rank)); new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index c9720d652..cc08ec863 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -163,28 +163,28 @@ test_criterion!( DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], - vec![AscDesc::Asc(S("tag"))] + vec![AscDesc::Asc(Member::Field(S("tag")))] ); test_criterion!( sort_by_asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Sort], - vec![AscDesc::Asc(S("tag"))] + vec![AscDesc::Asc(Member::Field(S("tag")))] ); test_criterion!( sort_by_desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], - vec![AscDesc::Desc(S("tag"))] + vec![AscDesc::Desc(Member::Field(S("tag")))] ); test_criterion!( sort_by_desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Sort], - vec![AscDesc::Desc(S("tag"))] + vec![AscDesc::Desc(Member::Field(S("tag")))] ); test_criterion!( default_criteria_order, From e8c093c1d028ce28277d3e92f3111493d3054225 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 2 Sep 2021 15:55:19 +0200 Subject: [PATCH 23/42] fix the error handling in the filters --- milli/src/search/facet/filter_condition.rs | 25 +++++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 66a2ffac7..f8ea2ca74 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -135,7 +135,7 @@ impl FilterCondition { Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), Rule::less => Ok(Self::lower_than(fim, ff, pair)?), Rule::between => Ok(Self::between(fim, ff, pair)?), - Rule::geo_radius => Ok(Self::geo_radius(fim, pair)?), + Rule::geo_radius => Ok(Self::geo_radius(fim, ff, pair)?), Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), @@ -161,7 +161,22 @@ impl FilterCondition { } } - fn geo_radius(fields_ids_map: &FieldsIdsMap, item: Pair) -> Result { + fn geo_radius( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + if !filterable_fields.contains("_geo") { + return Err(UserError::InvalidFilterAttribute(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `_geo` is not filterable, available filterable attributes are: {}", + filterable_fields.iter().join(", "), + ), + }, + item.as_span(), + )))?; + } let mut items = item.into_inner(); let fid = match fields_ids_map.id("_geo") { Some(fid) => fid, @@ -463,9 +478,6 @@ impl FilterCondition { LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), GeoLowerThan(point, distance) => { - if index.filterable_fields(rtxn)?.contains("_geo") { - Err(UserError::AttributeLimitReached)?; // TODO: TAMO use a real error - } let mut result = RoaringBitmap::new(); let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, @@ -480,9 +492,6 @@ impl FilterCondition { return Ok(result); } GeoGreaterThan(point, distance) => { - if index.filterable_fields(rtxn)?.contains("_geo") { - Err(UserError::AttributeLimitReached)?; // TODO: TAMO use a real error - } let result = Self::evaluate_operator( rtxn, index, From bd4c2482921c2501f3c78ef52a160f9fe0950534 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 2 Sep 2021 15:57:40 +0200 Subject: [PATCH 24/42] improve the error handling in general and introduce the concept of reserved keywords --- milli/src/criterion.rs | 22 ++++++++++--------- milli/src/error.rs | 18 +++++++++++++++ .../extract/extract_geo_points.rs | 11 +++++----- .../src/update/index_documents/extract/mod.rs | 10 ++++++++- milli/src/update/index_documents/mod.rs | 4 ++++ .../src/update/index_documents/typed_chunk.rs | 2 +- 6 files changed, 50 insertions(+), 17 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index ea3214c8e..29c477473 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -3,7 +3,7 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; -use crate::error::{Error, UserError}; +use crate::error::{is_reserved_keyword, Error, UserError}; #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { @@ -50,18 +50,20 @@ impl FromStr for Criterion { "sort" => Ok(Criterion::Sort), "exactness" => Ok(Criterion::Exactness), text => match AscDesc::from_str(text) { - Ok(AscDesc::Asc(field)) => Ok(Criterion::Asc(field)), - Ok(AscDesc::Desc(field)) => Ok(Criterion::Desc(field)), + Ok(AscDesc::Asc(Member::Field(field))) if is_reserved_keyword(&field) => { + Err(UserError::InvalidReservedRankingRuleName { name: text.to_string() })? + } + Ok(AscDesc::Asc(Member::Field(field))) => Ok(Criterion::Asc(field)), + Ok(AscDesc::Desc(Member::Field(field))) => Ok(Criterion::Desc(field)), + Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { + Err(UserError::InvalidRankingRuleName { name: text.to_string() })? + } Err(UserError::InvalidAscDescSyntax { name }) => { Err(UserError::InvalidCriterionName { name }.into()) } Err(error) => { Err(UserError::InvalidCriterionName { name: error.to_string() }.into()) } - Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { - Err(UserError::AttributeLimitReached)? // TODO: TAMO: use a real error - } - Err(error) => Err(error.into()), }, } } @@ -81,12 +83,12 @@ impl FromStr for Member { let point = text.strip_prefix("_geoPoint(") .and_then(|point| point.strip_suffix(")")) - .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; + .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() })?; let point = point .split(',') .map(|el| el.trim().parse()) .collect::, _>>() - .map_err(|_| UserError::InvalidCriterionName { name: text.to_string() })?; + .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() })?; Ok(Member::Geo([point[0], point[1]])) } else { Ok(Member::Field(text.to_string())) @@ -147,7 +149,7 @@ impl FromStr for AscDesc { match text.rsplit_once(':') { Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)), Some((left, "desc")) => Ok(AscDesc::Desc(left.parse()?)), - _ => Err(UserError::InvalidCriterionName { name: text.to_string() }), + _ => Err(UserError::InvalidRankingRuleName { name: text.to_string() }), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 3f473a673..f4601ea9a 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -12,6 +12,12 @@ use crate::{DocumentId, FieldId}; pub type Object = Map; +const RESERVED_KEYWORD: &[&'static str] = &["_geo", "_geoDistance"]; + +pub fn is_reserved_keyword(keyword: &str) -> bool { + RESERVED_KEYWORD.contains(&keyword) +} + #[derive(Debug)] pub enum Error { InternalError(InternalError), @@ -60,6 +66,9 @@ pub enum UserError { InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), InvalidSortName { name: String }, + InvalidGeoField { document_id: Value, object: Value }, + InvalidRankingRuleName { name: String }, + InvalidReservedRankingRuleName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, @@ -222,6 +231,15 @@ impl fmt::Display for UserError { write!(f, "invalid asc/desc syntax for {}", name) } Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), + Self::InvalidGeoField { document_id, object } => write!( + f, + "the document with the id: {} contains an invalid _geo field: {}", + document_id, object + ), + Self::InvalidRankingRuleName { name } => write!(f, "invalid criterion {}", name), + Self::InvalidReservedRankingRuleName { name } => { + write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name) + } Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); write!( diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 88ae7c177..c4bdce211 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -2,11 +2,10 @@ use std::fs::File; use std::io; use concat_arrays::concat_arrays; -use log::warn; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; -use crate::{FieldId, InternalError, Result}; +use crate::{FieldId, InternalError, Result, UserError}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. /// @@ -14,6 +13,7 @@ use crate::{FieldId, InternalError, Result}; pub fn extract_geo_points( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, + primary_key_id: FieldId, geo_field_id: FieldId, ) -> Result> { let mut writer = tempfile::tempfile().and_then(|file| { @@ -33,9 +33,10 @@ pub fn extract_geo_points( let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else { - // TAMO: improve the warn - warn!("Malformed `_geo` field"); - continue; + let primary_key = obkv.get(primary_key_id).unwrap(); // TODO: TAMO: is this valid? + let primary_key = + serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?; + Err(UserError::InvalidGeoField { document_id: primary_key, object: point })? } } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4cb21c8e4..36e3c870f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -39,6 +39,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender>, searchable_fields: Option>, faceted_fields: HashSet, + primary_key_id: FieldId, geo_field_id: Option, stop_words: Option>, ) -> Result<()> { @@ -51,6 +52,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), &searchable_fields, &faceted_fields, + primary_key_id, geo_field_id, &stop_words, ) @@ -172,6 +174,7 @@ fn extract_documents_data( lmdb_writer_sx: Sender>, searchable_fields: &Option>, faceted_fields: &HashSet, + primary_key_id: FieldId, geo_field_id: Option, stop_words: &Option>, ) -> Result<( @@ -186,7 +189,12 @@ fn extract_documents_data( let documents_chunk_cloned = documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { - let _ = match extract_geo_points(documents_chunk_cloned, indexer, geo_field_id) { + let _ = match extract_geo_points( + documents_chunk_cloned, + indexer, + primary_key_id, + geo_field_id, + ) { Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), Err(error) => lmdb_writer_sx_cloned.send(Err(error)), }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d4fd3570e..38eea954b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -228,6 +228,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { Receiver>, ) = crossbeam_channel::unbounded(); + // get the primary key field id + let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); // TODO: TAMO: is this unwrap 100% valid? + // get searchable fields for word databases let searchable_fields = self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); @@ -269,6 +272,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { lmdb_writer_sx.clone(), searchable_fields, faceted_fields, + primary_key_id, geo_field_id, stop_words, ) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b09bee213..5c27c195f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -180,7 +180,7 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::GeoPoints(mut geo_points) => { - // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function + // TODO: we should create the rtree with the `RTree::bulk_load` function let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut doc_ids = index.geo_faceted_documents_ids(wtxn)?; From ebf82ac28cec7c5cdb3997fea20c083baff8bb36 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 6 Sep 2021 17:07:34 +0200 Subject: [PATCH 25/42] improve the error messages and add tests for the filters --- milli/src/search/facet/filter_condition.rs | 125 +++++++++++++++++++-- milli/src/search/facet/grammar.pest | 4 +- 2 files changed, 119 insertions(+), 10 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f8ea2ca74..bfcf7d9c7 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -182,15 +182,42 @@ impl FilterCondition { Some(fid) => fid, None => return Ok(Empty), }; - let (lat_result, _) = pest_parse(items.next().unwrap()); - let (lng_result, _) = pest_parse(items.next().unwrap()); - let lat = lat_result.map_err(UserError::InvalidFilter)?; - let lng = lng_result.map_err(UserError::InvalidFilter)?; - let point = [lat, lng]; - let (distance_result, _) = pest_parse(items.next().unwrap()); - let distance = distance_result.map_err(UserError::InvalidFilter)?; - - Ok(Operator(fid, GeoLowerThan(point, distance))) + let parameters_item = items.next().unwrap(); + // We don't need more than 3 parameters, but to handle errors correctly we are still going + // to extract the first 4 parameters + let param_span = parameters_item.as_span(); + let parameters = parameters_item + .into_inner() + .take(4) + .map(|param| (param.clone(), param.as_span())) + .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) + .collect::, _>>() + .map_err(UserError::InvalidFilter)?; + if parameters.len() != 3 { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), + }, + // we want to point to the last parameters and if there was no parameters we + // point to the parenthesis + parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), + )))?; + } + let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); + if let Some(span) = (!(-181.0..181.).contains(&lat.0)) + .then(|| &lat.1) + .or((!(-181.0..181.).contains(&lng.0)).then(|| &lng.1)) + { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "Latitude and longitude must be contained between -180 to 180 degrees." + ), + }, + span.clone(), + )))?; + } + Ok(Operator(fid, GeoLowerThan([lat.0, lng.0], distance))) } fn between( @@ -726,6 +753,86 @@ mod tests { assert_eq!(condition, expected); } + #[test] + fn geo_radius() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // basic test + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); + assert_eq!(condition, expected); + + // test the negation of the GeoLowerThan + let condition = + FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); + let expected = Operator(0, GeoGreaterThan([50., 18.], 2000.500)); + assert_eq!(condition, expected); + + // composition of multiple operations + let condition = FilterCondition::from_str( + &rtxn, + &index, + "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", + ) + .unwrap(); + let expected = Or( + Box::new(And( + Box::new(Operator(0, GeoGreaterThan([1., 2.], 300.))), + Box::new(Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), + )), + Box::new(Operator(1, LowerThanOrEqual(10.))), + ); + assert_eq!(condition, expected); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have enough parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have too many parameters + let result = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-200, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 181, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + } + #[test] fn from_array() { let path = tempfile::tempdir().unwrap(); diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest index 10783b632..973fb5156 100644 --- a/milli/src/search/facet/grammar.pest +++ b/milli/src/search/facet/grammar.pest @@ -8,6 +8,8 @@ char = _{ !(PEEK | "\\") ~ ANY | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} +// we deliberately choose to allow empty parameters to generate more specific error message later +parameters ={"(" ~ (value ~ ",")* ~ value? ~ ")"} condition = _{between | eq | greater | less | geq | leq | neq} between = {key ~ value ~ "TO" ~ value} geq = {key ~ ">=" ~ value} @@ -16,7 +18,7 @@ neq = {key ~ "!=" ~ value} eq = {key ~ "=" ~ value} greater = {key ~ ">" ~ value} less = {key ~ "<" ~ value} -geo_radius = {"_geoRadius(" ~ value ~ "," ~ value ~ "," ~ value ~ ")"} +geo_radius = {"_geoRadius" ~ parameters } prgm = {SOI ~ expr ~ EOI} expr = _{ ( term ~ (operation ~ term)* ) } From 6d5762a6c85bc0d48acb579a13ef27c79aa01288 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 6 Sep 2021 17:28:49 +0200 Subject: [PATCH 26/42] handle the case where you forgot entirely the parenthesis --- milli/src/search/facet/filter_condition.rs | 6 ++++++ milli/src/search/facet/grammar.pest | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index bfcf7d9c7..c6dbbf056 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -797,6 +797,12 @@ mod tests { ); assert_eq!(condition, expected); + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + // georadius don't have any parameters let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); assert!(result.is_err()); diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest index 973fb5156..8285f81a6 100644 --- a/milli/src/search/facet/grammar.pest +++ b/milli/src/search/facet/grammar.pest @@ -9,7 +9,7 @@ char = _{ !(PEEK | "\\") ~ ANY | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} // we deliberately choose to allow empty parameters to generate more specific error message later -parameters ={"(" ~ (value ~ ",")* ~ value? ~ ")"} +parameters ={("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} condition = _{between | eq | greater | less | geq | leq | neq} between = {key ~ value ~ "TO" ~ value} geq = {key ~ ">=" ~ value} From 7ae2a7341c270f6d84e66e7803017b34203d36d9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 6 Sep 2021 18:15:20 +0200 Subject: [PATCH 27/42] introduce the reserved keywords in the filters --- milli/src/search/facet/filter_condition.rs | 19 +++++++++++++++++++ milli/src/search/facet/grammar.pest | 7 ++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index c6dbbf056..a36fddb01 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -595,6 +595,18 @@ fn field_id( ) -> StdResult, PestError> { // lexing ensures that we at least have a key let key = items.next().unwrap(); + if key.as_rule() == Rule::reserved { + return Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression. Available filterable attributes are: {}", + key.as_str(), + filterable_fields.iter().join(", "), + ), + }, + key.as_span(), + )); + } if !filterable_fields.contains(key.as_str()) { return Err(PestError::new_from_span( @@ -671,6 +683,13 @@ mod tests { let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); + + let result = FilterCondition::from_str(&rtxn, &index, "_geo = France"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains( + "`_geo` is a reserved keyword and thus can't be used as a filter expression." + )); } #[test] diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest index 8285f81a6..d07d5bca5 100644 --- a/milli/src/search/facet/grammar.pest +++ b/milli/src/search/facet/grammar.pest @@ -1,5 +1,5 @@ -key = _{quoted | word} -value = _{quoted | word} +key = _{reserved | quoted | word } +value = _{quoted | word } quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP } string = {char*} word = ${(LETTER | NUMBER | "_" | "-" | ".")+} @@ -8,8 +8,9 @@ char = _{ !(PEEK | "\\") ~ ANY | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} +reserved = { "_geo" | "_geoDistance" | "_geoPoint" | ("_geoPoint" ~ parameters) } // we deliberately choose to allow empty parameters to generate more specific error message later -parameters ={("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} +parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} condition = _{between | eq | greater | less | geq | leq | neq} between = {key ~ value ~ "TO" ~ value} geq = {key ~ ">=" ~ value} From 4f69b190bc31620c1a03ade5f154a85a4c1fa850 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 7 Sep 2021 11:49:27 +0200 Subject: [PATCH 28/42] remove the distance from the search, the computation of the distance will be made on meilisearch side --- milli/src/search/criteria/geo.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index bf3ae7aba..6f8f1406a 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -106,8 +106,8 @@ fn geo_point( point: [f64; 2], ) -> Box> { let results = rtree - .nearest_neighbor_iter_with_distance_2(&point) - .filter_map(move |(point, _distance)| candidates.contains(point.data).then(|| point.data)) + .nearest_neighbor_iter(&point) + .filter_map(move |point| candidates.contains(point.data).then(|| point.data)) .map(|id| std::iter::once(id).collect::()) .collect::>(); From e5ef0cad9a78d6db3aa94c4f83486b9ee7ab5f8c Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 7 Sep 2021 12:11:03 +0200 Subject: [PATCH 29/42] use meters in the filters --- milli/Cargo.toml | 1 + milli/src/lib.rs | 8 ++++++++ milli/src/search/facet/filter_condition.rs | 13 ++++++++----- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d2767afd4..be507332e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -16,6 +16,7 @@ flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" grenad = { version = "0.3.1", default-features = false, features = ["tempfile"] } +haversine = "0.2.1" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a3cede1fd..3c7713308 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -142,6 +142,14 @@ where Some((head, tail)) } +/// Return the distance between two points in meters +fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { + let a = haversine::Location { latitude: a[0], longitude: a[1] }; + let b = haversine::Location { latitude: b[0], longitude: b[1] }; + + haversine::distance(a, b, haversine::Units::Kilometers) * 1000. +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index a36fddb01..08a84899f 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -504,17 +504,20 @@ impl FilterCondition { LowerThan(val) => (Included(f64::MIN), Excluded(*val)), LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), - GeoLowerThan(point, distance) => { + GeoLowerThan(base_point, distance) => { let mut result = RoaringBitmap::new(); let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, None => return Ok(result), }; - let iter = rtree - .nearest_neighbor_iter_with_distance_2(point) - .take_while(|(_, dist)| dist <= distance); - iter.for_each(|(point, _)| drop(result.insert(point.data))); + rtree + .nearest_neighbor_iter(base_point) + .take_while(|point| { + dbg!(crate::distance_between_two_points(base_point, point.geom())) + < *distance + }) + .for_each(|point| drop(result.insert(point.data))); return Ok(result); } From 2988d3c76d8a03765bc5aa94ab4f930190686adc Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 8 Sep 2021 13:08:48 +0200 Subject: [PATCH 30/42] tests the geo filters --- milli/tests/assets/test_set.ndjson | 34 +++++++++++++++--------------- milli/tests/search/filters.rs | 5 +++++ milli/tests/search/mod.rs | 6 ++++++ 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 89d9f1109..9a0fe5b0a 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} -{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} -{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} -{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","":""} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":""} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":""} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":""} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":""} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":""} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":""} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":""} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""} +{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""} +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""} +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":""} +{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":""} +{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index c810b47af..d992a8e95 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -47,6 +47,11 @@ test_filter!(eq_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank=1")] test_filter!(eq_string_or_filter, vec![Left(vec!["tag=red", "tag=green"])]); test_filter!(eq_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank=1"])]); test_filter!(eq_number_or_filter, vec![Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); +test_filter!(geo_radius, vec![Right("_geoRadius(50.630010347667806, 3.086251829166809, 100000)")]); +test_filter!( + not_geo_radius, + vec![Right("NOT _geoRadius(50.630010347667806, 3.086251829166809, 1000000)")] +); test_filter!(eq_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank=3")]); test_filter!( eq_complex_filter_2, diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index b4dfb3080..e3f6c5b09 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -37,6 +37,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.set_filterable_fields(hashset! { S("tag"), S("asc_desc_rank"), + S("_geo"), }); builder.set_sortable_fields(hashset! { S("tag"), @@ -162,6 +163,10 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { if document.asc_desc_rank > filter.parse().unwrap() { id = Some(document.id.clone()) } + } else if filter.starts_with("_geoRadius") { + id = (document.geo_rank < 100000).then(|| document.id.clone()); + } else if filter.starts_with("NOT _geoRadius") { + id = (document.geo_rank > 1000000).then(|| document.id.clone()); } id } @@ -205,6 +210,7 @@ pub struct TestDocument { pub exact_rank: u32, pub asc_desc_rank: u32, pub sort_by_rank: u32, + pub geo_rank: u32, pub title: String, pub description: String, pub tag: String, From 4b618b95e409a08a615aeabaf46b8ef40c69f099 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 8 Sep 2021 17:11:29 +0200 Subject: [PATCH 31/42] rebase on main --- milli/tests/search/query_criteria.rs | 2 +- milli/tests/search/sort.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index cc08ec863..f6a937f67 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -5,7 +5,7 @@ use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; use milli::update::{Settings, UpdateBuilder, UpdateFormat}; -use milli::{AscDesc, Criterion, Index, Search, SearchResult}; +use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use rand::Rng; use Criterion::*; diff --git a/milli/tests/search/sort.rs b/milli/tests/search/sort.rs index fe87f0623..86404bb99 100644 --- a/milli/tests/search/sort.rs +++ b/milli/tests/search/sort.rs @@ -1,6 +1,6 @@ use big_s::S; use milli::Criterion::{Attribute, Exactness, Proximity, Typo, Words}; -use milli::{AscDesc, Error, Search, UserError}; +use milli::{AscDesc, Error, Member, Search, UserError}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -16,7 +16,7 @@ fn sort_ranking_rule_missing() { search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); search.optional_words(true); - search.sort_criteria(vec![AscDesc::Asc(S("tag"))]); + search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]); let result = search.execute(); assert!(matches!(result, Err(Error::UserError(UserError::SortRankingRuleMissing)))); From b15c77ebc4608953e3009e99c9b2be8aaac84d1a Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 8 Sep 2021 18:08:51 +0200 Subject: [PATCH 32/42] return an error in case a user try to sort with :desc --- milli/src/search/criteria/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 782fedcc8..fca159900 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -15,7 +15,7 @@ use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::criterion::{AscDesc as AscDescName, Member}; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; -use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; +use crate::{DocumentId, FieldId, Index, Result, TreeLevel, UserError}; mod asc_desc; mod attribute; @@ -311,7 +311,7 @@ impl<'t> CriteriaBuilder<'t> { point.clone(), )?), AscDescName::Desc(Member::Geo(_point)) => { - panic!("You can't desc geosort"); // TODO: TAMO: remove this + return Err(UserError::InvalidSortName { name: "Sorting in descending order is currently not supported for the geosearch".to_string() })? } }; } From bad8ea47d5884eaf862adacf33377bf8d8672049 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 8 Sep 2021 18:12:10 +0200 Subject: [PATCH 33/42] edit the two lasts TODO comments --- milli/src/update/index_documents/extract/extract_geo_points.rs | 3 ++- milli/src/update/index_documents/mod.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index c4bdce211..1af22d010 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -33,7 +33,8 @@ pub fn extract_geo_points( let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else { - let primary_key = obkv.get(primary_key_id).unwrap(); // TODO: TAMO: is this valid? + // All document must have a primary key so we can unwrap safely here + let primary_key = obkv.get(primary_key_id).unwrap(); let primary_key = serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?; Err(UserError::InvalidGeoField { document_id: primary_key, object: point })? diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 38eea954b..d3b8e47b0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -229,7 +229,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { ) = crossbeam_channel::unbounded(); // get the primary key field id - let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); // TODO: TAMO: is this unwrap 100% valid? + let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); // get searchable fields for word databases let searchable_fields = From c81ff22c5b980eab5a5b41bb945abac093f31b4e Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 8 Sep 2021 19:17:00 +0200 Subject: [PATCH 34/42] delete the invalid criterion name error in favor of invalid ranking rule name --- milli/src/criterion.rs | 4 ++-- milli/src/error.rs | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 29c477473..e05829eb4 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -59,10 +59,10 @@ impl FromStr for Criterion { Err(UserError::InvalidRankingRuleName { name: text.to_string() })? } Err(UserError::InvalidAscDescSyntax { name }) => { - Err(UserError::InvalidCriterionName { name }.into()) + Err(UserError::InvalidRankingRuleName { name }.into()) } Err(error) => { - Err(UserError::InvalidCriterionName { name: error.to_string() }.into()) + Err(UserError::InvalidRankingRuleName { name: error.to_string() }.into()) } }, } diff --git a/milli/src/error.rs b/milli/src/error.rs index f4601ea9a..157fe4be9 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -60,7 +60,6 @@ pub enum UserError { Csv(csv::Error), DocumentLimitReached, InvalidAscDescSyntax { name: String }, - InvalidCriterionName { name: String }, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, InvalidFilter(pest::error::Error), @@ -230,7 +229,6 @@ impl fmt::Display for UserError { Self::InvalidAscDescSyntax { name } => { write!(f, "invalid asc/desc syntax for {}", name) } - Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), Self::InvalidGeoField { document_id, object } => write!( f, "the document with the id: {} contains an invalid _geo field: {}", From a84f3a8b317eecfc09ce9361e64c0ea5171da6d0 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 9 Sep 2021 12:20:08 +0200 Subject: [PATCH 35/42] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- http-ui/src/main.rs | 4 +-- milli/src/criterion.rs | 18 ++++++---- milli/src/error.rs | 4 +-- milli/src/index.rs | 17 +++++---- milli/src/lib.rs | 2 +- milli/src/search/criteria/geo.rs | 8 +++-- milli/src/search/criteria/mod.rs | 4 ++- milli/src/search/facet/filter_condition.rs | 36 ++++++++++--------- milli/src/update/delete_documents.rs | 7 ++-- .../extract/extract_geo_points.rs | 5 ++- .../src/update/index_documents/extract/mod.rs | 9 ++--- milli/src/update/index_documents/mod.rs | 17 +++++---- .../src/update/index_documents/typed_chunk.rs | 15 ++++---- 13 files changed, 77 insertions(+), 69 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 89f3dcab2..5dbb0c326 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -695,7 +695,7 @@ async fn main() -> anyhow::Result<()> { struct QueryBody { query: Option, filters: Option, - sorters: Option, + sort: Option, facet_filters: Option, String>>>, facet_distribution: Option, limit: Option, @@ -755,7 +755,7 @@ async fn main() -> anyhow::Result<()> { search.limit(limit); } - if let Some(sort) = query.sorters { + if let Some(sort) = query.sort { search.sort_criteria(vec![sort.parse().unwrap()]); } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index e05829eb4..b95080b4b 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -84,12 +84,16 @@ impl FromStr for Member { text.strip_prefix("_geoPoint(") .and_then(|point| point.strip_suffix(")")) .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() })?; - let point = point - .split(',') - .map(|el| el.trim().parse()) - .collect::, _>>() - .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() })?; - Ok(Member::Geo([point[0], point[1]])) + let (lat, long) = point + .split_once(',') + .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() }) + .and_then(|(lat, long)| { + lat.trim() + .parse() + .and_then(|lat| long.trim().parse().map(|long| (lat, long))) + .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() }) + })?; + Ok(Member::Geo([lat, long])) } else { Ok(Member::Field(text.to_string())) } @@ -99,7 +103,7 @@ impl FromStr for Member { impl fmt::Display for Member { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Member::Field(name) => write!(f, "{}", name), + Member::Field(name) => f.write_str(name), Member::Geo([lat, lng]) => write!(f, "_geoPoint({}, {})", lat, lng), } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 157fe4be9..21b77b5a7 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -12,10 +12,8 @@ use crate::{DocumentId, FieldId}; pub type Object = Map; -const RESERVED_KEYWORD: &[&'static str] = &["_geo", "_geoDistance"]; - pub fn is_reserved_keyword(keyword: &str) -> bool { - RESERVED_KEYWORD.contains(&keyword) + ["_geo", "_geoDistance"].contains(&keyword) } #[derive(Debug)] diff --git a/milli/src/index.rs b/milli/src/index.rs index f2ddba699..4ab19f175 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -299,6 +299,7 @@ impl Index { /* geo rtree */ + /// Writes the provided `rtree` which associates coordinates to documents ids. pub(crate) fn put_geo_rtree( &self, wtxn: &mut RwTxn, @@ -307,10 +308,12 @@ impl Index { self.main.put::<_, Str, SerdeBincode>>(wtxn, main_key::GEO_RTREE_KEY, rtree) } + /// Delete the `rtree` which associates coordinates to documents ids. pub(crate) fn delete_geo_rtree(&self, wtxn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(wtxn, main_key::GEO_RTREE_KEY) } + /// Returns the `rtree` which associates coordinates to documents ids. pub fn geo_rtree<'t>(&self, rtxn: &'t RoTxn) -> Result>> { match self .main @@ -323,7 +326,7 @@ impl Index { /* geo faceted */ - /// Writes the documents ids that are faceted with a _geo field + /// Writes the documents ids that are faceted with a _geo field. pub(crate) fn put_geo_faceted_documents_ids( &self, wtxn: &mut RwTxn, @@ -336,16 +339,12 @@ impl Index { ) } - /// Delete the documents ids that are faceted with a _geo field - pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result<()> { - self.main.put::<_, Str, RoaringBitmapCodec>( - wtxn, - main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, - &RoaringBitmap::new(), - ) + /// Delete the documents ids that are faceted with a _geo field. + pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY) } - /// Retrieve all the documents ids that faceted with a _geo field + /// Retrieve all the documents ids that faceted with a _geo field. pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { match self .main diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 3c7713308..4f066365c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -142,7 +142,7 @@ where Some((head, tail)) } -/// Return the distance between two points in meters +/// Return the distance between two points in meters. fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { let a = haversine::Location { latitude: a[0], longitude: a[1] }; let b = haversine::Location { latitude: b[0], longitude: b[1] }; diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index 6f8f1406a..78f741b57 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -1,3 +1,5 @@ +use std::iter; + use roaring::RoaringBitmap; use rstar::RTree; @@ -23,7 +25,7 @@ impl<'t> Geo<'t> { parent: Box, point: [f64; 2], ) -> Result { - let candidates = Box::new(std::iter::empty()); + let candidates = Box::new(iter::empty()); let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?; let bucket_candidates = RoaringBitmap::new(); let rtree = index.geo_rtree(rtxn)?; @@ -41,7 +43,7 @@ impl<'t> Geo<'t> { } } -impl<'t> Criterion for Geo<'t> { +impl Criterion for Geo<'_> { fn next(&mut self, params: &mut CriterionParameters) -> Result> { // if there is no rtree we have nothing to returns let rtree = match self.rtree.as_ref() { @@ -108,7 +110,7 @@ fn geo_point( let results = rtree .nearest_neighbor_iter(&point) .filter_map(move |point| candidates.contains(point.data).then(|| point.data)) - .map(|id| std::iter::once(id).collect::()) + .map(|id| iter::once(id).collect::()) .collect::>(); Box::new(results.into_iter()) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index fca159900..105a69194 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -311,7 +311,9 @@ impl<'t> CriteriaBuilder<'t> { point.clone(), )?), AscDescName::Desc(Member::Geo(_point)) => { - return Err(UserError::InvalidSortName { name: "Sorting in descending order is currently not supported for the geosearch".to_string() })? + return Err(UserError::InvalidSortName { + name: "Sorting in descending order is currently not supported for the geosearch".to_string(), + })? } }; } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 08a84899f..1e5bf9ad0 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -21,7 +21,9 @@ use crate::error::UserError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; -use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; +use crate::{ + distance_between_two_points, CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result, +}; #[derive(Debug, Clone, PartialEq)] pub enum Operator { @@ -198,10 +200,10 @@ impl FilterCondition { ErrorVariant::CustomError { message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), }, - // we want to point to the last parameters and if there was no parameters we + // we want to point to the last parameters and if there was no parameters we // point to the parenthesis - parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), - )))?; + parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), + )))?; } let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); if let Some(span) = (!(-181.0..181.).contains(&lat.0)) @@ -505,19 +507,18 @@ impl FilterCondition { LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), GeoLowerThan(base_point, distance) => { - let mut result = RoaringBitmap::new(); let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, - None => return Ok(result), + None => return Ok(RoaringBitmap::new()), }; - rtree + let result = rtree .nearest_neighbor_iter(base_point) .take_while(|point| { - dbg!(crate::distance_between_two_points(base_point, point.geom())) - < *distance + distance_between_two_points(base_point, point.geom()) < *distance }) - .for_each(|point| drop(result.insert(point.data))); + .map(|point| point.data) + .collect(); return Ok(result); } @@ -600,14 +601,15 @@ fn field_id( let key = items.next().unwrap(); if key.as_rule() == Rule::reserved { return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression. Available filterable attributes are: {}", + ErrorVariant::CustomError { + message: format!( + "`{}` is a reserved keyword and therefore can't be used as a filter expression. \ + Available filterable attributes are: {}", key.as_str(), filterable_fields.iter().join(", "), - ), - }, - key.as_span(), + ), + }, + key.as_span(), )); } @@ -691,7 +693,7 @@ mod tests { assert!(result.is_err()); let error = result.unwrap_err(); assert!(error.to_string().contains( - "`_geo` is a reserved keyword and thus can't be used as a filter expression." + "`_geo` is a reserved keyword and therefore can't be used as a filter expression." )); } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 639770bd6..b49cdc3cd 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -383,15 +383,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; - let points_to_remove: Vec<_> = rtree + let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree .iter() .filter(|&point| self.documents_ids.contains(point.data)) .cloned() - .collect(); + .map(|point| (point, point.data)) + .unzip(); points_to_remove.iter().for_each(|point| { rtree.remove(&point); - geo_faceted_doc_ids.remove(point.data); }); + geo_faceted_doc_ids -= docids_to_remove; self.index.put_geo_rtree(self.wtxn, &rtree)?; self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 1af22d010..a36b608ee 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -22,11 +22,10 @@ pub fn extract_geo_points( while let Some((docid_bytes, value)) = obkv_documents.next()? { let obkv = obkv::KvReader::new(value); - let point = match obkv.get(geo_field_id) { - Some(point) => point, + let point: Value = match obkv.get(geo_field_id) { + Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?, None => continue, }; - let point: Value = serde_json::from_slice(point).map_err(InternalError::SerdeJson)?; if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) { // this will create an array of 16 bytes (two 8 bytes floats) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 36e3c870f..47a62be67 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -189,12 +189,9 @@ fn extract_documents_data( let documents_chunk_cloned = documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { - let _ = match extract_geo_points( - documents_chunk_cloned, - indexer, - primary_key_id, - geo_field_id, - ) { + let result = + extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_field_id); + let _ = match result { Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), Err(error) => lmdb_writer_sx_cloned.send(Err(error)), }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d3b8e47b0..336165894 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -237,12 +237,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; // get the fid of the `_geo` field. - let geo_field_id = if let Some(gfid) = self.index.fields_ids_map(self.wtxn)?.id("_geo") { - (self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid) - || self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid)) - .then(|| gfid) - } else { - None + let geo_field_id = match self.index.fields_ids_map(self.wtxn)?.id("_geo") { + Some(gfid) => { + let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid); + let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid); + if is_sortable || is_filterable { + Some(gfid) + } else { + None + } + } + None => None, }; let stop_words = self.index.stop_words(self.wtxn)?; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 5c27c195f..b17f28b66 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::convert::TryInto; use std::fs::File; use heed::types::ByteSlice; @@ -11,7 +12,7 @@ use super::helpers::{ }; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::into_clonable_grenad; -use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, GeoPoint, Index, Result}; +use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result}; pub(crate) enum TypedChunk { DocidWordPositions(grenad::Reader), @@ -180,24 +181,22 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::GeoPoints(mut geo_points) => { - // TODO: we should create the rtree with the `RTree::bulk_load` function let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); - let mut doc_ids = index.geo_faceted_documents_ids(wtxn)?; + let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; while let Some((key, value)) = geo_points.next()? { // convert the key back to a u32 (4 bytes) - let (key, _) = helpers::try_split_array_at::(key).unwrap(); - let key = u32::from_be_bytes(key); + let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); // convert the latitude and longitude back to a f64 (8 bytes) let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; - rtree.insert(GeoPoint::new(point, key)); - doc_ids.insert(key); + rtree.insert(GeoPoint::new(point, docid)); + geo_faceted_docids.insert(docid); } index.put_geo_rtree(wtxn, &rtree)?; - index.put_geo_faceted_documents_ids(wtxn, &doc_ids)?; + index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } } From 3fc145c25447f44c4638c127937594bf0f406922 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 9 Sep 2021 15:19:47 +0200 Subject: [PATCH 36/42] if we have no rtree we return all other provided documents --- milli/src/lib.rs | 5 +++-- milli/src/search/criteria/geo.rs | 14 +++++++------- milli/src/search/criteria/mod.rs | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 4f066365c..fc27b9d72 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -142,8 +142,9 @@ where Some((head, tail)) } -/// Return the distance between two points in meters. -fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { +/// Return the distance between two points in meters. Each points are composed of two f64, +/// one latitude and one longitude. +pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { let a = haversine::Location { latitude: a[0], longitude: a[1] }; let b = haversine::Location { latitude: b[0], longitude: b[1] }; diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index 78f741b57..c9dff307b 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -45,11 +45,7 @@ impl<'t> Geo<'t> { impl Criterion for Geo<'_> { fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // if there is no rtree we have nothing to returns - let rtree = match self.rtree.as_ref() { - Some(rtree) => rtree, - None => return Ok(None), - }; + let rtree = self.rtree.as_ref(); loop { match self.candidates.next() { @@ -92,8 +88,12 @@ impl Criterion for Geo<'_> { continue; } self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = - geo_point(rtree, self.allowed_candidates.clone(), self.point); + self.candidates = match rtree { + Some(rtree) => { + geo_point(rtree, self.allowed_candidates.clone(), self.point) + } + None => Box::new(std::iter::empty()), + }; } None => return Ok(None), }, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 105a69194..3c9485012 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -312,7 +312,7 @@ impl<'t> CriteriaBuilder<'t> { )?), AscDescName::Desc(Member::Geo(_point)) => { return Err(UserError::InvalidSortName { - name: "Sorting in descending order is currently not supported for the geosearch".to_string(), + name: "sorting in descending order is not supported for the geosearch".to_string(), })? } }; From cfc62a1c15c9dc2c0d2d10697430e3ae666a4f19 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 9 Sep 2021 18:11:38 +0200 Subject: [PATCH 37/42] use geoutils instead of haversine --- milli/Cargo.toml | 2 +- milli/src/lib.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index be507332e..171a7ec4c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -16,7 +16,7 @@ flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" grenad = { version = "0.3.1", default-features = false, features = ["tempfile"] } -haversine = "0.2.1" +geoutils = "0.4.1" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index fc27b9d72..7c9f56665 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -145,10 +145,10 @@ where /// Return the distance between two points in meters. Each points are composed of two f64, /// one latitude and one longitude. pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { - let a = haversine::Location { latitude: a[0], longitude: a[1] }; - let b = haversine::Location { latitude: b[0], longitude: b[1] }; + let a = geoutils::Location::new(a[0], a[1]); + let b = geoutils::Location::new(b[0], b[1]); - haversine::distance(a, b, haversine::Units::Kilometers) * 1000. + a.haversine_distance_to(&b).meters() } #[cfg(test)] From 91ce4d1721765cb5862f6cf531fea0d2021021f0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 Sep 2021 12:42:06 +0200 Subject: [PATCH 38/42] Stop iterating through the whole list of points We stop when there is no possible candidates left --- milli/src/search/criteria/geo.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index c9dff307b..9629a4a15 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -104,14 +104,18 @@ impl Criterion for Geo<'_> { fn geo_point( rtree: &RTree, - candidates: RoaringBitmap, + mut candidates: RoaringBitmap, point: [f64; 2], ) -> Box> { - let results = rtree - .nearest_neighbor_iter(&point) - .filter_map(move |point| candidates.contains(point.data).then(|| point.data)) - .map(|id| iter::once(id).collect::()) - .collect::>(); + let mut results = Vec::new(); + for point in rtree.nearest_neighbor_iter(&point) { + if candidates.remove(point.data) { + results.push(std::iter::once(point.data).collect()); + if candidates.is_empty() { + break; + } + } + } Box::new(results.into_iter()) } From c695a1ffd2306e663cf027d1727dca4e8c0c1503 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 Sep 2021 14:27:14 +0200 Subject: [PATCH 39/42] add the possibility to sort by descending order on geoPoint --- milli/src/search/criteria/geo.rs | 39 ++++++++++++++++++++++++++++---- milli/src/search/criteria/mod.rs | 15 ++++++------ 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index 9629a4a15..de6de8912 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -10,6 +10,7 @@ use crate::{GeoPoint, Index, Result}; pub struct Geo<'t> { index: &'t Index, rtxn: &'t heed::RoTxn<'t>, + ascending: bool, parent: Box, candidates: Box>, allowed_candidates: RoaringBitmap, @@ -19,11 +20,30 @@ pub struct Geo<'t> { } impl<'t> Geo<'t> { - pub fn new( + pub fn asc( index: &'t Index, rtxn: &'t heed::RoTxn<'t>, parent: Box, point: [f64; 2], + ) -> Result { + Self::new(index, rtxn, parent, point, true) + } + + pub fn desc( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + point: [f64; 2], + ) -> Result { + Self::new(index, rtxn, parent, point, false) + } + + fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + point: [f64; 2], + ascending: bool, ) -> Result { let candidates = Box::new(iter::empty()); let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?; @@ -33,6 +53,7 @@ impl<'t> Geo<'t> { Ok(Self { index, rtxn, + ascending, parent, candidates, allowed_candidates, @@ -89,9 +110,12 @@ impl Criterion for Geo<'_> { } self.allowed_candidates = &candidates - params.excluded_candidates; self.candidates = match rtree { - Some(rtree) => { - geo_point(rtree, self.allowed_candidates.clone(), self.point) - } + Some(rtree) => geo_point( + rtree, + self.allowed_candidates.clone(), + self.point, + self.ascending, + ), None => Box::new(std::iter::empty()), }; } @@ -106,6 +130,7 @@ fn geo_point( rtree: &RTree, mut candidates: RoaringBitmap, point: [f64; 2], + ascending: bool, ) -> Box> { let mut results = Vec::new(); for point in rtree.nearest_neighbor_iter(&point) { @@ -117,5 +142,9 @@ fn geo_point( } } - Box::new(results.into_iter()) + if ascending { + Box::new(results.into_iter()) + } else { + Box::new(results.into_iter().rev()) + } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3c9485012..c2de55de5 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -15,7 +15,7 @@ use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::criterion::{AscDesc as AscDescName, Member}; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; -use crate::{DocumentId, FieldId, Index, Result, TreeLevel, UserError}; +use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; mod asc_desc; mod attribute; @@ -304,17 +304,18 @@ impl<'t> CriteriaBuilder<'t> { criterion, field.to_string(), )?), - AscDescName::Asc(Member::Geo(point)) => Box::new(Geo::new( + AscDescName::Asc(Member::Geo(point)) => Box::new(Geo::asc( + &self.index, + &self.rtxn, + criterion, + point.clone(), + )?), + AscDescName::Desc(Member::Geo(point)) => Box::new(Geo::desc( &self.index, &self.rtxn, criterion, point.clone(), )?), - AscDescName::Desc(Member::Geo(_point)) => { - return Err(UserError::InvalidSortName { - name: "sorting in descending order is not supported for the geosearch".to_string(), - })? - } }; } criterion From 3b7a2cdbced36fe15fa7e19e7efd6e4687f3ea81 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 20 Sep 2021 16:10:39 +0200 Subject: [PATCH 40/42] fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 4ab19f175..f7603148d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -344,7 +344,7 @@ impl Index { self.main.delete::<_, Str>(wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY) } - /// Retrieve all the documents ids that faceted with a _geo field. + /// Retrieve all the documents ids that are faceted with a _geo field. pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { match self .main From f4b8e5675d903161b262910126cbf6ebf461f89a Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 20 Sep 2021 17:21:02 +0200 Subject: [PATCH 41/42] move the reserved keyword logic for the criterion and sort + add test --- milli/src/criterion.rs | 73 +++++++++++++++++++++++++++++++++++++----- milli/src/error.rs | 2 +- 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index b95080b4b..eb9a6c86a 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -50,9 +50,6 @@ impl FromStr for Criterion { "sort" => Ok(Criterion::Sort), "exactness" => Ok(Criterion::Exactness), text => match AscDesc::from_str(text) { - Ok(AscDesc::Asc(Member::Field(field))) if is_reserved_keyword(&field) => { - Err(UserError::InvalidReservedRankingRuleName { name: text.to_string() })? - } Ok(AscDesc::Asc(Member::Field(field))) => Ok(Criterion::Asc(field)), Ok(AscDesc::Desc(Member::Field(field))) => Ok(Criterion::Desc(field)), Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { @@ -79,11 +76,8 @@ impl FromStr for Member { type Err = UserError; fn from_str(text: &str) -> Result { - if text.starts_with("_geoPoint(") { - let point = - text.strip_prefix("_geoPoint(") - .and_then(|point| point.strip_suffix(")")) - .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() })?; + if let Some(point) = text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) + { let (lat, long) = point .split_once(',') .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() }) @@ -95,6 +89,9 @@ impl FromStr for Member { })?; Ok(Member::Geo([lat, long])) } else { + if is_reserved_keyword(text) { + return Err(UserError::InvalidReservedRankingRuleName { name: text.to_string() })?; + } Ok(Member::Field(text.to_string())) } } @@ -185,3 +182,63 @@ impl fmt::Display for Criterion { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_asc_desc() { + use big_s::S; + use AscDesc::*; + use Member::*; + + let valid_req = [ + ("truc:asc", Asc(Field(S("truc")))), + ("bidule:desc", Desc(Field(S("bidule")))), + ("a-b:desc", Desc(Field(S("a-b")))), + ("a:b:desc", Desc(Field(S("a:b")))), + ("a12:asc", Asc(Field(S("a12")))), + ("42:asc", Asc(Field(S("42")))), + ("_geoPoint(42, 59):asc", Asc(Geo([42., 59.]))), + ("_geoPoint(42.459, 59):desc", Desc(Geo([42.459, 59.]))), + ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))), + ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))), + ("_geoPoint(42.0002, 59.895):desc", Desc(Geo([42.0002, 59.895]))), + ("_geoPoint(42., 59.):desc", Desc(Geo([42., 59.]))), + ("truc(12, 13):desc", Desc(Field(S("truc(12, 13)")))), + ]; + + for (req, expected) in valid_req { + let res = req.parse(); + assert!(res.is_ok(), "Failed to parse `{}`, was expecting `{:?}`", req, expected); + assert_eq!(expected, res.unwrap()); + } + + let invalid_req = [ + "truc:machin", + "truc:deesc", + "truc:asc:deesc", + "42desc", + "_geoPoint:asc", + "_geoDistance:asc", + "_geoPoint(42.12 , 59.598)", + "_geoPoint(42.12 , 59.598):deesc", + "_geoPoint(42.12 , 59.598):machin", + "_geoPoint(42.12 , 59.598):asc:aasc", + "_geoPoint(42,12 , 59,598):desc", + "_geoPoint(35, 85, 75):asc", + "_geoPoint(18):asc", + ]; + + for req in invalid_req { + let res = req.parse::(); + assert!( + res.is_err(), + "Should no be able to parse `{}`, was expecting an error but instead got: `{:?}`", + req, + res, + ); + } + } +} diff --git a/milli/src/error.rs b/milli/src/error.rs index 21b77b5a7..e6bd3fd62 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -13,7 +13,7 @@ use crate::{DocumentId, FieldId}; pub type Object = Map; pub fn is_reserved_keyword(keyword: &str) -> bool { - ["_geo", "_geoDistance"].contains(&keyword) + ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) } #[derive(Debug)] From 0d104a0fceab3555c5e2330ab4992d97f74d225d Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 20 Sep 2021 18:08:22 +0200 Subject: [PATCH 42/42] Update milli/src/criterion.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/criterion.rs | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index eb9a6c86a..24879cdd4 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -76,23 +76,29 @@ impl FromStr for Member { type Err = UserError; fn from_str(text: &str) -> Result { - if let Some(point) = text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) - { - let (lat, long) = point - .split_once(',') - .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() }) - .and_then(|(lat, long)| { - lat.trim() - .parse() - .and_then(|lat| long.trim().parse().map(|long| (lat, long))) - .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() }) - })?; - Ok(Member::Geo([lat, long])) - } else { - if is_reserved_keyword(text) { - return Err(UserError::InvalidReservedRankingRuleName { name: text.to_string() })?; + match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) { + Some(point) => { + let (lat, long) = point + .split_once(',') + .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() }) + .and_then(|(lat, long)| { + lat.trim() + .parse() + .and_then(|lat| long.trim().parse().map(|long| (lat, long))) + .map_err(|_| UserError::InvalidRankingRuleName { + name: text.to_string(), + }) + })?; + Ok(Member::Geo([lat, long])) + } + None => { + if is_reserved_keyword(text) { + return Err(UserError::InvalidReservedRankingRuleName { + name: text.to_string(), + })?; + } + Ok(Member::Field(text.to_string())) } - Ok(Member::Field(text.to_string())) } } }