3986: Fix geo bounding box with strings r=ManyTheFish a=irevoire

# Pull Request

When sending a document with one geofield of type string (i.e.: `{ "_geo": { "lat": 12, "lng": "13" }}`), the geobounding box would exclude this document.

This PR fixes this issue by automatically parsing the string value in case we're working on a geofield.

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/3973

## What does this PR do?
- Automatically parse the facet value iif we're working on a geofield.
- Make insta works with snapshots in loops or closure executed multiple times. (you may need to update your cli if it panics after this PR: `cargo install cargo-insta`).
- Add one integration test in milli and in meilisearch to ensure it works forever.
- Add three snapshots for the dump that mysteriously disappeared I don't know how


Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2023-08-09 07:58:15 +00:00 committed by GitHub
commit 44c1900f36
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 191 additions and 11 deletions

View File

@ -0,0 +1,24 @@
---
source: dump/src/reader/mod.rs
expression: spells.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness"
],
"stopWords": [],
"synonyms": {},
"distinctAttribute": null
}

View File

@ -0,0 +1,38 @@
---
source: dump/src/reader/mod.rs
expression: products.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness"
],
"stopWords": [],
"synonyms": {
"android": [
"phone",
"smartphone"
],
"iphone": [
"phone",
"smartphone"
],
"phone": [
"android",
"iphone",
"smartphone"
]
},
"distinctAttribute": null
}

View File

@ -0,0 +1,31 @@
---
source: dump/src/reader/mod.rs
expression: movies.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [
"genres",
"id"
],
"sortableAttributes": [
"genres",
"id"
],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness",
"release_date:asc"
],
"stopWords": [],
"synonyms": {},
"distinctAttribute": null
}

View File

@ -167,7 +167,9 @@ macro_rules! snapshot {
let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, Some(&snap_name));
settings.bind(|| {
let snap = format!("{}", $value);
meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
insta::allow_duplicates! {
meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
}
});
};
($value:expr, @$inline:literal) => {
@ -176,7 +178,9 @@ macro_rules! snapshot {
let (settings, _, _) = $crate::default_snapshot_settings_for_test("", Some("_dummy_argument"));
settings.bind(|| {
let snap = format!("{}", $value);
meili_snap::insta::assert_snapshot!(snap, @$inline);
insta::allow_duplicates! {
meili_snap::insta::assert_snapshot!(snap, @$inline);
}
});
};
($value:expr) => {
@ -194,7 +198,9 @@ macro_rules! snapshot {
let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, None);
settings.bind(|| {
let snap = format!("{}", $value);
meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
insta::allow_duplicates! {
meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
}
});
};
}

View File

@ -1,3 +1,4 @@
use meili_snap::{json_string, snapshot};
use once_cell::sync::Lazy;
use serde_json::{json, Value};
@ -60,3 +61,59 @@ async fn geo_sort_with_geo_strings() {
)
.await;
}
#[actix_rt::test]
async fn geo_bounding_box_with_string_and_number() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.update_settings_filterable_attributes(json!(["_geo"])).await;
index.update_settings_sortable_attributes(json!(["_geo"])).await;
index.add_documents(documents, None).await;
index.wait_task(2).await;
index
.search(
json!({
"filter": "_geoBoundingBox([89, 179], [-89, -179])",
}),
|response, code| {
assert_eq!(code, 200, "{}", response);
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
{
"hits": [
{
"id": 1,
"name": "Taco Truck",
"address": "444 Salsa Street, Burritoville",
"type": "Mexican",
"rating": 9,
"_geo": {
"lat": 34.0522,
"lng": -118.2437
}
},
{
"id": 2,
"name": "La Bella Italia",
"address": "456 Elm Street, Townsville",
"type": "Italian",
"rating": 9,
"_geo": {
"lat": "45.4777599",
"lng": "9.1967508"
}
}
],
"query": "",
"processingTimeMs": "[time]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 2
}
"###);
},
)
.await;
}

View File

@ -1718,11 +1718,11 @@ pub(crate) mod tests {
.unwrap();
index
.add_documents(documents!([
{ "id": 0, "_geo": { "lat": 0, "lng": 0 } },
{ "id": 1, "_geo": { "lat": 0, "lng": -175 } },
{ "id": 2, "_geo": { "lat": 0, "lng": 175 } },
{ "id": 0, "_geo": { "lat": "0", "lng": "0" } },
{ "id": 1, "_geo": { "lat": 0, "lng": "-175" } },
{ "id": 2, "_geo": { "lat": "0", "lng": 175 } },
{ "id": 3, "_geo": { "lat": 85, "lng": 0 } },
{ "id": 4, "_geo": { "lat": -85, "lng": 0 } },
{ "id": 4, "_geo": { "lat": "-85", "lng": "0" } },
]))
.unwrap();

View File

@ -28,11 +28,13 @@ pub struct ExtractedFacetValues {
///
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
/// and the normalized value as value extracted from the given chunk of documents.
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
#[logging_timer::time]
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
faceted_fields: &HashSet<FieldId>,
geo_fields_ids: Option<(FieldId, FieldId)>,
) -> Result<ExtractedFacetValues> {
let max_memory = indexer.max_memory_by_thread();
@ -82,7 +84,10 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
match extract_facet_values(&value) {
match extract_facet_values(
&value,
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
) {
FilterableValues::Null => {
facet_is_null_docids.entry(field_id).or_default().insert(document);
}
@ -175,12 +180,13 @@ enum FilterableValues {
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
fn extract_facet_values(value: &Value) -> FilterableValues {
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
fn inner_extract_facet_values(
value: &Value,
can_recurse: bool,
output_numbers: &mut Vec<f64>,
output_strings: &mut Vec<(String, String)>,
geo_field: bool,
) {
match value {
Value::Null => (),
@ -191,13 +197,30 @@ fn extract_facet_values(value: &Value) -> FilterableValues {
}
}
Value::String(original) => {
// if we're working on a geofield it MUST be something we can parse or else there was an internal error
// in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
if geo_field {
if let Ok(float) = original.parse() {
output_numbers.push(float);
} else {
log::warn!(
"Internal error, could not parse a geofield that has been validated. Please open an issue."
)
}
}
let normalized = crate::normalize_facet(original);
output_strings.push((normalized, original.clone()));
}
Value::Array(values) => {
if can_recurse {
for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings);
inner_extract_facet_values(
value,
false,
output_numbers,
output_strings,
geo_field,
);
}
}
}
@ -213,7 +236,7 @@ fn extract_facet_values(value: &Value) -> FilterableValues {
otherwise => {
let mut numbers = Vec::new();
let mut strings = Vec::new();
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
FilterableValues::Values { numbers, strings }
}
}

View File

@ -366,6 +366,7 @@ fn send_and_extract_flattened_documents_data(
flattened_documents_chunk.clone(),
indexer,
faceted_fields,
geo_fields_ids,
)?;
// send docid_fid_facet_numbers_chunk to DB writer