4642: Index the _geo fields when changing the setting while there is already documents in the DB r=ManyTheFish a=irevoire

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/4640
Fixes https://github.com/meilisearch/meilisearch/issues/4628

## What does this PR do?
- Add an integration test that first indexes the document and then changes the settings
- Fix `extract_geo_point` by detecting if the `_geo` field has been faceted in this setting change and index all documents

Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-05-21 13:16:11 +00:00 committed by GitHub
commit 9066a446a3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 157 additions and 85 deletions

View File

@ -117,3 +117,69 @@ async fn geo_bounding_box_with_string_and_number() {
) )
.await; .await;
} }
#[actix_rt::test]
async fn bug_4640() {
// https://github.com/meilisearch/meilisearch/issues/4640
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, None).await;
index.update_settings_filterable_attributes(json!(["_geo"])).await;
let (ret, _code) = index.update_settings_sortable_attributes(json!(["_geo"])).await;
index.wait_task(ret.uid()).await;
// Sort the document with the second one first
index
.search(
json!({
"sort": ["_geoPoint(45.4777599, 9.1967508):asc"],
}),
|response, code| {
assert_eq!(code, 200, "{}", response);
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
{
"hits": [
{
"id": 2,
"name": "La Bella Italia",
"address": "456 Elm Street, Townsville",
"type": "Italian",
"rating": 9,
"_geo": {
"lat": "45.4777599",
"lng": "9.1967508"
}
},
{
"id": 1,
"name": "Taco Truck",
"address": "444 Salsa Street, Burritoville",
"type": "Mexican",
"rating": 9,
"_geo": {
"lat": 34.0522,
"lng": -118.2437
},
"_geoDistance": 9714063
},
{
"id": 3,
"name": "Crêpe Truck",
"address": "2 Billig Avenue, Rouenville",
"type": "French",
"rating": 10
}
],
"query": "",
"processingTimeMs": "[time]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 3
}
"###);
},
)
.await;
}

View File

@ -45,7 +45,6 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
geo_fields_ids: Option<(FieldId, FieldId)>,
) -> Result<ExtractedFacetValues> { ) -> Result<ExtractedFacetValues> {
puffin::profile_function!(); puffin::profile_function!();
@ -127,12 +126,18 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
add_exists.insert(document); add_exists.insert(document);
} }
let geo_support = let del_geo_support = settings_diff
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); .old
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let add_geo_support = settings_diff
.new
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values = let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, geo_support)); del_value.map(|value| extract_facet_values(&value, del_geo_support));
let add_filterable_values = let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, geo_support)); add_value.map(|value| extract_facet_values(&value, add_geo_support));
// Those closures are just here to simplify things a bit. // Those closures are just here to simplify things a bit.
let mut insert_numbers_diff = |del_numbers, add_numbers| { let mut insert_numbers_diff = |del_numbers, add_numbers| {

View File

@ -8,6 +8,7 @@ use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::GeoError; use crate::error::GeoError;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::extract_finite_float_from_value; use crate::update::index_documents::extract_finite_float_from_value;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::{FieldId, InternalError, Result}; use crate::{FieldId, InternalError, Result};
/// Extracts the geographical coordinates contained in each document under the `_geo` field. /// Extracts the geographical coordinates contained in each document under the `_geo` field.
@ -18,7 +19,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
primary_key_id: FieldId, primary_key_id: FieldId,
(lat_fid, lng_fid): (FieldId, FieldId), settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> { ) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!(); puffin::profile_function!();
@ -40,23 +41,12 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
serde_json::from_slice(document_id).unwrap() serde_json::from_slice(document_id).unwrap()
}; };
// first we get the two fields // extract old version
match (obkv.get(lat_fid), obkv.get(lng_fid)) { let del_lat_lng =
(Some(lat), Some(lng)) => { extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
let deladd_lat_obkv = KvReaderDelAdd::new(lat); // extract new version
let deladd_lng_obkv = KvReaderDelAdd::new(lng); let add_lat_lng =
extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
// then we extract the values
let del_lat_lng = deladd_lat_obkv
.get(DelAdd::Deletion)
.zip(deladd_lng_obkv.get(DelAdd::Deletion))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
let add_lat_lng = deladd_lat_obkv
.get(DelAdd::Addition)
.zip(deladd_lng_obkv.get(DelAdd::Addition))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
if del_lat_lng != add_lat_lng { if del_lat_lng != add_lat_lng {
let mut obkv = KvWriterDelAdd::memory(); let mut obkv = KvWriterDelAdd::memory();
@ -74,21 +64,31 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
writer.insert(docid_bytes, bytes)?; writer.insert(docid_bytes, bytes)?;
} }
} }
(None, Some(_)) => {
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
}
(Some(_), None) => {
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => (),
}
}
writer_into_reader(writer) writer_into_reader(writer)
} }
/// Extract the finite floats lat and lng from two bytes slices. /// Extract the finite floats lat and lng from two bytes slices.
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> { fn extract_lat_lng(
document: &obkv::KvReader<FieldId>,
settings: &InnerIndexSettings,
deladd: DelAdd,
document_id: impl Fn() -> Value,
) -> Result<Option<[f64; 2]>> {
match settings.geo_fields_ids {
Some((lat_fid, lng_fid)) => {
let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
let (lat, lng) = match (lat, lng) {
(Some(lat), Some(lng)) => (lat, lng),
(Some(_), None) => {
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
}
(None, Some(_)) => {
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => return Ok(None),
};
let lat = extract_finite_float_from_value( let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
) )
@ -98,6 +98,8 @@ fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> R
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
) )
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
Ok(Some([lat, lng]))
Ok([lat, lng]) }
None => Ok(None),
}
} }

View File

@ -43,7 +43,6 @@ pub(crate) fn data_from_obkv_documents(
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId, primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<()> { ) -> Result<()> {
@ -72,7 +71,6 @@ pub(crate) fn data_from_obkv_documents(
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
primary_key_id, primary_key_id,
geo_fields_ids,
settings_diff.clone(), settings_diff.clone(),
max_positions_per_attributes, max_positions_per_attributes,
) )
@ -300,7 +298,6 @@ fn send_and_extract_flattened_documents_data(
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId, primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<( ) -> Result<(
@ -310,12 +307,13 @@ fn send_and_extract_flattened_documents_data(
let flattened_documents_chunk = let flattened_documents_chunk =
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
if let Some(geo_fields_ids) = geo_fields_ids { if settings_diff.run_geo_indexing() {
let documents_chunk_cloned = flattened_documents_chunk.clone(); let documents_chunk_cloned = flattened_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let settings_diff = settings_diff.clone();
rayon::spawn(move || { rayon::spawn(move || {
let result = let result =
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids); extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
let _ = match result { let _ = match result {
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
Err(error) => lmdb_writer_sx_cloned.send(Err(error)), Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
@ -354,7 +352,6 @@ fn send_and_extract_flattened_documents_data(
flattened_documents_chunk.clone(), flattened_documents_chunk.clone(),
indexer, indexer,
&settings_diff, &settings_diff,
geo_fields_ids,
)?; )?;
// send fid_docid_facet_numbers_chunk to DB writer // send fid_docid_facet_numbers_chunk to DB writer

View File

@ -324,28 +324,6 @@ where
// get the primary key field id // get the primary key field id
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap(); let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
// get the fid of the `_geo.lat` and `_geo.lng` fields.
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
// self.index.fields_ids_map($a)? ==>> field_id_map
let geo_fields_ids = match field_id_map.id("_geo") {
Some(gfid) => {
let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = field_id_map
.insert("_geo.lat")
.zip(field_id_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
let pool_params = GrenadParameters { let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type, chunk_compression_type: self.indexer_config.chunk_compression_type,
chunk_compression_level: self.indexer_config.chunk_compression_level, chunk_compression_level: self.indexer_config.chunk_compression_level,
@ -412,7 +390,6 @@ where
pool_params, pool_params,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
primary_key_id, primary_key_id,
geo_fields_ids,
settings_diff.clone(), settings_diff.clone(),
max_positions_per_attributes, max_positions_per_attributes,
) )

View File

@ -1161,6 +1161,11 @@ impl InnerIndexSettingsDiff {
pub fn settings_update_only(&self) -> bool { pub fn settings_update_only(&self) -> bool {
self.settings_update_only self.settings_update_only
} }
pub fn run_geo_indexing(&self) -> bool {
self.old.geo_fields_ids != self.new.geo_fields_ids
|| (!self.settings_update_only && self.new.geo_fields_ids.is_some())
}
} }
#[derive(Clone)] #[derive(Clone)]
@ -1177,6 +1182,7 @@ pub(crate) struct InnerIndexSettings {
pub proximity_precision: ProximityPrecision, pub proximity_precision: ProximityPrecision,
pub embedding_configs: EmbeddingConfigs, pub embedding_configs: EmbeddingConfigs,
pub existing_fields: HashSet<String>, pub existing_fields: HashSet<String>,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
} }
impl InnerIndexSettings { impl InnerIndexSettings {
@ -1185,7 +1191,7 @@ impl InnerIndexSettings {
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
let allowed_separators = index.allowed_separators(rtxn)?; let allowed_separators = index.allowed_separators(rtxn)?;
let dictionary = index.dictionary(rtxn)?; let dictionary = index.dictionary(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?; let mut fields_ids_map = index.fields_ids_map(rtxn)?;
let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?; let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
let user_defined_searchable_fields = let user_defined_searchable_fields =
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
@ -1200,6 +1206,24 @@ impl InnerIndexSettings {
.into_iter() .into_iter()
.filter_map(|(field, count)| (count != 0).then_some(field)) .filter_map(|(field, count)| (count != 0).then_some(field))
.collect(); .collect();
// index.fields_ids_map($a)? ==>> fields_ids_map
let geo_fields_ids = match fields_ids_map.id("_geo") {
Some(gfid) => {
let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid);
let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = fields_ids_map
.insert("_geo.lat")
.zip(fields_ids_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
Ok(Self { Ok(Self {
stop_words, stop_words,
@ -1214,6 +1238,7 @@ impl InnerIndexSettings {
proximity_precision, proximity_precision,
embedding_configs, embedding_configs,
existing_fields, existing_fields,
geo_fields_ids,
}) })
} }