mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Merge #514
514: Stop flattening every field r=Kerollmops a=irevoire When we need to flatten a document: * The primary key contains a `.`. * Some fields need to be flattened Instead of flattening the whole object and thus creating a lot of allocations with the `serde_json_flatten_crate`, we instead generate a minimal sub-object containing only the fields that need to be flattened. That should create fewer allocations and thus index faster. --------- ``` group indexing_main_e1e362fa indexing_stop-flattening-every-field_40d1bd6b ----- ---------------------- --------------------------------------------- indexing/Indexing geo_point 1.99 23.7±0.23s ? ?/sec 1.00 11.9±0.21s ? ?/sec indexing/Indexing movies in three batches 1.00 18.2±0.24s ? ?/sec 1.01 18.3±0.29s ? ?/sec indexing/Indexing movies with default settings 1.00 17.5±0.09s ? ?/sec 1.01 17.7±0.26s ? ?/sec indexing/Indexing songs in three batches with default settings 1.00 64.8±0.47s ? ?/sec 1.00 65.1±0.49s ? ?/sec indexing/Indexing songs with default settings 1.00 54.9±0.99s ? ?/sec 1.01 55.7±1.34s ? ?/sec indexing/Indexing songs without any facets 1.00 50.6±0.62s ? ?/sec 1.01 50.9±1.05s ? ?/sec indexing/Indexing songs without faceted numbers 1.00 54.0±1.14s ? ?/sec 1.01 54.7±1.13s ? ?/sec indexing/Indexing wiki 1.00 996.2±8.54s ? ?/sec 1.02 1021.1±30.63s ? ?/sec indexing/Indexing wiki in three batches 1.00 1136.8±9.72s ? ?/sec 1.00 1138.6±6.59s ? ?/sec ``` So basically everything slowed down a liiiiiittle bit except the dataset with a nested field which got twice faster Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
commit
2fdf520271
@ -1337,32 +1337,34 @@ mod tests {
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||
builder.set_primary_key("nested.id".to_owned());
|
||||
builder.set_primary_key("complex.nested.id".to_owned());
|
||||
builder.execute(|_| ()).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let content = documents!([
|
||||
{
|
||||
"nested": {
|
||||
"id": 0,
|
||||
"complex": {
|
||||
"nested": {
|
||||
"id": 0,
|
||||
},
|
||||
},
|
||||
"title": "The zeroth document",
|
||||
},
|
||||
{
|
||||
"nested": {
|
||||
"complex.nested": {
|
||||
"id": 1,
|
||||
},
|
||||
"title": "The first document",
|
||||
},
|
||||
{
|
||||
"nested": {
|
||||
"id": 2,
|
||||
"complex": {
|
||||
"nested.id": 2,
|
||||
},
|
||||
"title": "The second document",
|
||||
},
|
||||
{
|
||||
"nested.id": 3,
|
||||
"complex.nested.id": 3,
|
||||
"title": "The third document",
|
||||
},
|
||||
]);
|
||||
|
@ -340,35 +340,48 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// store the keys and values the original obkv + the flattened json
|
||||
// We first extract all the key+value out of the obkv. If a value is not nested
|
||||
// we keep a reference on its value. If the value is nested we'll get its value
|
||||
// as an owned `Vec<u8>` after flattening it.
|
||||
let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new();
|
||||
|
||||
// the object we're going to use to store the fields that need to be flattened.
|
||||
let mut doc = serde_json::Map::new();
|
||||
|
||||
for (k, v) in obkv.iter() {
|
||||
let key = self.fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: k,
|
||||
process: "Flatten from fields ids map.",
|
||||
})?;
|
||||
let value = serde_json::from_slice::<serde_json::Value>(v)
|
||||
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||
doc.insert(key.to_string(), value);
|
||||
// we recreate a json containing only the fields that needs to be flattened.
|
||||
// all the raw values get inserted directly in the `key_value` vec.
|
||||
for (key, value) in obkv.iter() {
|
||||
if json_depth_checker::should_flatten_from_unchecked_slice(value) {
|
||||
let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: key,
|
||||
process: "Flatten from fields ids map.",
|
||||
})?;
|
||||
|
||||
let value = serde_json::from_slice::<Value>(value)
|
||||
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||
doc.insert(key.to_string(), value);
|
||||
} else {
|
||||
key_value.push((key, value.into()));
|
||||
}
|
||||
}
|
||||
|
||||
let flattened = flatten_serde_json::flatten(&doc);
|
||||
|
||||
// Once we have the flattened version we can convert it back to obkv and
|
||||
// insert all the new generated fields_ids (if any) in the fields ids map.
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
let mut writer = KvWriter::new(&mut buffer);
|
||||
let mut flattened: Vec<_> = flattened.into_iter().collect();
|
||||
// we reorder the field to get all the known field first
|
||||
flattened
|
||||
.sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX));
|
||||
|
||||
for (key, value) in flattened {
|
||||
// Once we have the flattened version we insert all the new generated fields_ids
|
||||
// (if any) in the fields ids map and serialize the value.
|
||||
for (key, value) in flattened.into_iter() {
|
||||
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(fid, &value)?;
|
||||
key_value.push((fid, value.into()));
|
||||
}
|
||||
|
||||
// we sort the key. If there was a conflict between the obkv and the new generated value the
|
||||
// keys will be consecutive.
|
||||
key_value.sort_unstable_by_key(|(key, _)| *key);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
Self::create_obkv_from_key_value(&mut key_value, &mut buffer)?;
|
||||
Ok(Some(buffer))
|
||||
}
|
||||
|
||||
@ -380,41 +393,114 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
output_buffer: &mut Vec<u8>,
|
||||
field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>,
|
||||
) -> Result<()> {
|
||||
// store the keys and values of the json + the original obkv
|
||||
let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new();
|
||||
|
||||
// if the primary_key is nested we need to flatten the document before being able to do anything
|
||||
let mut doc = serde_json::Map::new();
|
||||
|
||||
for (k, v) in obkv.iter() {
|
||||
let key =
|
||||
mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?;
|
||||
let key = self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: *key,
|
||||
process: "Flatten from field mapping.",
|
||||
})?;
|
||||
let value =
|
||||
serde_json::from_slice::<serde_json::Value>(v).map_err(InternalError::SerdeJson)?;
|
||||
doc.insert(key.to_string(), value);
|
||||
// we recreate a json containing only the fields that needs to be flattened.
|
||||
// all the raw values get inserted directly in the `key_value` vec.
|
||||
for (key, value) in obkv.iter() {
|
||||
if json_depth_checker::should_flatten_from_unchecked_slice(value) {
|
||||
let key =
|
||||
mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?;
|
||||
let key =
|
||||
self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: *key,
|
||||
process: "Flatten from field mapping.",
|
||||
})?;
|
||||
let value = serde_json::from_slice::<serde_json::Value>(value)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
doc.insert(key.to_string(), value);
|
||||
} else {
|
||||
key_value.push((key, value.into()));
|
||||
}
|
||||
}
|
||||
|
||||
let flattened = flatten_serde_json::flatten(&doc);
|
||||
|
||||
// Once we have the flattened version we can convert it back to obkv and
|
||||
// insert all the new generated fields_ids (if any) in the fields ids map.
|
||||
output_buffer.clear();
|
||||
let mut writer = KvWriter::new(output_buffer);
|
||||
let mut flattened: Vec<_> = flattened.into_iter().collect();
|
||||
// we reorder the field to get all the known field first
|
||||
flattened
|
||||
.sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX));
|
||||
|
||||
for (key, value) in flattened {
|
||||
// Once we have the flattened version we insert all the new generated fields_ids
|
||||
// (if any) in the fields ids map and serialize the value.
|
||||
for (key, value) in flattened.into_iter() {
|
||||
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(fid, &value)?;
|
||||
key_value.push((fid, value.clone().into()));
|
||||
|
||||
if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() {
|
||||
field_buffer_cache.push((fid, value.into()));
|
||||
}
|
||||
}
|
||||
|
||||
// we sort the key. If there was a conflict between the obkv and the new generated value the
|
||||
// keys will be consecutive.
|
||||
key_value.sort_unstable_by_key(|(key, _)| *key);
|
||||
|
||||
Self::create_obkv_from_key_value(&mut key_value, output_buffer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generate an obkv from a slice of key / value sorted by key.
|
||||
fn create_obkv_from_key_value(
|
||||
key_value: &mut [(FieldId, Cow<[u8]>)],
|
||||
output_buffer: &mut Vec<u8>,
|
||||
) -> Result<()> {
|
||||
debug_assert!(
|
||||
key_value.windows(2).all(|vec| vec[0].0 <= vec[1].0),
|
||||
"The slice of key / value pair must be sorted."
|
||||
);
|
||||
|
||||
output_buffer.clear();
|
||||
let mut writer = KvWriter::new(output_buffer);
|
||||
|
||||
let mut skip_next_value = false;
|
||||
for things in key_value.windows(2) {
|
||||
if skip_next_value {
|
||||
skip_next_value = false;
|
||||
continue;
|
||||
}
|
||||
let (key1, value1) = &things[0];
|
||||
let (key2, value2) = &things[1];
|
||||
|
||||
// now we're going to look for conflicts between the keys. For example the following documents would cause a conflict:
|
||||
// { "doggo.name": "jean", "doggo": { "name": "paul" } }
|
||||
// we should find a first "doggo.name" from the obkv and a second one from the flattening.
|
||||
// but we must generate the following document:
|
||||
// { "doggo.name": ["jean", "paul"] }
|
||||
// thus we're going to merge the value from the obkv and the flattened document in a single array and skip the next
|
||||
// iteration.
|
||||
if key1 == key2 {
|
||||
skip_next_value = true;
|
||||
|
||||
let value1 = serde_json::from_slice(value1)
|
||||
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||
let value2 = serde_json::from_slice(value2)
|
||||
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||
let value = match (value1, value2) {
|
||||
(Value::Array(mut left), Value::Array(mut right)) => {
|
||||
left.append(&mut right);
|
||||
Value::Array(left)
|
||||
}
|
||||
(Value::Array(mut array), value) | (value, Value::Array(mut array)) => {
|
||||
array.push(value);
|
||||
Value::Array(array)
|
||||
}
|
||||
(left, right) => Value::Array(vec![left, right]),
|
||||
};
|
||||
|
||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(*key1, value)?;
|
||||
} else {
|
||||
writer.insert(*key1, value1)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !skip_next_value {
|
||||
// the unwrap is safe here, we know there was at least one value in the document
|
||||
let (key, value) = key_value.last().unwrap();
|
||||
writer.insert(*key, value)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user