mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 13:40:31 +01:00
Merge #514
514: Stop flattening every field r=Kerollmops a=irevoire When we need to flatten a document: * The primary key contains a `.`. * Some fields need to be flattened Instead of flattening the whole object and thus creating a lot of allocations with the `flatten_serde_json` crate, we instead generate a minimal sub-object containing only the fields that need to be flattened. That should create fewer allocations and thus index faster. --------- ``` group indexing_main_e1e362fa indexing_stop-flattening-every-field_40d1bd6b ----- ---------------------- --------------------------------------------- indexing/Indexing geo_point 1.99 23.7±0.23s ? ?/sec 1.00 11.9±0.21s ? ?/sec indexing/Indexing movies in three batches 1.00 18.2±0.24s ? ?/sec 1.01 18.3±0.29s ? ?/sec indexing/Indexing movies with default settings 1.00 17.5±0.09s ? ?/sec 1.01 17.7±0.26s ? ?/sec indexing/Indexing songs in three batches with default settings 1.00 64.8±0.47s ? ?/sec 1.00 65.1±0.49s ? ?/sec indexing/Indexing songs with default settings 1.00 54.9±0.99s ? ?/sec 1.01 55.7±1.34s ? ?/sec indexing/Indexing songs without any facets 1.00 50.6±0.62s ? ?/sec 1.01 50.9±1.05s ? ?/sec indexing/Indexing songs without faceted numbers 1.00 54.0±1.14s ? ?/sec 1.01 54.7±1.13s ? ?/sec indexing/Indexing wiki 1.00 996.2±8.54s ? ?/sec 1.02 1021.1±30.63s ? ?/sec indexing/Indexing wiki in three batches 1.00 1136.8±9.72s ? ?/sec 1.00 1138.6±6.59s ? ?/sec ``` So basically everything slowed down a liiiiiittle bit except the dataset with a nested field which got twice as fast Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
commit
2fdf520271
@ -1337,32 +1337,34 @@ mod tests {
|
|||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
let mut builder = update::Settings::new(&mut wtxn, &index, &config);
|
||||||
builder.set_primary_key("nested.id".to_owned());
|
builder.set_primary_key("complex.nested.id".to_owned());
|
||||||
builder.execute(|_| ()).unwrap();
|
builder.execute(|_| ()).unwrap();
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
let content = documents!([
|
let content = documents!([
|
||||||
{
|
{
|
||||||
"nested": {
|
"complex": {
|
||||||
"id": 0,
|
"nested": {
|
||||||
|
"id": 0,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"title": "The zeroth document",
|
"title": "The zeroth document",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"nested": {
|
"complex.nested": {
|
||||||
"id": 1,
|
"id": 1,
|
||||||
},
|
},
|
||||||
"title": "The first document",
|
"title": "The first document",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"nested": {
|
"complex": {
|
||||||
"id": 2,
|
"nested.id": 2,
|
||||||
},
|
},
|
||||||
"title": "The second document",
|
"title": "The second document",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"nested.id": 3,
|
"complex.nested.id": 3,
|
||||||
"title": "The third document",
|
"title": "The third document",
|
||||||
},
|
},
|
||||||
]);
|
]);
|
||||||
|
@ -340,35 +340,48 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// store the keys and values the original obkv + the flattened json
|
||||||
|
// We first extract all the key+value out of the obkv. If a value is not nested
|
||||||
|
// we keep a reference on its value. If the value is nested we'll get its value
|
||||||
|
// as an owned `Vec<u8>` after flattening it.
|
||||||
|
let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new();
|
||||||
|
|
||||||
|
// the object we're going to use to store the fields that need to be flattened.
|
||||||
let mut doc = serde_json::Map::new();
|
let mut doc = serde_json::Map::new();
|
||||||
|
|
||||||
for (k, v) in obkv.iter() {
|
// we recreate a json containing only the fields that needs to be flattened.
|
||||||
let key = self.fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId {
|
// all the raw values get inserted directly in the `key_value` vec.
|
||||||
field_id: k,
|
for (key, value) in obkv.iter() {
|
||||||
process: "Flatten from fields ids map.",
|
if json_depth_checker::should_flatten_from_unchecked_slice(value) {
|
||||||
})?;
|
let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||||
let value = serde_json::from_slice::<serde_json::Value>(v)
|
field_id: key,
|
||||||
.map_err(crate::error::InternalError::SerdeJson)?;
|
process: "Flatten from fields ids map.",
|
||||||
doc.insert(key.to_string(), value);
|
})?;
|
||||||
|
|
||||||
|
let value = serde_json::from_slice::<Value>(value)
|
||||||
|
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||||
|
doc.insert(key.to_string(), value);
|
||||||
|
} else {
|
||||||
|
key_value.push((key, value.into()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let flattened = flatten_serde_json::flatten(&doc);
|
let flattened = flatten_serde_json::flatten(&doc);
|
||||||
|
|
||||||
// Once we have the flattened version we can convert it back to obkv and
|
// Once we have the flattened version we insert all the new generated fields_ids
|
||||||
// insert all the new generated fields_ids (if any) in the fields ids map.
|
// (if any) in the fields ids map and serialize the value.
|
||||||
let mut buffer: Vec<u8> = Vec::new();
|
for (key, value) in flattened.into_iter() {
|
||||||
let mut writer = KvWriter::new(&mut buffer);
|
|
||||||
let mut flattened: Vec<_> = flattened.into_iter().collect();
|
|
||||||
// we reorder the field to get all the known field first
|
|
||||||
flattened
|
|
||||||
.sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX));
|
|
||||||
|
|
||||||
for (key, value) in flattened {
|
|
||||||
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||||
writer.insert(fid, &value)?;
|
key_value.push((fid, value.into()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// we sort the key. If there was a conflict between the obkv and the new generated value the
|
||||||
|
// keys will be consecutive.
|
||||||
|
key_value.sort_unstable_by_key(|(key, _)| *key);
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
Self::create_obkv_from_key_value(&mut key_value, &mut buffer)?;
|
||||||
Ok(Some(buffer))
|
Ok(Some(buffer))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -380,41 +393,114 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
output_buffer: &mut Vec<u8>,
|
output_buffer: &mut Vec<u8>,
|
||||||
field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>,
|
field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
// store the keys and values of the json + the original obkv
|
||||||
|
let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new();
|
||||||
|
|
||||||
// if the primary_key is nested we need to flatten the document before being able to do anything
|
// if the primary_key is nested we need to flatten the document before being able to do anything
|
||||||
let mut doc = serde_json::Map::new();
|
let mut doc = serde_json::Map::new();
|
||||||
|
|
||||||
for (k, v) in obkv.iter() {
|
// we recreate a json containing only the fields that needs to be flattened.
|
||||||
let key =
|
// all the raw values get inserted directly in the `key_value` vec.
|
||||||
mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?;
|
for (key, value) in obkv.iter() {
|
||||||
let key = self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId {
|
if json_depth_checker::should_flatten_from_unchecked_slice(value) {
|
||||||
field_id: *key,
|
let key =
|
||||||
process: "Flatten from field mapping.",
|
mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?;
|
||||||
})?;
|
let key =
|
||||||
let value =
|
self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||||
serde_json::from_slice::<serde_json::Value>(v).map_err(InternalError::SerdeJson)?;
|
field_id: *key,
|
||||||
doc.insert(key.to_string(), value);
|
process: "Flatten from field mapping.",
|
||||||
|
})?;
|
||||||
|
let value = serde_json::from_slice::<serde_json::Value>(value)
|
||||||
|
.map_err(InternalError::SerdeJson)?;
|
||||||
|
doc.insert(key.to_string(), value);
|
||||||
|
} else {
|
||||||
|
key_value.push((key, value.into()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let flattened = flatten_serde_json::flatten(&doc);
|
let flattened = flatten_serde_json::flatten(&doc);
|
||||||
|
|
||||||
// Once we have the flattened version we can convert it back to obkv and
|
// Once we have the flattened version we insert all the new generated fields_ids
|
||||||
// insert all the new generated fields_ids (if any) in the fields ids map.
|
// (if any) in the fields ids map and serialize the value.
|
||||||
output_buffer.clear();
|
for (key, value) in flattened.into_iter() {
|
||||||
let mut writer = KvWriter::new(output_buffer);
|
|
||||||
let mut flattened: Vec<_> = flattened.into_iter().collect();
|
|
||||||
// we reorder the field to get all the known field first
|
|
||||||
flattened
|
|
||||||
.sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX));
|
|
||||||
|
|
||||||
for (key, value) in flattened {
|
|
||||||
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||||
writer.insert(fid, &value)?;
|
key_value.push((fid, value.clone().into()));
|
||||||
|
|
||||||
if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() {
|
if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() {
|
||||||
field_buffer_cache.push((fid, value.into()));
|
field_buffer_cache.push((fid, value.into()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// we sort the key. If there was a conflict between the obkv and the new generated value the
|
||||||
|
// keys will be consecutive.
|
||||||
|
key_value.sort_unstable_by_key(|(key, _)| *key);
|
||||||
|
|
||||||
|
Self::create_obkv_from_key_value(&mut key_value, output_buffer)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate an obkv from a slice of key / value sorted by key.
|
||||||
|
fn create_obkv_from_key_value(
|
||||||
|
key_value: &mut [(FieldId, Cow<[u8]>)],
|
||||||
|
output_buffer: &mut Vec<u8>,
|
||||||
|
) -> Result<()> {
|
||||||
|
debug_assert!(
|
||||||
|
key_value.windows(2).all(|vec| vec[0].0 <= vec[1].0),
|
||||||
|
"The slice of key / value pair must be sorted."
|
||||||
|
);
|
||||||
|
|
||||||
|
output_buffer.clear();
|
||||||
|
let mut writer = KvWriter::new(output_buffer);
|
||||||
|
|
||||||
|
let mut skip_next_value = false;
|
||||||
|
for things in key_value.windows(2) {
|
||||||
|
if skip_next_value {
|
||||||
|
skip_next_value = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let (key1, value1) = &things[0];
|
||||||
|
let (key2, value2) = &things[1];
|
||||||
|
|
||||||
|
// now we're going to look for conflicts between the keys. For example the following documents would cause a conflict:
|
||||||
|
// { "doggo.name": "jean", "doggo": { "name": "paul" } }
|
||||||
|
// we should find a first "doggo.name" from the obkv and a second one from the flattening.
|
||||||
|
// but we must generate the following document:
|
||||||
|
// { "doggo.name": ["jean", "paul"] }
|
||||||
|
// thus we're going to merge the value from the obkv and the flattened document in a single array and skip the next
|
||||||
|
// iteration.
|
||||||
|
if key1 == key2 {
|
||||||
|
skip_next_value = true;
|
||||||
|
|
||||||
|
let value1 = serde_json::from_slice(value1)
|
||||||
|
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||||
|
let value2 = serde_json::from_slice(value2)
|
||||||
|
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||||
|
let value = match (value1, value2) {
|
||||||
|
(Value::Array(mut left), Value::Array(mut right)) => {
|
||||||
|
left.append(&mut right);
|
||||||
|
Value::Array(left)
|
||||||
|
}
|
||||||
|
(Value::Array(mut array), value) | (value, Value::Array(mut array)) => {
|
||||||
|
array.push(value);
|
||||||
|
Value::Array(array)
|
||||||
|
}
|
||||||
|
(left, right) => Value::Array(vec![left, right]),
|
||||||
|
};
|
||||||
|
|
||||||
|
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||||
|
writer.insert(*key1, value)?;
|
||||||
|
} else {
|
||||||
|
writer.insert(*key1, value1)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !skip_next_value {
|
||||||
|
// the unwrap is safe here, we know there was at least one value in the document
|
||||||
|
let (key, value) = key_value.last().unwrap();
|
||||||
|
writer.insert(*key, value)?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user