move the flatten-serde-json crate inside of milli

This commit is contained in:
Tamo 2022-04-07 18:20:44 +02:00
parent ab458d8840
commit bab898ce86
No known key found for this signature in database
GPG Key ID: 20CD8020AFA88D69
8 changed files with 479 additions and 2 deletions

View File

@ -1,6 +1,6 @@
[workspace]
resolver = "2"
members = ["milli", "filter-parser", "http-ui", "benchmarks", "infos", "helpers", "cli"]
members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"]
default-members = ["milli"]
[profile.dev]

View File

@ -0,0 +1,15 @@
[package]
name = "flatten-serde-json"
version = "0.1.0"
edition = "2021"
description = "Flatten serde-json objects like elastic search"
readme = "README.md"
authors = ["Tamo <tamo@meilisearch.com>"]
repository = "https://github.com/irevoire/flatten-serde-json"
keywords = ["json", "flatten"]
categories = ["command-line-utilities"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
serde_json = "1.0"

View File

@ -0,0 +1,153 @@
# Flatten serde Json
This crate flattens [`serde_json`](https://docs.rs/serde_json/latest/serde_json/) `Object`s into a format
similar to [Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html).
## Examples
### There is nothing to do
```json
{
"id": "287947",
"title": "Shazam!",
"release_date": 1553299200,
"genres": [
"Action",
"Comedy",
"Fantasy"
]
}
```
Flattens to:
```json
{
"id": "287947",
"title": "Shazam!",
"release_date": 1553299200,
"genres": [
"Action",
"Comedy",
"Fantasy"
]
}
```
------------
### Objects
```json
{
"a": {
"b": "c",
"d": "e",
"f": "g"
}
}
```
Flattens to:
```json
{
"a.b": "c",
"a.d": "e",
"a.f": "g"
}
```
------------
### Array of objects
```json
{
"a": [
{ "b": "c" },
{ "b": "d" },
{ "b": "e" },
]
}
```
Flattens to:
```json
{
"a.b": ["c", "d", "e"],
}
```
------------
### Array of objects with normal value in the array
```json
{
"a": [
42,
{ "b": "c" },
{ "b": "d" },
{ "b": "e" },
]
}
```
Flattens to:
```json
{
"a": 42,
"a.b": ["c", "d", "e"],
}
```
------------
### Array of objects of array of objects of ...
```json
{
"a": [
"b",
["c", "d"],
{ "e": ["f", "g"] },
[
{ "h": "i" },
{ "e": ["j", { "z": "y" }] },
],
["l"],
"m",
]
}
```
Flattens to:
```json
{
"a": ["b", "c", "d", "l", "m"],
"a.e": ["f", "g", "j"],
"a.h": "i",
"a.e.z": "y",
}
```
------------
### Collision between a generated field name and an already existing field
```json
{
"a": {
"b": "c",
},
"a.b": "d",
}
```
Flattens to:
```json
{
"a.b": ["c", "d"],
}
```

View File

@ -0,0 +1,26 @@
[package]
name = "flatten_serde_json-fuzz"
version = "0.0.0"
authors = ["Automatically generated"]
publish = false
edition = "2018"
[package.metadata]
cargo-fuzz = true
[dependencies]
libfuzzer-sys = "0.4"
arbitrary-json = "0.1.1"
[dependencies.flatten-serde-json]
path = ".."
# Prevent this from interfering with workspaces
[workspace]
members = ["."]
[[bin]]
name = "flatten"
path = "fuzz_targets/flatten.rs"
test = false
doc = false

View File

@ -0,0 +1,8 @@
#![no_main]
use arbitrary_json::ArbitraryObject;
use flatten_serde_json::flatten;
use libfuzzer_sys::fuzz_target;
// Fuzz entry point: feed arbitrary JSON objects to `flatten` and discard the
// output, so the only thing exercised is that the call completes.
fuzz_target!(|input: ArbitraryObject| {
    let _ = flatten(&input);
});

View File

@ -0,0 +1,264 @@
#![doc = include_str!("../README.md")]
use serde_json::{json, Map, Value};
/// Flattens a JSON object into a single-level map with dot-separated keys.
///
/// Fields of nested objects are stored under `parent.child` keys, and arrays
/// are traversed so that the objects they contain are flattened the same way.
/// Values that end up under the same key are gathered in an array.
/// The input is only read; the flattened content is returned in a new map.
pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> {
    let mut flattened = Map::new();
    insert_object(&mut flattened, None, json);
    flattened
}
fn insert_object(
base_json: &mut Map<String, Value>,
base_key: Option<&str>,
object: &Map<String, Value>,
) {
for (key, value) in object {
let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}"));
if let Some(array) = value.as_array() {
insert_array(base_json, &new_key, array);
} else if let Some(object) = value.as_object() {
insert_object(base_json, Some(&new_key), object);
} else {
insert_value(base_json, &new_key, value.clone());
}
}
}
fn insert_array(base_json: &mut Map<String, Value>, base_key: &str, array: &Vec<Value>) {
for value in array {
if let Some(object) = value.as_object() {
insert_object(base_json, Some(base_key), object);
} else if let Some(sub_array) = value.as_array() {
insert_array(base_json, base_key, sub_array);
} else {
insert_value(base_json, base_key, value.clone());
}
}
}
/// Stores a scalar (neither array nor object) value under `key` in `base_json`.
///
/// - If `key` is absent, the value is inserted as is.
/// - If `key` already holds an array, the value is appended to it.
/// - Otherwise there is a collision: the existing value and the new one are
///   gathered in a two-element array, existing value first.
fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value) {
    debug_assert!(!to_insert.is_object());
    debug_assert!(!to_insert.is_array());

    // does the field already exist?
    if let Some(existing) = base_json.get_mut(key) {
        if let Some(array) = existing.as_array_mut() {
            // it is already an array: append to it
            array.push(to_insert);
        } else {
            // collision with a scalar: write the pair through the reference we
            // already hold instead of looking the key up a second time
            let previous = std::mem::take(existing);
            *existing = json!([previous, to_insert]);
        }
    } else {
        // the field does not exist: `to_insert` is already a `Value`, so it is
        // stored untouched rather than re-serialized through `json!`
        base_json.insert(key.to_string(), to_insert);
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Extracts the map out of a `json!` literal that is known to be an object.
    fn object(value: Value) -> Map<String, Value> {
        match value {
            Value::Object(map) => map,
            other => panic!("expected a JSON object, got: {other}"),
        }
    }

    /// A document without nested objects must come out unchanged.
    #[test]
    fn no_flattening() {
        let input = object(json!({
            "id": "287947",
            "title": "Shazam!",
            "release_date": 1553299200,
            "genres": [
                "Action",
                "Comedy",
                "Fantasy"
            ]
        }));
        let flat = flatten(&input);

        println!(
            "got:\n{}\nexpected:\n{}\n",
            serde_json::to_string_pretty(&flat).unwrap(),
            serde_json::to_string_pretty(&input).unwrap()
        );

        assert_eq!(flat, input);
    }

    /// Fields of a nested object are hoisted under `parent.child` keys.
    #[test]
    fn flatten_object() {
        let input = object(json!({
            "a": {
                "b": "c",
                "d": "e",
                "f": "g"
            }
        }));
        let expected = object(json!({
            "a.b": "c",
            "a.d": "e",
            "a.f": "g"
        }));

        assert_eq!(flatten(&input), expected);
    }

    /// Objects inside an array are merged field by field under the array key.
    #[test]
    fn flatten_array() {
        let input = object(json!({
            "a": [
                { "b": "c" },
                { "b": "d" },
                { "b": "e" },
            ]
        }));
        let expected = object(json!({
            "a.b": ["c", "d", "e"],
        }));

        assert_eq!(flatten(&input), expected);

        // here we must keep 42 in "a"
        let input = object(json!({
            "a": [
                42,
                { "b": "c" },
                { "b": "d" },
                { "b": "e" },
            ]
        }));
        let expected = object(json!({
            "a": 42,
            "a.b": ["c", "d", "e"],
        }));

        assert_eq!(flatten(&input), expected);
    }

    /// A generated key that collides with an existing field yields an array.
    #[test]
    fn collision_with_object() {
        let input = object(json!({
            "a": {
                "b": "c",
            },
            "a.b": "d",
        }));
        let expected = object(json!({
            "a.b": ["c", "d"],
        }));

        assert_eq!(flatten(&input), expected);
    }

    /// Same collision, but the generated keys come from objects inside an array.
    #[test]
    fn collision_with_array() {
        let input = object(json!({
            "a": [
                { "b": "c" },
                { "b": "d", "c": "e" },
                [35],
            ],
            "a.b": "f",
        }));
        let expected = object(json!({
            "a.b": ["c", "d", "f"],
            "a.c": "e",
            "a": 35,
        }));

        assert_eq!(flatten(&input), expected);
    }

    /// Arrays nested in arrays are walked with the key of the outermost array.
    #[test]
    fn flatten_nested_arrays() {
        let input = object(json!({
            "a": [
                ["b", "c"],
                { "d": "e" },
                ["f", "g"],
                [
                    { "h": "i" },
                    { "d": "j" },
                ],
                ["k", "l"],
            ]
        }));
        let expected = object(json!({
            "a": ["b", "c", "f", "g", "k", "l"],
            "a.d": ["e", "j"],
            "a.h": "i",
        }));

        assert_eq!(flatten(&input), expected);
    }

    /// Arbitrary mix of arrays and objects at several depths.
    #[test]
    fn flatten_nested_arrays_and_objects() {
        let input = object(json!({
            "a": [
                "b",
                ["c", "d"],
                { "e": ["f", "g"] },
                [
                    { "h": "i" },
                    { "e": ["j", { "z": "y" }] },
                ],
                ["l"],
                "m",
            ]
        }));
        let flat = flatten(&input);

        println!("{}", serde_json::to_string_pretty(&flat).unwrap());

        let expected = object(json!({
            "a": ["b", "c", "d", "l", "m"],
            "a.e": ["f", "g", "j"],
            "a.h": "i",
            "a.e.z": "y",
        }));

        assert_eq!(flat, expected);
    }
}

View File

@ -0,0 +1,11 @@
use std::io::stdin;
use flatten_serde_json::flatten;
use serde_json::{Map, Value};
/// Reads one JSON object from standard input, flattens it and pretty-prints
/// the result on standard output.
///
/// # Panics
///
/// Panics when standard input does not hold a valid JSON object, or when the
/// flattened object cannot be serialized.
fn main() {
    // Lock stdin once up front: the unlocked handle takes its lock on every
    // read call, and the deserializer is expected to issue many small reads.
    let json: Map<String, Value> = serde_json::from_reader(stdin().lock())
        .expect("standard input must contain a valid JSON object");
    let result = flatten(&json);
    let pretty = serde_json::to_string_pretty(&result)
        .expect("failed to serialize the flattened JSON object");
    println!("{}", pretty);
}

View File

@ -14,7 +14,7 @@ crossbeam-channel = "0.5.2"
either = "1.6.1"
fst = "0.4.7"
fxhash = "0.2.1"
flatten-serde-json = "0.1.0"
flatten-serde-json = { path = "../flatten-serde-json" }
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
geoutils = "0.4.1"
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }