mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 05:14:27 +01:00
Merge pull request #494 from meilisearch/flatten-what-is-needed
Only flatten the required objects
This commit is contained in:
commit
d362278a41
@ -1,6 +1,6 @@
|
|||||||
[workspace]
|
[workspace]
|
||||||
resolver = "2"
|
resolver = "2"
|
||||||
members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"]
|
members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"]
|
||||||
default-members = ["milli"]
|
default-members = ["milli"]
|
||||||
|
|
||||||
[profile.dev]
|
[profile.dev]
|
||||||
|
16
json-depth-checker/Cargo.toml
Normal file
16
json-depth-checker/Cargo.toml
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
[package]
|
||||||
|
name = "json-depth-checker"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
serde_json = "1.0"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
criterion = "0.3"
|
||||||
|
|
||||||
|
[[bench]]
|
||||||
|
name = "depth"
|
||||||
|
harness = false
|
59
json-depth-checker/benches/depth.rs
Normal file
59
json-depth-checker/benches/depth.rs
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
use criterion::{criterion_group, criterion_main, Criterion};
|
||||||
|
use json_depth_checker::should_flatten_from_unchecked_slice;
|
||||||
|
use serde_json::json;
|
||||||
|
|
||||||
|
fn criterion_benchmark(c: &mut Criterion) {
|
||||||
|
let null = serde_json::to_vec(&json!(null)).unwrap();
|
||||||
|
let bool_true = serde_json::to_vec(&json!(true)).unwrap();
|
||||||
|
let bool_false = serde_json::to_vec(&json!(false)).unwrap();
|
||||||
|
let integer = serde_json::to_vec(&json!(42)).unwrap();
|
||||||
|
let float = serde_json::to_vec(&json!(1456.258)).unwrap();
|
||||||
|
let string = serde_json::to_vec(&json!("hello world")).unwrap();
|
||||||
|
let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap();
|
||||||
|
let complex_object = serde_json::to_vec(&json!({
|
||||||
|
"doggos": [
|
||||||
|
{ "bernard": true },
|
||||||
|
{ "michel": 42 },
|
||||||
|
false,
|
||||||
|
],
|
||||||
|
"bouvier": true,
|
||||||
|
"caniche": null,
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
let simple_array = serde_json::to_vec(&json!([
|
||||||
|
1,
|
||||||
|
2,
|
||||||
|
3,
|
||||||
|
"viva",
|
||||||
|
"l\"algeria",
|
||||||
|
true,
|
||||||
|
"[array]",
|
||||||
|
"escaped string \""
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
let array_of_array = serde_json::to_vec(&json!([1, [2, [3]]])).unwrap();
|
||||||
|
let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap();
|
||||||
|
|
||||||
|
c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null)));
|
||||||
|
c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true)));
|
||||||
|
c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false)));
|
||||||
|
c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer)));
|
||||||
|
c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float)));
|
||||||
|
c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string)));
|
||||||
|
c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object)));
|
||||||
|
c.bench_function("complex object", |b| {
|
||||||
|
b.iter(|| should_flatten_from_unchecked_slice(&complex_object))
|
||||||
|
});
|
||||||
|
c.bench_function("simple array", |b| {
|
||||||
|
b.iter(|| should_flatten_from_unchecked_slice(&simple_array))
|
||||||
|
});
|
||||||
|
c.bench_function("array of array", |b| {
|
||||||
|
b.iter(|| should_flatten_from_unchecked_slice(&array_of_array))
|
||||||
|
});
|
||||||
|
c.bench_function("array of object", |b| {
|
||||||
|
b.iter(|| should_flatten_from_unchecked_slice(&array_of_object))
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group!(benches, criterion_benchmark);
|
||||||
|
criterion_main!(benches);
|
27
json-depth-checker/fuzz/Cargo.toml
Normal file
27
json-depth-checker/fuzz/Cargo.toml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
[package]
|
||||||
|
name = "json-depth-checker"
|
||||||
|
version = "0.0.0"
|
||||||
|
authors = ["Automatically generated"]
|
||||||
|
publish = false
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
|
[package.metadata]
|
||||||
|
cargo-fuzz = true
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
libfuzzer-sys = "0.4"
|
||||||
|
arbitrary-json = "0.1.1"
|
||||||
|
serde_json = "1.0.79"
|
||||||
|
|
||||||
|
[dependencies.json-depth-checker]
|
||||||
|
path = ".."
|
||||||
|
|
||||||
|
# Prevent this from interfering with workspaces
|
||||||
|
[workspace]
|
||||||
|
members = ["."]
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "depth"
|
||||||
|
path = "fuzz_targets/depth.rs"
|
||||||
|
test = false
|
||||||
|
doc = false
|
13
json-depth-checker/fuzz/fuzz_targets/depth.rs
Normal file
13
json-depth-checker/fuzz/fuzz_targets/depth.rs
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
#![no_main]
|
||||||
|
use arbitrary_json::ArbitraryValue;
|
||||||
|
use json_depth_checker::*;
|
||||||
|
use libfuzzer_sys::fuzz_target;
|
||||||
|
|
||||||
|
fuzz_target!(|value: ArbitraryValue| {
|
||||||
|
let value = serde_json::Value::from(value);
|
||||||
|
let left = should_flatten_from_value(&value);
|
||||||
|
let value = serde_json::to_vec(&value).unwrap();
|
||||||
|
let right = should_flatten_from_unchecked_slice(&value);
|
||||||
|
|
||||||
|
assert_eq!(left, right);
|
||||||
|
});
|
114
json-depth-checker/src/lib.rs
Normal file
114
json-depth-checker/src/lib.rs
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
/// Your json MUST BE valid and generated by `serde_json::to_vec` before being
|
||||||
|
/// sent in this function. This function is DUMB and FAST but makes a lot of
|
||||||
|
/// asumption about the way `serde_json` will generate its input.
|
||||||
|
///
|
||||||
|
/// Will return `true` if the JSON contains an object, an array of array
|
||||||
|
/// or an array containing an object. Returns `false` for everything else.
|
||||||
|
pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool {
|
||||||
|
if json.is_empty() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// since the json we receive has been generated by serde_json we know
|
||||||
|
// it doesn't contains any whitespace at the beginning thus we can check
|
||||||
|
// directly if we're looking at an object.
|
||||||
|
if json[0] == b'{' {
|
||||||
|
return true;
|
||||||
|
} else if json[0] != b'[' {
|
||||||
|
// if the json isn't an object or an array it means it's a simple value.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The array case is a little bit more complex. We are looking for a second
|
||||||
|
// `[` but we need to ensure that it doesn't appear inside of a string. Thus
|
||||||
|
// we need to keep track of if we're in a string or not.
|
||||||
|
|
||||||
|
// will be used when we met a `\` to skip the next character.
|
||||||
|
let mut skip_next = false;
|
||||||
|
let mut in_string = false;
|
||||||
|
|
||||||
|
for byte in json.iter().skip(1) {
|
||||||
|
match byte {
|
||||||
|
// handle the backlash.
|
||||||
|
_ if skip_next => skip_next = false,
|
||||||
|
b'\\' => skip_next = true,
|
||||||
|
|
||||||
|
// handle the strings.
|
||||||
|
byte if in_string => {
|
||||||
|
if *byte == b'"' {
|
||||||
|
in_string = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b'"' => in_string = true,
|
||||||
|
|
||||||
|
// handle the arrays.
|
||||||
|
b'[' => return true,
|
||||||
|
// since we know the json is valid we don't need to ensure the
|
||||||
|
// array is correctly closed
|
||||||
|
|
||||||
|
// handle the objects.
|
||||||
|
b'{' => return true,
|
||||||
|
|
||||||
|
// ignore everything else
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consider using [`should_flatten_from_unchecked_slice`] when you can.
|
||||||
|
/// Will returns `true` if the json contains an object, an array of array
|
||||||
|
/// or an array containing an object.
|
||||||
|
/// Returns `false` for everything else.
|
||||||
|
/// This function has been written to test the [`should_flatten_from_unchecked_slice`].
|
||||||
|
pub fn should_flatten_from_value(json: &Value) -> bool {
|
||||||
|
match json {
|
||||||
|
Value::Object(..) => true,
|
||||||
|
Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()),
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use serde_json::*;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_shouldnt_flatten() {
|
||||||
|
let shouldnt_flatten = vec![
|
||||||
|
json!(null),
|
||||||
|
json!(true),
|
||||||
|
json!(false),
|
||||||
|
json!("a superb string"),
|
||||||
|
json!("a string escaping other \"string\""),
|
||||||
|
json!([null, true, false]),
|
||||||
|
json!(["hello", "world", "!"]),
|
||||||
|
json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]),
|
||||||
|
];
|
||||||
|
for value in shouldnt_flatten {
|
||||||
|
assert!(!should_flatten_from_value(&value));
|
||||||
|
let value = serde_json::to_vec(&value).unwrap();
|
||||||
|
assert!(!should_flatten_from_unchecked_slice(&value));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_should_flatten() {
|
||||||
|
let should_flatten = vec![
|
||||||
|
json!({}),
|
||||||
|
json!({ "hello": "world" }),
|
||||||
|
json!(["hello", ["world"]]),
|
||||||
|
json!([true, true, true, true, true, true, true, true, true, {}]),
|
||||||
|
];
|
||||||
|
for value in should_flatten {
|
||||||
|
assert!(should_flatten_from_value(&value));
|
||||||
|
let value = serde_json::to_vec(&value).unwrap();
|
||||||
|
assert!(should_flatten_from_unchecked_slice(&value));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -18,6 +18,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
|
|||||||
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
|
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
|
||||||
geoutils = "0.4.1"
|
geoutils = "0.4.1"
|
||||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
||||||
|
json-depth-checker = { path = "../json-depth-checker" }
|
||||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||||
meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" }
|
meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" }
|
||||||
memmap2 = "0.5.3"
|
memmap2 = "0.5.3"
|
||||||
|
@ -286,9 +286,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
})?;
|
})?;
|
||||||
|
|
||||||
self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
|
self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
|
||||||
let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?;
|
match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? {
|
||||||
|
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
|
||||||
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
|
None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?,
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
self.new_documents_ids.insert(docid);
|
self.new_documents_ids.insert(docid);
|
||||||
}
|
}
|
||||||
@ -300,8 +301,12 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
if let Some(flatten) = flattened_document {
|
if let Some(flatten) = flattened_document {
|
||||||
self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?;
|
self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?;
|
||||||
} else {
|
} else {
|
||||||
let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?;
|
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
|
||||||
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?;
|
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
|
||||||
|
None => {
|
||||||
|
self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
|
||||||
@ -326,8 +331,15 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Flatten a document from the fields ids map contained in self and insert the new
|
// Flatten a document from the fields ids map contained in self and insert the new
|
||||||
// created fields.
|
// created fields. Returns `None` if the document doesn't need to be flattened.
|
||||||
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> {
|
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
|
||||||
|
if obkv
|
||||||
|
.iter()
|
||||||
|
.all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value))
|
||||||
|
{
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
let mut doc = serde_json::Map::new();
|
let mut doc = serde_json::Map::new();
|
||||||
|
|
||||||
for (k, v) in obkv.iter() {
|
for (k, v) in obkv.iter() {
|
||||||
@ -357,7 +369,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
writer.insert(fid, &value)?;
|
writer.insert(fid, &value)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(buffer)
|
Ok(Some(buffer))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flatten a document from a field mapping generated by [create_fields_mapping]
|
// Flatten a document from a field mapping generated by [create_fields_mapping]
|
||||||
|
Loading…
Reference in New Issue
Block a user