add a fuzzer on milli

This commit is contained in:
Tamo 2022-01-12 17:57:54 +01:00
parent 559e019de1
commit e1053989c0
No known key found for this signature in database
GPG Key ID: 20CD8020AFA88D69
4 changed files with 140 additions and 0 deletions

26
milli/README.md Normal file
View File

@ -0,0 +1,26 @@
# Milli
## Fuzzing milli
Currently, only the indexing process can be fuzzed.
To execute the fuzzer, run:
```
cargo fuzz run indexing
```
To execute the fuzzer on multiple threads, you can also run:
```
cargo fuzz run -j4 indexing
```
Since the fuzzer is going to create a lot of temporary files to let milli index its documents,
I would also recommend executing it on a ramdisk.
Here is how to set up a ramdisk on Linux:
```
sudo mount -t tmpfs none path/to/your/ramdisk
```
And then set the [TMPDIR](https://doc.rust-lang.org/std/env/fn.temp_dir.html) environment variable
to make the fuzzer create its files in it:
```
export TMPDIR=path/to/your/ramdisk
```

2
milli/fuzz/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/corpus/
/artifacts/

36
milli/fuzz/Cargo.toml Normal file
View File

@ -0,0 +1,36 @@
[package]
name = "milli-fuzz"
version = "0.0.0"
authors = ["Automatically generated"]
publish = false
edition = "2018"
[package.metadata]
cargo-fuzz = true
[dependencies]
libfuzzer-sys = "0.4"
heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
serde_json = { version = "1.0.62", features = ["preserve_order"] }
anyhow = "1.0"
tempfile = "3.3"
arbitrary-json = { path = "../../../arbitrary-json" }
[target.'cfg(target_os = "linux")'.dependencies]
jemallocator = "0.3.2"
[dependencies.milli]
path = ".."
# Prevent this from interfering with workspaces
[workspace]
members = ["."]
[profile.release]
debug = true
[[bin]]
name = "indexing"
path = "fuzz_targets/indexing.rs"
test = false
doc = false

View File

@ -0,0 +1,76 @@
#![no_main]
use std::io::{BufWriter, Cursor, Read, Seek, Write};
use anyhow::{bail, Result};
use arbitrary_json::ArbitraryValue;
use heed::EnvOpenOptions;
use libfuzzer_sys::fuzz_target;
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use milli::update::UpdateBuilder;
use milli::Index;
use serde_json::Value;
// On Linux builds only, install jemalloc as the process-wide global allocator
// for the fuzz target; other targets keep the default allocator.
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
/// Reads JSON from `input` and writes the corresponding obkv document batch to `writer`.
///
/// `writer` is wrapped in a [`BufWriter`] before being handed to the batch builder.
///
/// # Errors
///
/// Fails when the batch builder cannot be created, when the JSON cannot be added to
/// the batch, when the builder reports a length of zero after reading `input`
/// ("Empty payload"), or when finishing the batch fails.
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
    let mut batch = DocumentBatchBuilder::new(BufWriter::new(writer))?;
    batch.extend_from_json(input)?;

    match batch.len() {
        // Nothing was parsed out of the payload: reject it.
        0 => bail!("Empty payload"),
        // Finalize the batch and hand back the count reported by the builder.
        _ => Ok(batch.finish()?),
    }
}
/// Indexes the `documents` batch into `index` inside a single write transaction.
///
/// The transaction is committed only when the indexing step succeeds; any error
/// from opening the transaction, executing the indexing or committing is
/// propagated to the caller.
fn index_documents(
    index: &mut milli::Index,
    documents: DocumentBatchReader<Cursor<Vec<u8>>>,
) -> Result<()> {
    let updates = UpdateBuilder::new();
    let mut txn = index.write_txn()?;

    // The closure passed to `execute` ignores whatever it is called with.
    updates.index_documents(&mut txn, &*index).execute(documents, |_| ())?;

    txn.commit()?;
    Ok(())
}
/// Creates a new milli index stored in a freshly created temporary directory.
///
/// The environment options are set to a 100 GiB map size and a single reader.
///
/// # Errors
///
/// Returns an error when the temporary directory cannot be created or when
/// `Index::new` fails.
fn create_index() -> Result<milli::Index> {
    // Propagate the I/O error rather than panicking: a panic here would be reported
    // by the fuzzer as a crash even though it only reflects an environment problem
    // (e.g. a full or missing TMPDIR). The caller already skips the run on `Err`.
    let dir = tempfile::tempdir()?;

    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GiB
    options.max_readers(1);

    // NOTE(review): `dir` is a `TempDir` guard that is dropped when this function
    // returns, which removes the directory while the index is still in use.
    // Presumably this works because the environment keeps its files open — confirm,
    // or return the guard alongside the index so it outlives it.
    Ok(Index::new(options, dir.path())?)
}
// Fuzz entry point: every input is a list of batches, each batch being a list of
// arbitrary JSON values that are indexed one batch after the other into a fresh index.
fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
    match create_index() {
        Ok(mut index) => {
            for batch in batches {
                // Serialize the batch as a JSON array of documents.
                let payload = Value::Array(batch.into_iter().map(Value::from).collect());
                let payload = serde_json::to_string(&payload).unwrap();

                let mut buffer = Cursor::new(Vec::new());
                // We ignore all badly generated documents
                if read_json(payload.as_bytes(), &mut buffer).is_err() {
                    continue;
                }

                let reader = DocumentBatchReader::from_reader(buffer).unwrap();
                // Indexing errors are deliberately discarded, so only panics raised
                // while indexing surface as fuzzer findings.
                // TODO(review): consider panicking on internal and I/O errors.
                let _ = index_documents(&mut index, reader);
            }

            index.prepare_for_closing().wait();
        }
        // No index could be created: there is nothing to exercise for this input.
        Err(_) => {}
    }
});