add a fuzzer on milli

2025-07-03 20:07:09 +02:00 · 2022-01-12 17:57:54 +01:00 · 2022-01-12 17:57:54 +01:00 · e1053989c0
commit e1053989c0
parent 559e019de1
4 changed files with 140 additions and 0 deletions
--- a/milli/README.md
+++ b/milli/README.md
@ -0,0 +1,26 @@
+# Milli
+
+## Fuzzing milli
+
+Currently, only indexing can be fuzzed.
+To execute the fuzzer run:
+```
+cargo fuzz run indexing
+```
+
+To execute the fuzzer on multiple threads, you can also run:
+```
+cargo fuzz run -j4 indexing
+```
+
+Since the fuzzer creates a lot of temporary files to let milli index its documents,
+I would also recommend running it on a ramdisk.
+Here is how to set up a ramdisk on Linux:
+```
+sudo mount -t tmpfs none path/to/your/ramdisk
+```
+Then set the [TMPDIR](https://doc.rust-lang.org/std/env/fn.temp_dir.html) environment variable
+so that the fuzzer creates its temporary files there:
+```
+export TMPDIR=path/to/your/ramdisk
+```
--- a/milli/fuzz/.gitignore
+++ b/milli/fuzz/.gitignore
@ -0,0 +1,2 @@
+/corpus/
+/artifacts/
--- a/milli/fuzz/Cargo.toml
+++ b/milli/fuzz/Cargo.toml
@ -0,0 +1,36 @@
+[package]
+name = "milli-fuzz"
+version = "0.0.0"
+authors = ["Automatically generated"]
+publish = false
+edition = "2018"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
+serde_json = { version = "1.0.62", features = ["preserve_order"] }
+anyhow = "1.0"
+tempfile = "3.3"
+arbitrary-json = { path = "../../../arbitrary-json" }
+
+[target.'cfg(target_os = "linux")'.dependencies]
+jemallocator = "0.3.2"
+
+[dependencies.milli]
+path = ".."
+
+# Prevent this from interfering with workspaces
+[workspace]
+members = ["."]
+
+[profile.release]
+debug = true
+
+[[bin]]
+name = "indexing"
+path = "fuzz_targets/indexing.rs"
+test = false
+doc = false
--- a/milli/fuzz/fuzz_targets/indexing.rs
+++ b/milli/fuzz/fuzz_targets/indexing.rs
@ -0,0 +1,76 @@
+#![no_main]
+
+use std::io::{BufWriter, Cursor, Read, Seek, Write};
+
+use anyhow::{bail, Result};
+use arbitrary_json::ArbitraryValue;
+use heed::EnvOpenOptions;
+use libfuzzer_sys::fuzz_target;
+use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
+use milli::update::UpdateBuilder;
+use milli::Index;
+use serde_json::Value;
+
+#[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
+/// Reads JSON from `input` and writes an obkv batch to `writer`.
+pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
+    let writer = BufWriter::new(writer);
+    let mut builder = DocumentBatchBuilder::new(writer)?;
+    builder.extend_from_json(input)?;
+
+    if builder.len() == 0 {
+        bail!("Empty payload");
+    }
+
+    let count = builder.finish()?;
+
+    Ok(count)
+}
+
+fn index_documents(
+    index: &mut milli::Index,
+    documents: DocumentBatchReader<Cursor<Vec<u8>>>,
+) -> Result<()> {
+    let update_builder = UpdateBuilder::new();
+    let mut wtxn = index.write_txn()?;
+    let builder = update_builder.index_documents(&mut wtxn, &index);
+
+    builder.execute(documents, |_| ())?;
+    wtxn.commit()?;
+    Ok(())
+}
+
+fn create_index() -> Result<milli::Index> {
+    let dir = tempfile::tempdir().unwrap();
+    let mut options = EnvOpenOptions::new();
+    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+    options.max_readers(1);
+    Ok(Index::new(options, dir.path())?)
+}
+
+fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
+    if let Ok(mut index) = create_index() {
+        for batch in batches {
+            let documents: Vec<Value> =
+                batch.into_iter().map(|value| serde_json::Value::from(value)).collect();
+            let json = Value::Array(documents);
+            let json = serde_json::to_string(&json).unwrap();
+
+            let mut documents = Cursor::new(Vec::new());
+
+            // We ignore all badly generated documents
+            if let Ok(_count) = read_json(json.as_bytes(), &mut documents) {
+                let documents = DocumentBatchReader::from_reader(documents).unwrap();
+                match index_documents(&mut index, documents) {
+                    // Err(e @ InternalError(_) | e @ IoError(_)) => panic!("{:?}", e),
+                    _ => (),
+                }
+            }
+        }
+
+        index.prepare_for_closing().wait();
+    }
+});