move the fuzzer to its own crate

2025-01-22 03:07:27 +01:00 · 2023-05-29 12:27:39 +02:00 · 2023-05-29 12:27:39 +02:00 · 6c6387d05e
commit 6c6387d05e
parent 002f42875f
8 changed files with 531 additions and 396 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -13,7 +13,8 @@ members = [
    "filter-parser",
    "flatten-serde-json",
    "json-depth-checker",
-    "benchmarks"
+    "benchmarks",
+    "fuzzers",
 ]

 [workspace.package]
--- a/fuzzers/Cargo.toml
+++ b/fuzzers/Cargo.toml
@ -0,0 +1,20 @@
+[package]
+name = "fuzzers"
+publish = false
+
+version.workspace = true
+authors.workspace = true
+description.workspace = true
+homepage.workspace = true
+readme.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+arbitrary = { version = "1.3.0", features = ["derive"] }
+clap = { version = "4.3.0", features = ["derive"] }
+fastrand = "1.9.0"
+milli = { path = "../milli" }
+serde = { version = "1.0.160", features = ["derive"] }
+serde_json = { version = "1.0.95", features = ["preserve_order"] }
+tempfile = "3.5.0"
--- a/fuzzers/README.md
+++ b/fuzzers/README.md
@ -0,0 +1,3 @@
+# Fuzzers
+
+The purpose of this crate is to contain all the handmade "fuzzers" we may need.
--- a/fuzzers/src/bin/fuzz.rs
+++ b/fuzzers/src/bin/fuzz.rs
@ -0,0 +1,136 @@
+use std::num::NonZeroUsize;
+use std::path::PathBuf;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::Duration;
+
+use arbitrary::{Arbitrary, Unstructured};
+use clap::Parser;
+use fuzzers::Operation;
+use milli::heed::EnvOpenOptions;
+use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
+use milli::Index;
+use tempfile::TempDir;
+
+/// A fixed-size group of five [`Operation`]s.
+///
+/// All the operations of a batch are registered on the same `IndexDocuments`
+/// builder and executed together.
+#[derive(Debug, Arbitrary)]
+struct Batch([Operation; 5]);
+
+// Command line options of the fuzzer.
+// (Kept as a regular comment on purpose: a doc comment on the struct would be
+// picked up by clap and shown in the help output.)
+#[derive(Debug, Clone, Parser)]
+struct Opt {
+    /// The number of fuzzers to run in parallel.
+    /// Defaults to the parallelism available on the machine.
+    #[clap(long)]
+    par: Option<NonZeroUsize>,
+    // We need to put a lot of newlines in the following documentation or else everything gets collapsed on one line
+    /// The path in which the databases will be created.
+    /// Using a ramdisk is recommended.
+    ///
+    /// Linux:
+    ///
+    /// sudo mount -t tmpfs -o size=2g tmpfs ramdisk # to create it
+    ///
+    /// sudo umount ramdisk # to remove it
+    ///
+    /// MacOS:
+    ///
+    /// diskutil erasevolume HFS+ 'RAM Disk' `hdiutil attach -nobrowse -nomount ram://4194304` # to create it
+    ///
+    /// hdiutil detach /dev/:the_disk # to remove it
+    ///
+    #[clap(long)]
+    path: Option<PathBuf>,
+}
+
+/// Runs the fuzzer until one of the workers crashes.
+///
+/// `--par` worker threads are spawned (defaulting to the available
+/// parallelism). Each worker opens its own index in a fresh temporary
+/// directory, created under `--path` when it is given, and then loops forever:
+/// it derives five batches of operations from random bytes, applies them in a
+/// single write transaction, checks after every batch that the index can still
+/// be searched and its documents read, and finally aborts the transaction.
+///
+/// An extra thread prints the number of executed batches once per second.
+///
+/// # Panics
+///
+/// Any failure inside a worker panics that worker. Failures of the indexing
+/// operations and of the checks carry the debug representation of the
+/// generated input as panic message; the panic is then propagated when the
+/// worker is joined.
+fn main() {
+    // Number of batches executed so far, shared by the workers and the reporter.
+    static PROGRESSION: AtomicUsize = AtomicUsize::new(0);
+
+    let opt = Opt::parse();
+
+    let par = opt.par.unwrap_or_else(|| std::thread::available_parallelism().unwrap()).get();
+    let mut handles = Vec::with_capacity(par);
+
+    for _ in 0..par {
+        let opt = opt.clone();
+
+        let handle = std::thread::spawn(move || {
+            let mut options = EnvOpenOptions::new();
+            options.map_size(1024 * 1024 * 1024 * 1024);
+            let tempdir = match opt.path {
+                Some(path) => TempDir::new_in(path).unwrap(),
+                None => TempDir::new().unwrap(),
+            };
+            let index = Index::new(options, tempdir.path()).unwrap();
+            let indexer_config = IndexerConfig::default();
+            let index_documents_config = IndexDocumentsConfig::default();
+
+            loop {
+                // 1000 random bytes are the raw material `arbitrary` builds the batches from
+                let v: Vec<u8> = std::iter::repeat_with(|| fastrand::u8(..)).take(1000).collect();
+
+                let mut data = Unstructured::new(&v);
+                let batches = <[Batch; 5]>::arbitrary(&mut data).unwrap();
+                // will be used to display the error once a thread crashes
+                let dbg_input = format!("{:#?}", batches);
+
+                let mut wtxn = index.write_txn().unwrap();
+
+                for batch in batches {
+                    let mut builder = IndexDocuments::new(
+                        &mut wtxn,
+                        &index,
+                        &indexer_config,
+                        index_documents_config.clone(),
+                        |_| (),
+                        || false,
+                    )
+                    .unwrap();
+
+                    for op in batch.0 {
+                        match op {
+                            Operation::AddDoc(doc) => {
+                                let documents =
+                                    milli::documents::objects_from_json_value(doc.to_d());
+                                let documents =
+                                    milli::documents::documents_batch_reader_from_objects(
+                                        documents,
+                                    );
+                                let (b, _added) =
+                                    builder.add_documents(documents).expect(&dbg_input);
+                                builder = b;
+                            }
+                            Operation::DeleteDoc(id) => {
+                                // report the failing input here too, like every other
+                                // indexing operation of the loop
+                                let (b, _removed) =
+                                    builder.remove_documents(vec![id.to_s()]).expect(&dbg_input);
+                                builder = b;
+                            }
+                        }
+                    }
+                    builder.execute().expect(&dbg_input);
+
+                    // after executing a batch we check if the database is corrupted
+                    let res = index.search(&wtxn).execute().expect(&dbg_input);
+                    index.documents(&wtxn, res.documents_ids).expect(&dbg_input);
+                    PROGRESSION.fetch_add(1, Ordering::Relaxed);
+                }
+                // the transaction is aborted rather than committed
+                wtxn.abort().unwrap();
+            }
+        });
+        handles.push(handle);
+    }
+
+    // reporter thread: prints the progression once per second
+    std::thread::spawn(|| {
+        let mut last_value = 0;
+        let start = std::time::Instant::now();
+        loop {
+            let total = PROGRESSION.load(Ordering::Relaxed);
+            println!(
+                "Has been running for {:?}. Tested {} new values for a total of {}.",
+                start.elapsed(),
+                total - last_value,
+                total
+            );
+            last_value = total;
+            std::thread::sleep(Duration::from_secs(1));
+        }
+    });
+
+    // the workers never return on their own, so a join only completes once a worker panicked
+    for handle in handles {
+        handle.join().unwrap();
+    }
+}
--- a/fuzzers/src/lib.rs
+++ b/fuzzers/src/lib.rs
@ -0,0 +1,46 @@
+use arbitrary::Arbitrary;
+use serde_json::{json, Value};
+
+/// The closed set of documents the fuzzer can add to an index.
+///
+/// Every variant maps to one fixed JSON object, see [`Document::to_d`].
+#[derive(Debug, Arbitrary)]
+pub enum Document {
+    One,
+    Two,
+    Three,
+    Four,
+    Five,
+    Six,
+}
+
+impl Document {
+    /// Builds the JSON object associated with this variant.
+    ///
+    /// The object always holds an `"id"` (`0` or `1`) followed by a single
+    /// string field, either `"doggo"` or `"catto"`.
+    pub fn to_d(&self) -> Value {
+        let (id, field, value) = match self {
+            Document::One => (0, "doggo", "bernese"),
+            Document::Two => (0, "doggo", "golden"),
+            Document::Three => (0, "catto", "jorts"),
+            Document::Four => (1, "doggo", "bernese"),
+            Document::Five => (1, "doggo", "golden"),
+            Document::Six => (1, "catto", "jorts"),
+        };
+        json!({ "id": id, field: value })
+    }
+}
+
+/// The two document ids the fuzzer can target.
+#[derive(Debug, Arbitrary)]
+pub enum DocId {
+    Zero,
+    One,
+}
+
+impl DocId {
+    /// Returns the id as an owned string: `"0"` for `Zero`, `"1"` for `One`.
+    pub fn to_s(&self) -> String {
+        let digit = match self {
+            DocId::Zero => "0",
+            DocId::One => "1",
+        };
+        String::from(digit)
+    }
+}
+
+/// A single step of a fuzzing run.
+#[derive(Debug, Arbitrary)]
+pub enum Operation {
+    /// Carries the [`Document`] to add.
+    AddDoc(Document),
+    /// Carries the [`DocId`] of the document to delete.
+    DeleteDoc(DocId),
+}
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@ -65,13 +65,6 @@ maplit = "1.0.2"
 md5 = "0.7.0"
 rand = {version = "0.8.5", features = ["small_rng"] }

-# fuzzing
-arbitrary = { version = "1.3.0", features = ["derive"] }
-fastrand = "1.9.0"
-
-[target.'cfg(fuzzing)'.dev-dependencies]
-fuzzcheck = "0.12.1"
-
 [features]
 all-tokenizations = ["charabia/default"]

--- a/milli/examples/fuzz.rs
+++ b/milli/examples/fuzz.rs
@ -52,7 +52,9 @@ enum Operation {
 #[derive(Debug, Arbitrary)]
 struct Batch([Operation; 5]);

-fn main() {
+#[test]
+#[ignore]
+fn fuzz() {
    let mut options = EnvOpenOptions::new();
    options.map_size(1024 * 1024 * 1024 * 1024);
    let _tempdir = TempDir::new().unwrap();