mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Merge #432
432: Fuzzer r=Kerollmops a=irevoire Provide a first way of fuzzing the indexing part of milli. It depends on [cargo-fuzz](https://rust-fuzz.github.io/book/cargo-fuzz.html) Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
commit
15bbde1022
26
milli/README.md
Normal file
26
milli/README.md
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# Milli
|
||||||
|
|
||||||
|
## Fuzzing milli
|
||||||
|
|
||||||
|
Currently you can only fuzz the indexing.
|
||||||
|
To execute the fuzzer run:
|
||||||
|
```
|
||||||
|
cargo +nightly fuzz run indexing
|
||||||
|
```
|
||||||
|
|
||||||
|
To execute the fuzzer on multiple threads you can also run:
|
||||||
|
```
|
||||||
|
cargo +nightly fuzz run -j4 indexing
|
||||||
|
```
|
||||||
|
|
||||||
|
Since the fuzzer is going to create a lot of temporary files to let milli index its documents,
|
||||||
|
I would also recommend executing it on a ramdisk.
|
||||||
|
Here is how to setup a ramdisk on linux:
|
||||||
|
```
|
||||||
|
sudo mount -t tmpfs none path/to/your/ramdisk
|
||||||
|
```
|
||||||
|
And then set the [TMPDIR](https://doc.rust-lang.org/std/env/fn.temp_dir.html) environment variable
|
||||||
|
to make the fuzzer create its files in it:
|
||||||
|
```
|
||||||
|
export TMPDIR=path/to/your/ramdisk
|
||||||
|
```
|
2
milli/fuzz/.gitignore
vendored
Normal file
2
milli/fuzz/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
/corpus/
|
||||||
|
/artifacts/
|
37
milli/fuzz/Cargo.toml
Normal file
37
milli/fuzz/Cargo.toml
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
# Fuzzing harness crate for milli. It is deliberately not published and is
# kept out of the parent workspace (see the [workspace] table below).
[package]
name = "milli-fuzz"
version = "0.0.0"
authors = ["Automatically generated"]
publish = false
edition = "2018"

# Marks this crate as a cargo-fuzz target so `cargo fuzz` can discover it.
[package.metadata]
cargo-fuzz = true

[dependencies]
arbitrary = "1.0"
libfuzzer-sys = "0.4"
heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
serde_json = { version = "1.0.62", features = ["preserve_order"] }
anyhow = "1.0"
tempfile = "3.3"
arbitrary-json = "0.1.0"

# jemalloc is only pulled in on Linux, matching the cfg-gated
# #[global_allocator] in the fuzz target.
[target.'cfg(target_os = "linux")'.dependencies]
jemallocator = "0.3.2"

# The crate under test: the parent milli crate.
[dependencies.milli]
path = ".."

# Prevent this from interfering with workspaces
[workspace]
members = ["."]

# Keep debug info in release builds so fuzzer crash reports carry symbols.
[profile.release]
debug = true

[[bin]]
name = "indexing"
path = "fuzz_targets/indexing.rs"
# Fuzz binaries are driven by libFuzzer, not by `cargo test` or rustdoc.
test = false
doc = false
106
milli/fuzz/fuzz_targets/indexing.rs
Normal file
106
milli/fuzz/fuzz_targets/indexing.rs
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
#![no_main]

use std::collections::HashSet;
use std::io::{BufWriter, Cursor, Read, Seek, Write};

use anyhow::{bail, Result};
use arbitrary_json::ArbitraryValue;
use heed::EnvOpenOptions;
use libfuzzer_sys::fuzz_target;
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use milli::update::UpdateBuilder;
use milli::Index;
use serde_json::Value;

// Use jemalloc as the global allocator on Linux (the platform fuzzing runs
// on); the dependency is cfg-gated the same way in Cargo.toml.
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||||
|
|
||||||
|
/// reads json from input and write an obkv batch to writer.
|
||||||
|
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
||||||
|
let writer = BufWriter::new(writer);
|
||||||
|
let mut builder = DocumentBatchBuilder::new(writer)?;
|
||||||
|
builder.extend_from_json(input)?;
|
||||||
|
|
||||||
|
if builder.len() == 0 {
|
||||||
|
bail!("Empty payload");
|
||||||
|
}
|
||||||
|
|
||||||
|
let count = builder.finish()?;
|
||||||
|
|
||||||
|
Ok(count)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn index_documents(
|
||||||
|
index: &mut milli::Index,
|
||||||
|
documents: DocumentBatchReader<Cursor<Vec<u8>>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let update_builder = UpdateBuilder::new();
|
||||||
|
let mut wtxn = index.write_txn()?;
|
||||||
|
let builder = update_builder.index_documents(&mut wtxn, &index);
|
||||||
|
|
||||||
|
builder.execute(documents, |_| ())?;
|
||||||
|
wtxn.commit()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_index() -> Result<milli::Index> {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024 * 1024); // 10 GB
|
||||||
|
options.max_readers(1);
|
||||||
|
let index = Index::new(options, dir.path())?;
|
||||||
|
|
||||||
|
let update_builder = UpdateBuilder::new();
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let mut builder = update_builder.settings(&mut wtxn, &index);
|
||||||
|
|
||||||
|
let displayed_fields =
|
||||||
|
["id", "title", "album", "artist", "genre", "country", "released", "duration"]
|
||||||
|
.iter()
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
.collect();
|
||||||
|
builder.set_displayed_fields(displayed_fields);
|
||||||
|
|
||||||
|
let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
|
||||||
|
builder.set_searchable_fields(searchable_fields);
|
||||||
|
|
||||||
|
let faceted_fields: HashSet<String> =
|
||||||
|
["released-timestamp", "duration-float", "genre", "country", "artist"]
|
||||||
|
.iter()
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
.collect();
|
||||||
|
builder.set_filterable_fields(faceted_fields.clone());
|
||||||
|
builder.set_sortable_fields(faceted_fields);
|
||||||
|
|
||||||
|
builder.set_distinct_field("same".to_string());
|
||||||
|
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
Ok(index)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fuzz entry point: each input is a list of batches, each batch a list of
// arbitrary JSON values. Every batch is serialized to a JSON array, converted
// to the obkv batch format, then handed to the indexer.
fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
    if let Ok(mut index) = create_index() {
        for batch in batches {
            let values: Vec<Value> = batch.into_iter().map(Value::from).collect();
            let json = serde_json::to_string(&Value::Array(values)).unwrap();

            let mut documents = Cursor::new(Vec::new());

            // We ignore all malformed documents
            if read_json(json.as_bytes(), &mut documents).is_ok() {
                documents.rewind().unwrap();
                let documents = DocumentBatchReader::from_reader(documents).unwrap();
                // A lot of errors can come out of milli and we don't know which ones are normal or not
                // so we are only going to look for the unexpected panics.
                let _ = index_documents(&mut index, documents);
            }
        }

        index.prepare_for_closing().wait();
    }
});
|
Loading…
Reference in New Issue
Block a user