MeiliSearch/milli/fuzz/fuzz_targets/indexing.rs

#![no_main]

use std::collections::HashSet;
use std::io::{BufWriter, Cursor, Read, Seek, Write};

use anyhow::{bail, Result};
use arbitrary_json::ArbitraryValue;
use heed::EnvOpenOptions;
use libfuzzer_sys::fuzz_target;
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use milli::update::UpdateBuilder;
use milli::Index;
use serde_json::Value;

// On Linux targets only, register jemalloc as the process-wide global allocator.
// Other targets do not compile this item and keep the default allocator.
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

/// Parses the JSON read from `input` and writes it to `writer` as an obkv document batch.
///
/// The writer is wrapped in a [`BufWriter`] before being handed to the batch builder.
///
/// # Errors
///
/// Returns an error when the batch builder cannot be created, when the JSON cannot be
/// added to the batch, when the batch ends up holding nothing (`"Empty payload"`), or
/// when finalizing the batch fails.
///
/// On success, returns the count reported by the builder's `finish` call.
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
    let mut batch = DocumentBatchBuilder::new(BufWriter::new(writer))?;
    batch.extend_from_json(input)?;

    match batch.len() {
        // Nothing was added to the batch: treat the payload as invalid.
        0 => bail!("Empty payload"),
        _ => Ok(batch.finish()?),
    }
}

/// Indexes one document batch into `index` inside a single write transaction.
///
/// The transaction is committed only after the indexing step succeeded.
///
/// # Errors
///
/// Propagates any failure from opening the write transaction, executing the
/// indexing update, or committing the transaction.
fn index_documents(
    index: &mut milli::Index,
    documents: DocumentBatchReader<Cursor<Vec<u8>>>,
) -> Result<()> {
    let update = UpdateBuilder::new();
    let mut txn = index.write_txn()?;

    // The callback handed to `execute` ignores whatever it is given.
    update.index_documents(&mut txn, &*index).execute(documents, |_| ())?;

    txn.commit()?;
    Ok(())
}

/// Creates an index in a fresh temporary directory and applies a fixed set of settings
/// (displayed, searchable, filterable and sortable fields plus a distinct field).
///
/// # Errors
///
/// Only the failure to open the index is returned as an error.
///
/// # Panics
///
/// Panics if the temporary directory cannot be created, if the write transaction
/// cannot be opened, or if applying or committing the settings fails.
fn create_index() -> Result<milli::Index> {
    /// Copies a list of string slices into any collection of owned `String`s.
    fn owned<C: std::iter::FromIterator<String>>(names: &[&str]) -> C {
        names.iter().map(|name| (*name).to_owned()).collect()
    }

    // NOTE(review): `dir` is a local that is never returned, so it is dropped when this
    // function exits while the returned index was opened at its path. Confirm that the
    // temporary directory being cleaned up at that point is acceptable here.
    let dir = tempfile::tempdir().unwrap();

    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024 * 1024); // 10 GB
    options.max_readers(1);
    let index = Index::new(options, dir.path())?;

    let update_builder = UpdateBuilder::new();
    let mut wtxn = index.write_txn().unwrap();
    let mut settings = update_builder.settings(&mut wtxn, &index);

    settings.set_displayed_fields(owned(&[
        "id", "title", "album", "artist", "genre", "country", "released", "duration",
    ]));
    settings.set_searchable_fields(owned(&["title", "album", "artist"]));

    // The very same set of fields is used for both filtering and sorting.
    let faceted_fields: HashSet<String> =
        owned(&["released-timestamp", "duration-float", "genre", "country", "artist"]);
    settings.set_filterable_fields(faceted_fields.clone());
    settings.set_sortable_fields(faceted_fields);

    settings.set_distinct_field("same".to_string());

    // The callback handed to `execute` ignores whatever it is given.
    settings.execute(|_| ()).unwrap();
    wtxn.commit().unwrap();

    Ok(index)
}

// Fuzz entry point. Each inner `Vec` is one batch of arbitrary JSON values: it is
// serialized as a JSON array, converted into a document batch and indexed into an
// index created for this run. If the index cannot be created the input is skipped.
fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
    if let Ok(mut index) = create_index() {
        for batch in batches {
            let payload = Value::Array(batch.into_iter().map(Value::from).collect());
            let payload = serde_json::to_string(&payload).unwrap();

            let mut buffer = Cursor::new(Vec::new());

            // Payloads that cannot be turned into a document batch are skipped.
            if read_json(payload.as_bytes(), &mut buffer).is_ok() {
                // The batch was just written, so go back to the start before reading it.
                buffer.rewind().unwrap();
                let reader = DocumentBatchReader::from_reader(buffer).unwrap();

                // An error returned by the indexing step is discarded on purpose.
                let _ = index_documents(&mut index, reader);
            }
        }

        index.prepare_for_closing().wait();
    }
});
add a fuzzer on milli 2022-01-12 17:57:54 +01:00			`#![no_main]`

add some settings to the fuzzed milli and use the published version of arbitrary json 2022-01-13 15:35:24 +01:00			`use std::collections::HashSet;`
add a fuzzer on milli 2022-01-12 17:57:54 +01:00			`use std::io::{BufWriter, Cursor, Read, Seek, Write};`

			`use anyhow::{bail, Result};`
			`use arbitrary_json::ArbitraryValue;`
			`use heed::EnvOpenOptions;`
			`use libfuzzer_sys::fuzz_target;`
			`use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};`
			`use milli::update::UpdateBuilder;`
			`use milli::Index;`
			`use serde_json::Value;`

			`#[cfg(target_os = "linux")]`
			`#[global_allocator]`
			`static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;`

			`/// reads json from input and write an obkv batch to writer.`
			`pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {`
			`let writer = BufWriter::new(writer);`
			`let mut builder = DocumentBatchBuilder::new(writer)?;`
			`builder.extend_from_json(input)?;`

			`if builder.len() == 0 {`
			`bail!("Empty payload");`
			`}`

			`let count = builder.finish()?;`

			`Ok(count)`
			`}`

			`fn index_documents(`
			`index: &mut milli::Index,`
			`documents: DocumentBatchReader<Cursor<Vec<u8>>>,`
			`) -> Result<()> {`
			`let update_builder = UpdateBuilder::new();`
			`let mut wtxn = index.write_txn()?;`
			`let builder = update_builder.index_documents(&mut wtxn, &index);`

			`builder.execute(documents, \|_\| ())?;`
			`wtxn.commit()?;`
			`Ok(())`
			`}`

			`fn create_index() -> Result<milli::Index> {`
			`let dir = tempfile::tempdir().unwrap();`
			`let mut options = EnvOpenOptions::new();`
add some settings to the fuzzed milli and use the published version of arbitrary json 2022-01-13 15:35:24 +01:00			`options.map_size(10 * 1024 * 1024 * 1024); // 10 GB`
add a fuzzer on milli 2022-01-12 17:57:54 +01:00			`options.max_readers(1);`
add some settings to the fuzzed milli and use the published version of arbitrary json 2022-01-13 15:35:24 +01:00			`let index = Index::new(options, dir.path())?;`

			`let update_builder = UpdateBuilder::new();`
			`let mut wtxn = index.write_txn().unwrap();`
			`let mut builder = update_builder.settings(&mut wtxn, &index);`

			`let displayed_fields =`
			`["id", "title", "album", "artist", "genre", "country", "released", "duration"]`
			`.iter()`
			`.map(\|s\| s.to_string())`
			`.collect();`
			`builder.set_displayed_fields(displayed_fields);`

			`let searchable_fields = ["title", "album", "artist"].iter().map(\|s\| s.to_string()).collect();`
			`builder.set_searchable_fields(searchable_fields);`

			`let faceted_fields: HashSet<String> =`
			`["released-timestamp", "duration-float", "genre", "country", "artist"]`
			`.iter()`
			`.map(\|s\| s.to_string())`
			`.collect();`
			`builder.set_filterable_fields(faceted_fields.clone());`
			`builder.set_sortable_fields(faceted_fields);`

			`builder.set_distinct_field("same".to_string());`

			`builder.execute(\|_\| ()).unwrap();`
			`wtxn.commit().unwrap();`

			`Ok(index)`
add a fuzzer on milli 2022-01-12 17:57:54 +01:00			`}`

			`fuzz_target!(\|batches: Vec<Vec<ArbitraryValue>>\| {`
			`if let Ok(mut index) = create_index() {`
			`for batch in batches {`
			`let documents: Vec<Value> =`
			`batch.into_iter().map(\|value\| serde_json::Value::from(value)).collect();`
			`let json = Value::Array(documents);`
			`let json = serde_json::to_string(&json).unwrap();`

			`let mut documents = Cursor::new(Vec::new());`

			`// We ignore all badly generated documents`
			`if let Ok(_count) = read_json(json.as_bytes(), &mut documents) {`
add some settings to the fuzzed milli and use the published version of arbitrary json 2022-01-13 15:35:24 +01:00			`documents.rewind().unwrap();`
add a fuzzer on milli 2022-01-12 17:57:54 +01:00			`let documents = DocumentBatchReader::from_reader(documents).unwrap();`
			`match index_documents(&mut index, documents) {`
			`_ => (),`
			`}`
			`}`
			`}`

			`index.prepare_for_closing().wait();`
			`}`
			`});`