2022-01-12 17:57:54 +01:00
|
|
|
#![no_main]
|
|
|
|
|
2022-01-13 15:35:24 +01:00
|
|
|
use std::collections::HashSet;
|
2022-01-12 17:57:54 +01:00
|
|
|
use std::io::{BufWriter, Cursor, Read, Seek, Write};
|
|
|
|
|
|
|
|
use anyhow::{bail, Result};
|
|
|
|
use arbitrary_json::ArbitraryValue;
|
|
|
|
use heed::EnvOpenOptions;
|
|
|
|
use libfuzzer_sys::fuzz_target;
|
2022-06-14 16:22:59 +02:00
|
|
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
2022-01-25 12:08:47 +01:00
|
|
|
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
2022-01-12 17:57:54 +01:00
|
|
|
use milli::Index;
|
2022-06-14 16:22:59 +02:00
|
|
|
use serde_json::{Map, Value};
|
2022-01-12 17:57:54 +01:00
|
|
|
|
|
|
|
// Registers jemalloc as the global allocator for this fuzz target.
// The `cfg` attribute restricts this to Linux builds; other targets keep
// the default allocator.
#[cfg(target_os = "linux")]
|
|
|
|
#[global_allocator]
|
|
|
|
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
|
|
|
|
|
|
|
/// reads json from input and write an obkv batch to writer.
|
|
|
|
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
|
|
|
let writer = BufWriter::new(writer);
|
2022-06-14 16:22:59 +02:00
|
|
|
let mut builder = DocumentsBatchBuilder::new(writer);
|
2022-01-12 17:57:54 +01:00
|
|
|
|
2022-06-15 15:36:27 +02:00
|
|
|
let values: Vec<Object> = serde_json::from_reader(input)?;
|
2022-06-14 16:22:59 +02:00
|
|
|
if builder.documents_count() == 0 {
|
2022-01-12 17:57:54 +01:00
|
|
|
bail!("Empty payload");
|
|
|
|
}
|
|
|
|
|
2022-06-14 16:22:59 +02:00
|
|
|
for object in values {
|
|
|
|
builder.append_json_object(&object)?;
|
|
|
|
}
|
|
|
|
|
|
|
|
let count = builder.documents_count();
|
|
|
|
let vector = builder.into_inner()?;
|
2022-01-12 17:57:54 +01:00
|
|
|
|
2022-06-14 16:22:59 +02:00
|
|
|
Ok(count as usize)
|
2022-01-12 17:57:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fn index_documents(
|
|
|
|
index: &mut milli::Index,
|
2022-06-14 16:22:59 +02:00
|
|
|
documents: DocumentsBatchReader<Cursor<Vec<u8>>>,
|
2022-01-12 17:57:54 +01:00
|
|
|
) -> Result<()> {
|
2022-01-25 12:08:47 +01:00
|
|
|
let config = IndexerConfig::default();
|
2022-01-12 17:57:54 +01:00
|
|
|
let mut wtxn = index.write_txn()?;
|
|
|
|
|
2022-01-25 12:08:47 +01:00
|
|
|
let indexing_config = IndexDocumentsConfig::default();
|
2022-04-25 18:32:06 +02:00
|
|
|
let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())?;
|
2022-01-25 12:08:47 +01:00
|
|
|
builder.add_documents(documents)?;
|
|
|
|
builder.execute().unwrap();
|
|
|
|
|
2022-01-12 17:57:54 +01:00
|
|
|
wtxn.commit()?;
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
|
|
|
fn create_index() -> Result<milli::Index> {
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
let mut options = EnvOpenOptions::new();
|
2022-01-13 15:35:24 +01:00
|
|
|
options.map_size(10 * 1024 * 1024 * 1024); // 10 GB
|
2022-01-12 17:57:54 +01:00
|
|
|
options.max_readers(1);
|
2022-01-13 15:35:24 +01:00
|
|
|
let index = Index::new(options, dir.path())?;
|
|
|
|
|
2022-01-25 12:08:47 +01:00
|
|
|
let config = IndexerConfig::default();
|
2022-01-13 15:35:24 +01:00
|
|
|
let mut wtxn = index.write_txn().unwrap();
|
2022-01-25 12:08:47 +01:00
|
|
|
|
|
|
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
2022-01-13 15:35:24 +01:00
|
|
|
|
|
|
|
let displayed_fields =
|
|
|
|
["id", "title", "album", "artist", "genre", "country", "released", "duration"]
|
|
|
|
.iter()
|
|
|
|
.map(|s| s.to_string())
|
|
|
|
.collect();
|
|
|
|
builder.set_displayed_fields(displayed_fields);
|
|
|
|
|
|
|
|
let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
|
|
|
|
builder.set_searchable_fields(searchable_fields);
|
|
|
|
|
|
|
|
let faceted_fields: HashSet<String> =
|
|
|
|
["released-timestamp", "duration-float", "genre", "country", "artist"]
|
|
|
|
.iter()
|
|
|
|
.map(|s| s.to_string())
|
|
|
|
.collect();
|
|
|
|
builder.set_filterable_fields(faceted_fields.clone());
|
|
|
|
builder.set_sortable_fields(faceted_fields);
|
|
|
|
|
|
|
|
builder.set_distinct_field("same".to_string());
|
|
|
|
|
|
|
|
builder.execute(|_| ()).unwrap();
|
|
|
|
wtxn.commit().unwrap();
|
|
|
|
|
|
|
|
Ok(index)
|
2022-01-12 17:57:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Fuzzing entry point: each inner `Vec` is one batch of arbitrary JSON values
// that is serialized, converted to a documents batch and indexed into a
// freshly created index.
fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
    // When the index cannot be created the input is skipped altogether.
    if let Ok(mut index) = create_index() {
        for batch in batches {
            // Serialize the whole batch as a single JSON array.
            let payload = Value::Array(batch.into_iter().map(Value::from).collect());
            let payload = serde_json::to_string(&payload).unwrap();

            let mut buffer = Cursor::new(Vec::new());

            // We ignore all malformed documents
            if read_json(payload.as_bytes(), &mut buffer).is_err() {
                continue;
            }

            // Go back to the start of the buffer so the reader sees what was just written.
            buffer.rewind().unwrap();
            let reader = DocumentsBatchReader::from_reader(buffer).unwrap();

            // A lot of errors can come out of milli and we don't know which ones are normal
            // or not, so we are only going to look for the unexpected panics.
            let _ = index_documents(&mut index, reader);
        }

        index.prepare_for_closing().wait();
    }
});
|