Fix the tests for the new DocumentsBatchBuilder/Reader

This commit is contained in:
Kerollmops 2022-06-14 16:04:27 +02:00
parent 419ce3966c
commit e8297ad27e
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
9 changed files with 292 additions and 374 deletions

View File

@ -183,7 +183,8 @@ mod test {
use serde_json::{json, Map};
use super::*;
use crate::documents::DocumentBatchReader;
use crate::documents::DocumentsBatchReader;
use crate::FieldId;
fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
let mut map = Map::new();
@ -192,7 +193,7 @@ mod test {
let field_name = index.name(fid).unwrap().clone();
let value: Value = serde_json::from_slice(value).unwrap();
map.insert(field_name, value);
map.insert(field_name.to_string(), value);
}
Value::Object(map)
@ -200,15 +201,13 @@ mod test {
#[test]
fn add_single_documents_json() {
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let json = serde_json::json!({
"id": 1,
"field": "hello!",
});
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(json.as_object().unwrap()).unwrap();
let json = serde_json::json!({
"blabla": false,
@ -216,100 +215,64 @@ mod test {
"id": 1,
});
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
builder.append_json_object(json.as_object().unwrap()).unwrap();
assert_eq!(builder.len(), 2);
assert_eq!(builder.documents_count(), 2);
let vector = builder.into_inner().unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
assert_eq!(index.len(), 3);
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2);
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3);
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 3);
assert!(reader.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_documents_seq_json() {
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let json = serde_json::json!([{
"id": 1,
"field": "hello!",
},{
"blabla": false,
"field": "hello!",
"id": 1,
}
]);
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
assert_eq!(builder.len(), 2);
builder.finish().unwrap();
cursor.set_position(0);
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3);
assert_eq!(document.iter().count(), 2);
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3);
assert_eq!(document.iter().count(), 3);
assert!(reader.next_document_with_index().unwrap().is_none());
assert!(cursor.next_document().unwrap().is_none());
}
#[test]
fn add_documents_csv() {
let mut cursor = Cursor::new(Vec::new());
let csv_content = "id:number,field:string\n1,hello!\n2,blabla";
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let csv = "id:number,field:string\n1,hello!\n2,blabla";
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
assert_eq!(builder.documents_count(), 2);
let vector = builder.into_inner().unwrap();
let builder =
DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
assert_eq!(index.len(), 2);
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2);
let (_index, document) = reader.next_document_with_index().unwrap().unwrap();
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2);
assert!(reader.next_document_with_index().unwrap().is_none());
assert!(cursor.next_document().unwrap().is_none());
}
#[test]
fn simple_csv_document() {
let documents = r#"city,country,pop
let csv_content = r#"city,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -320,22 +283,25 @@ mod test {
})
);
assert!(reader.next_document_with_index().unwrap().is_none());
assert!(cursor.next_document().unwrap().is_none());
}
#[test]
fn coma_in_field() {
let documents = r#"city,country,pop
let csv_content = r#"city,country,pop
"Boston","United, States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -349,17 +315,20 @@ mod test {
#[test]
fn quote_in_field() {
let documents = r#"city,country,pop
let csv_content = r#"city,country,pop
"Boston","United"" States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -373,17 +342,20 @@ mod test {
#[test]
fn integer_in_field() {
let documents = r#"city,country,pop:number
let csv_content = r#"city,country,pop:number
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -397,17 +369,20 @@ mod test {
#[test]
fn float_in_field() {
let documents = r#"city,country,pop:number
let csv_content = r#"city,country,pop:number
"Boston","United States","4628910.01""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -421,17 +396,20 @@ mod test {
#[test]
fn several_colon_in_header() {
let documents = r#"city:love:string,country:state,pop
let csv_content = r#"city:love:string,country:state,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -445,17 +423,20 @@ mod test {
#[test]
fn ending_by_colon_in_header() {
let documents = r#"city:,country,pop
let csv_content = r#"city:,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -469,17 +450,20 @@ mod test {
#[test]
fn starting_by_colon_in_header() {
let documents = r#":city,country,pop
let csv_content = r#":city,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -494,32 +478,36 @@ mod test {
#[ignore]
#[test]
fn starting_by_colon_in_header2() {
let documents = r#":string,country,pop
let csv_content = r#":string,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
assert!(reader.next_document_with_index().is_err());
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
assert!(cursor.next_document().is_err());
}
#[test]
fn double_colon_in_header() {
let documents = r#"city::string,country,pop
let csv_content = r#"city::string,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -533,34 +521,32 @@ mod test {
#[test]
fn bad_type_in_header() {
let documents = r#"city,country:number,pop
let csv_content = r#"city,country:number,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
assert!(
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!(builder.append_csv(csv).is_err());
}
#[test]
fn bad_column_count1() {
let documents = r#"city,country,pop
"Boston","United States","4628910", "too much""#;
let csv_content = r#"city,country,pop
"Boston","United States","4628910", "too much
let csv = csv::Reader::from_reader(Cursor::new(csv_content"#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
assert!(
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!(builder.append_csv(csv).is_err());
}
#[test]
fn bad_column_count2() {
let documents = r#"city,country,pop
let csv_content = r#"city,country,pop
"Boston","United States""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
assert!(
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!(builder.append_csv(csv).is_err());
}
}

View File

@ -159,7 +159,7 @@ mod test {
#[test]
fn create_documents_no_errors() {
let json = json!({
let value = json!({
"number": 1,
"string": "this is a field",
"array": ["an", "array"],
@ -169,26 +169,17 @@ mod test {
"bool": true
});
let json = serde_json::to_vec(&json).unwrap();
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(json)).unwrap();
builder.finish().unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(value.as_object().unwrap()).unwrap();
let vector = builder.into_inner().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
assert_eq!(documents.index().iter().count(), 5);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 5);
assert!(documents.next_document_with_index().unwrap().is_none());
assert_eq!(documents.documents_batch_index().iter().count(), 5);
let reader = documents.next_document().unwrap().unwrap();
assert_eq!(reader.iter().count(), 5);
assert!(documents.next_document().unwrap().is_none());
}
#[test]
@ -200,101 +191,55 @@ mod test {
"toto": false,
});
let doc1 = serde_json::to_vec(&doc1).unwrap();
let doc2 = serde_json::to_vec(&doc2).unwrap();
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(doc1)).unwrap();
builder.extend_from_json(Cursor::new(doc2)).unwrap();
builder.finish().unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(doc1.as_object().unwrap()).unwrap();
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
let vector = builder.into_inner().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
assert_eq!(documents.index().iter().count(), 2);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 1);
assert!(documents.next_document_with_index().unwrap().is_some());
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_documents_array() {
let docs = json!([
{ "toto": false },
{ "tata": "hello" },
]);
let docs = serde_json::to_vec(&docs).unwrap();
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(docs)).unwrap();
builder.finish().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
assert_eq!(documents.index().iter().count(), 2);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 1);
assert!(documents.next_document_with_index().unwrap().is_some());
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_invalid_document_format() {
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let docs = json!([[
{ "toto": false },
{ "tata": "hello" },
]]);
let docs = serde_json::to_vec(&docs).unwrap();
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
let docs = json!("hello");
let docs = serde_json::to_vec(&docs).unwrap();
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor();
assert_eq!(documents.documents_batch_index().iter().count(), 2);
let reader = documents.next_document().unwrap().unwrap();
assert_eq!(reader.iter().count(), 1);
assert!(documents.next_document().unwrap().is_some());
assert!(documents.next_document().unwrap().is_none());
}
#[test]
fn test_nested() {
let mut docs = documents!([{
let docs_reader = documents!([{
"hello": {
"toto": ["hello"]
}
}]);
let (_index, doc) = docs.next_document_with_index().unwrap().unwrap();
let mut cursor = docs_reader.into_cursor();
let doc = cursor.next_document().unwrap().unwrap();
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
assert_eq!(nested, json!({ "toto": ["hello"] }));
}
#[test]
fn out_of_order_fields() {
fn out_of_order_json_fields() {
let _documents = documents!([
{"id": 1,"b": 0},
{"id": 2,"a": 0,"b": 0},
]);
}
#[test]
fn out_of_order_csv_fields() {
let csv1_content = "id:number,b\n1,0";
let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
let csv2_content = "id:number,a,b\n2,0,0";
let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content));
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv1).unwrap();
builder.append_csv(csv2).unwrap();
let vector = builder.into_inner().unwrap();
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
}
}

View File

@ -35,7 +35,7 @@ mod test {
use roaring::RoaringBitmap;
use serde_json::{json, Value};
use crate::documents::{DocumentBatchBuilder, DocumentBatchReader};
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use crate::index::Index;
use crate::update::{
@ -43,14 +43,11 @@ mod test {
};
use crate::{DocumentId, FieldId, BEU32};
static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents);
fn generate_documents() -> Vec<u8> {
static JSON: Lazy<Vec<u8>> = Lazy::new(|| {
let mut rng = rand::thread_rng();
let num_docs = rng.gen_range(10..30);
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
let txts = ["Toto", "Titi", "Tata"];
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
let cat_ints = (1..10).collect::<Vec<_>>();
@ -63,7 +60,7 @@ mod test {
let mut sample_ints = cat_ints.clone();
sample_ints.shuffle(&mut rng);
let doc = json!({
let json = json!({
"id": i,
"txt": txt,
"cat-int": rng.gen_range(0..3),
@ -71,13 +68,16 @@ mod test {
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
});
let doc = Cursor::new(serde_json::to_vec(&doc).unwrap());
builder.extend_from_json(doc).unwrap();
let object = match json {
Value::Object(object) => object,
_ => panic!(),
};
builder.append_json_object(&object).unwrap();
}
builder.finish().unwrap();
cursor.into_inner()
}
builder.into_inner().unwrap()
});
/// Returns a temporary index populated with random test documents, the FieldId for the
/// distinct attribute, and the RoaringBitmap with the document ids.
@ -101,7 +101,8 @@ mod test {
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
let reader =
crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice()))
.unwrap();
addition.add_documents(reader).unwrap();
addition.execute().unwrap();
@ -109,8 +110,8 @@ mod test {
let fields_map = index.fields_ids_map(&txn).unwrap();
let fid = fields_map.id(&distinct).unwrap();
let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
let map = (0..documents.len() as u32).collect();
let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap();
let map = (0..documents.documents_count() as u32).collect();
txn.commit().unwrap();

View File

@ -25,7 +25,7 @@ pub use self::helpers::{
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::DocumentBatchReader;
use crate::documents::DocumentsBatchReader;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
@ -121,7 +121,7 @@ where
/// builder, and the builder must be discarded.
///
/// Returns the number of documents added to the builder.
pub fn add_documents<R>(&mut self, reader: DocumentBatchReader<R>) -> Result<u64>
pub fn add_documents<R>(&mut self, reader: DocumentsBatchReader<R>) -> Result<u64>
where
R: Read + Seek,
{
@ -590,9 +590,8 @@ mod tests {
use maplit::hashset;
use super::*;
use crate::documents::DocumentBatchBuilder;
use crate::documents::DocumentsBatchBuilder;
use crate::update::DeleteDocuments;
use crate::HashMap;
#[test]
fn simple_document_replacement() {
@ -1252,21 +1251,17 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut big_object = HashMap::new();
big_object.insert(S("id"), "wow");
let mut big_object = serde_json::Map::new();
big_object.insert(S("id"), serde_json::Value::from("wow"));
for i in 0..1000 {
let key = i.to_string();
big_object.insert(key, "I am a text!");
big_object.insert(key, serde_json::Value::from("I am a text!"));
}
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap());
builder.extend_from_json(big_object).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(&big_object).unwrap();
let vector = builder.into_inner().unwrap();
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
@ -1288,23 +1283,19 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut big_object = HashMap::new();
big_object.insert(S("id"), "wow");
let mut big_object = serde_json::Map::new();
big_object.insert(S("id"), serde_json::Value::from("wow"));
let content: String = (0..=u16::MAX)
.into_iter()
.map(|p| p.to_string())
.reduce(|a, b| a + " " + b.as_ref())
.unwrap();
big_object.insert("content".to_string(), &content);
big_object.insert("content".to_string(), serde_json::Value::from(content));
let mut cursor = Cursor::new(Vec::new());
let big_object = serde_json::to_string(&big_object).unwrap();
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(&big_object).unwrap();
let vector = builder.into_inner().unwrap();
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
@ -1843,18 +1834,20 @@ mod tests {
// Create 200 documents with a long text
let content = {
let documents: Vec<_> = (0..200i32)
let documents_iter = (0..200i32)
.into_iter()
.map(|i| serde_json::json!({ "id": i, "script": script }))
.collect();
.filter_map(|json| match json {
serde_json::Value::Object(object) => Some(object),
_ => None,
});
let mut writer = std::io::Cursor::new(Vec::new());
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
let documents = serde_json::to_vec(&documents).unwrap();
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
builder.finish().unwrap();
writer.set_position(0);
crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
for object in documents_iter {
builder.append_json_object(&object).unwrap();
}
let vector = builder.into_inner().unwrap();
crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap()
};
// Index those 200 long documents

View File

@ -14,7 +14,7 @@ use smartstring::SmartString;
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentBatchReader, DocumentsBatchIndex};
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
@ -152,7 +152,7 @@ impl<'a, 'i> Transform<'a, 'i> {
pub fn read_documents<R, F>(
&mut self,
mut reader: DocumentBatchReader<R>,
reader: DocumentsBatchReader<R>,
wtxn: &mut heed::RwTxn,
progress_callback: F,
) -> Result<usize>
@ -160,7 +160,8 @@ impl<'a, 'i> Transform<'a, 'i> {
R: Read + Seek,
F: Fn(UpdateIndexingStep) + Sync,
{
let fields_index = reader.index();
let mut cursor = reader.into_cursor();
let fields_index = cursor.documents_batch_index();
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
@ -186,7 +187,8 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0;
let mut external_id_buffer = Vec::new();
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
while let Some((addition_index, document)) = reader.next_document_with_index()? {
let addition_index = cursor.documents_batch_index().clone();
while let Some(document) = cursor.next_document()? {
let mut field_buffer_cache = drop_and_reuse(field_buffer);
if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
@ -840,7 +842,7 @@ fn update_primary_key<'a>(
None => {
let mut json = Map::new();
for (key, value) in document.iter() {
let key = addition_index.name(key).cloned();
let key = addition_index.name(key).map(ToString::to_string);
let value = serde_json::from_slice::<Value>(&value).ok();
if let Some((k, v)) = key.zip(value) {

View File

@ -3,9 +3,10 @@ use std::io::Cursor;
use big_s::S;
use heed::EnvOpenOptions;
use maplit::hashset;
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{FacetDistribution, Index};
use serde_json::{Deserializer, Map, Value};
#[test]
fn test_facet_distribution_with_no_facet_values() {
@ -30,35 +31,30 @@ fn test_facet_distribution_with_no_facet_values() {
let mut builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
let mut cursor = Cursor::new(Vec::new());
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
let reader = Cursor::new(
r#"[
{
r#"{
"id": 123,
"title": "What a week, hu...",
"genres": [],
"tags": ["blue"]
},
}
{
"id": 345,
"title": "I am the pig!",
"tags": ["red"]
}
]"#,
}"#,
);
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
documents_builder.extend_from_json(doc).unwrap();
for result in Deserializer::from_reader(reader).into_iter::<Map<String, Value>>() {
let object = result.unwrap();
documents_builder.append_json_object(&object).unwrap();
}
documents_builder.finish().unwrap();
cursor.set_position(0);
let vector = documents_builder.into_inner().unwrap();
// index documents
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
builder.add_documents(content).unwrap();
builder.execute().unwrap();

View File

@ -6,10 +6,11 @@ use big_s::S;
use either::{Either, Left, Right};
use heed::EnvOpenOptions;
use maplit::{hashmap, hashset};
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{AscDesc, Criterion, DocumentId, Index, Member};
use serde::Deserialize;
use serde_json::{Deserializer, Map, Value};
use slice_group_by::GroupBy;
mod distinct;
@ -62,21 +63,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let mut builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
let mut cursor = Cursor::new(Vec::new());
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
let reader = Cursor::new(CONTENT.as_bytes());
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
documents_builder.extend_from_json(doc).unwrap();
for result in Deserializer::from_reader(reader).into_iter::<Map<String, Value>>() {
let object = result.unwrap();
documents_builder.append_json_object(&object).unwrap();
}
documents_builder.finish().unwrap();
cursor.set_position(0);
let vector = documents_builder.into_inner().unwrap();
// index documents
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
builder.add_documents(content).unwrap();
builder.execute().unwrap();

View File

@ -5,7 +5,7 @@ use big_s::S;
use heed::EnvOpenOptions;
use itertools::Itertools;
use maplit::hashset;
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult};
use rand::Rng;
@ -393,8 +393,7 @@ fn criteria_ascdesc() {
let mut builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
let mut cursor = Cursor::new(Vec::new());
let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let mut batch_builder = DocumentsBatchBuilder::new(Vec::new());
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
let mut rng = rand::thread_rng();
@ -412,16 +411,17 @@ fn criteria_ascdesc() {
"age": age,
});
let json = Cursor::new(serde_json::to_vec(&json).unwrap());
batch_builder.extend_from_json(json).unwrap();
let object = match json {
serde_json::Value::Object(object) => object,
_ => panic!(),
};
batch_builder.append_json_object(&object).unwrap();
});
batch_builder.finish().unwrap();
cursor.set_position(0);
let reader = DocumentBatchReader::from_reader(cursor).unwrap();
let vector = batch_builder.into_inner().unwrap();
let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
builder.add_documents(reader).unwrap();
builder.execute().unwrap();

View File

@ -106,26 +106,23 @@ fn test_typo_disabled_on_word() {
options.map_size(4096 * 100);
let index = Index::new(options, tmp.path()).unwrap();
let documents = json!([
{
"id": 1usize,
"data": "zealand",
},
{
"id": 2usize,
"data": "zearand",
},
]);
let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new());
let doc1 = json!({
"id": 1usize,
"data": "zealand",
});
let mut writer = std::io::Cursor::new(Vec::new());
let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
let documents = serde_json::to_vec(&documents).unwrap();
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
builder.finish().unwrap();
let doc2 = json!({
"id": 2usize,
"data": "zearand",
});
writer.set_position(0);
builder.append_json_object(doc1.as_object().unwrap()).unwrap();
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
let vector = builder.into_inner().unwrap();
let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
let documents =
milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap();
let mut txn = index.write_txn().unwrap();
let config = IndexerConfig::default();