Fix the tests for the new DocumentsBatchBuilder/Reader

This commit is contained in:
Kerollmops 2022-06-14 16:04:27 +02:00
parent 419ce3966c
commit e8297ad27e
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
9 changed files with 292 additions and 374 deletions

View file

@ -183,7 +183,8 @@ mod test {
use serde_json::{json, Map};
use super::*;
use crate::documents::DocumentBatchReader;
use crate::documents::DocumentsBatchReader;
use crate::FieldId;
fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
let mut map = Map::new();
@ -192,7 +193,7 @@ mod test {
let field_name = index.name(fid).unwrap().clone();
let value: Value = serde_json::from_slice(value).unwrap();
map.insert(field_name, value);
map.insert(field_name.to_string(), value);
}
Value::Object(map)
@ -200,15 +201,13 @@ mod test {
#[test]
fn add_single_documents_json() {
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let json = serde_json::json!({
"id": 1,
"field": "hello!",
});
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(json.as_object().unwrap()).unwrap();
let json = serde_json::json!({
"blabla": false,
@ -216,100 +215,64 @@ mod test {
"id": 1,
});
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
builder.append_json_object(json.as_object().unwrap()).unwrap();
assert_eq!(builder.len(), 2);
assert_eq!(builder.documents_count(), 2);
let vector = builder.into_inner().unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
assert_eq!(index.len(), 3);
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2);
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3);
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 3);
assert!(reader.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_documents_seq_json() {
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let json = serde_json::json!([{
"id": 1,
"field": "hello!",
},{
"blabla": false,
"field": "hello!",
"id": 1,
}
]);
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
assert_eq!(builder.len(), 2);
builder.finish().unwrap();
cursor.set_position(0);
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3);
assert_eq!(document.iter().count(), 2);
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3);
assert_eq!(document.iter().count(), 3);
assert!(reader.next_document_with_index().unwrap().is_none());
assert!(cursor.next_document().unwrap().is_none());
}
#[test]
fn add_documents_csv() {
let mut cursor = Cursor::new(Vec::new());
let csv_content = "id:number,field:string\n1,hello!\n2,blabla";
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let csv = "id:number,field:string\n1,hello!\n2,blabla";
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
assert_eq!(builder.documents_count(), 2);
let vector = builder.into_inner().unwrap();
let builder =
DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
assert_eq!(index.len(), 2);
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2);
let (_index, document) = reader.next_document_with_index().unwrap().unwrap();
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2);
assert!(reader.next_document_with_index().unwrap().is_none());
assert!(cursor.next_document().unwrap().is_none());
}
#[test]
fn simple_csv_document() {
let documents = r#"city,country,pop
let csv_content = r#"city,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -320,22 +283,25 @@ mod test {
})
);
assert!(reader.next_document_with_index().unwrap().is_none());
assert!(cursor.next_document().unwrap().is_none());
}
#[test]
fn coma_in_field() {
let documents = r#"city,country,pop
let csv_content = r#"city,country,pop
"Boston","United, States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -349,17 +315,20 @@ mod test {
#[test]
fn quote_in_field() {
let documents = r#"city,country,pop
let csv_content = r#"city,country,pop
"Boston","United"" States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -373,17 +342,20 @@ mod test {
#[test]
fn integer_in_field() {
let documents = r#"city,country,pop:number
let csv_content = r#"city,country,pop:number
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -397,17 +369,20 @@ mod test {
#[test]
fn float_in_field() {
let documents = r#"city,country,pop:number
let csv_content = r#"city,country,pop:number
"Boston","United States","4628910.01""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -421,17 +396,20 @@ mod test {
#[test]
fn several_colon_in_header() {
let documents = r#"city:love:string,country:state,pop
let csv_content = r#"city:love:string,country:state,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -445,17 +423,20 @@ mod test {
#[test]
fn ending_by_colon_in_header() {
let documents = r#"city:,country,pop
let csv_content = r#"city:,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -469,17 +450,20 @@ mod test {
#[test]
fn starting_by_colon_in_header() {
let documents = r#":city,country,pop
let csv_content = r#":city,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -494,32 +478,36 @@ mod test {
#[ignore]
#[test]
fn starting_by_colon_in_header2() {
let documents = r#":string,country,pop
let csv_content = r#":string,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
assert!(reader.next_document_with_index().is_err());
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
assert!(cursor.next_document().is_err());
}
#[test]
fn double_colon_in_header() {
let documents = r#"city::string,country,pop
let csv_content = r#"city::string,country,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
.unwrap()
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
let val = obkv_to_value(&doc, index);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
let vector = builder.into_inner().unwrap();
let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let index = cursor.documents_batch_index().clone();
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!(
val,
@ -533,34 +521,32 @@ mod test {
#[test]
fn bad_type_in_header() {
let documents = r#"city,country:number,pop
let csv_content = r#"city,country:number,pop
"Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
assert!(
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!(builder.append_csv(csv).is_err());
}
#[test]
fn bad_column_count1() {
let documents = r#"city,country,pop
"Boston","United States","4628910", "too much""#;
let csv_content = r#"city,country,pop
"Boston","United States","4628910", "too much""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
assert!(
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!(builder.append_csv(csv).is_err());
}
#[test]
fn bad_column_count2() {
let documents = r#"city,country,pop
let csv_content = r#"city,country,pop
"Boston","United States""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new();
assert!(
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!(builder.append_csv(csv).is_err());
}
}

View file

@ -159,7 +159,7 @@ mod test {
#[test]
fn create_documents_no_errors() {
let json = json!({
let value = json!({
"number": 1,
"string": "this is a field",
"array": ["an", "array"],
@ -169,26 +169,17 @@ mod test {
"bool": true
});
let json = serde_json::to_vec(&json).unwrap();
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(json)).unwrap();
builder.finish().unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(value.as_object().unwrap()).unwrap();
let vector = builder.into_inner().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
assert_eq!(documents.index().iter().count(), 5);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 5);
assert!(documents.next_document_with_index().unwrap().is_none());
assert_eq!(documents.documents_batch_index().iter().count(), 5);
let reader = documents.next_document().unwrap().unwrap();
assert_eq!(reader.iter().count(), 5);
assert!(documents.next_document().unwrap().is_none());
}
#[test]
@ -200,101 +191,55 @@ mod test {
"toto": false,
});
let doc1 = serde_json::to_vec(&doc1).unwrap();
let doc2 = serde_json::to_vec(&doc2).unwrap();
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(doc1)).unwrap();
builder.extend_from_json(Cursor::new(doc2)).unwrap();
builder.finish().unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(doc1.as_object().unwrap()).unwrap();
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
let vector = builder.into_inner().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
assert_eq!(documents.index().iter().count(), 2);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 1);
assert!(documents.next_document_with_index().unwrap().is_some());
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_documents_array() {
let docs = json!([
{ "toto": false },
{ "tata": "hello" },
]);
let docs = serde_json::to_vec(&docs).unwrap();
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(docs)).unwrap();
builder.finish().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
assert_eq!(documents.index().iter().count(), 2);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 1);
assert!(documents.next_document_with_index().unwrap().is_some());
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_invalid_document_format() {
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let docs = json!([[
{ "toto": false },
{ "tata": "hello" },
]]);
let docs = serde_json::to_vec(&docs).unwrap();
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
let docs = json!("hello");
let docs = serde_json::to_vec(&docs).unwrap();
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor();
assert_eq!(documents.documents_batch_index().iter().count(), 2);
let reader = documents.next_document().unwrap().unwrap();
assert_eq!(reader.iter().count(), 1);
assert!(documents.next_document().unwrap().is_some());
assert!(documents.next_document().unwrap().is_none());
}
#[test]
fn test_nested() {
let mut docs = documents!([{
let docs_reader = documents!([{
"hello": {
"toto": ["hello"]
}
}]);
let (_index, doc) = docs.next_document_with_index().unwrap().unwrap();
let mut cursor = docs_reader.into_cursor();
let doc = cursor.next_document().unwrap().unwrap();
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
assert_eq!(nested, json!({ "toto": ["hello"] }));
}
#[test]
fn out_of_order_fields() {
fn out_of_order_json_fields() {
let _documents = documents!([
{"id": 1,"b": 0},
{"id": 2,"a": 0,"b": 0},
]);
}
// Appends two CSV payloads to the same builder, the first with header
// `id:number,b` and the second with header `id:number,a,b`, then checks that
// the bytes produced by the builder are accepted by
// `DocumentsBatchReader::from_reader`.
#[test]
fn out_of_order_csv_fields() {
    let mut builder = DocumentsBatchBuilder::new(Vec::new());

    // Feed the payloads in order; each one gets its own CSV reader.
    for content in ["id:number,b\n1,0", "id:number,a,b\n2,0,0"] {
        builder.append_csv(csv::Reader::from_reader(Cursor::new(content))).unwrap();
    }

    // Only the success of building and re-opening the batch is checked.
    let bytes = builder.into_inner().unwrap();
    DocumentsBatchReader::from_reader(Cursor::new(bytes)).unwrap();
}
}