Fix the tests for the new DocumentsBatchBuilder/Reader

This commit is contained in:
Kerollmops 2022-06-14 16:04:27 +02:00
parent 419ce3966c
commit e8297ad27e
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
9 changed files with 292 additions and 374 deletions

View File

@ -183,7 +183,8 @@ mod test {
use serde_json::{json, Map}; use serde_json::{json, Map};
use super::*; use super::*;
use crate::documents::DocumentBatchReader; use crate::documents::DocumentsBatchReader;
use crate::FieldId;
fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value { fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
let mut map = Map::new(); let mut map = Map::new();
@ -192,7 +193,7 @@ mod test {
let field_name = index.name(fid).unwrap().clone(); let field_name = index.name(fid).unwrap().clone();
let value: Value = serde_json::from_slice(value).unwrap(); let value: Value = serde_json::from_slice(value).unwrap();
map.insert(field_name, value); map.insert(field_name.to_string(), value);
} }
Value::Object(map) Value::Object(map)
@ -200,15 +201,13 @@ mod test {
#[test] #[test]
fn add_single_documents_json() { fn add_single_documents_json() {
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let json = serde_json::json!({ let json = serde_json::json!({
"id": 1, "id": 1,
"field": "hello!", "field": "hello!",
}); });
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(json.as_object().unwrap()).unwrap();
let json = serde_json::json!({ let json = serde_json::json!({
"blabla": false, "blabla": false,
@ -216,100 +215,64 @@ mod test {
"id": 1, "id": 1,
}); });
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); builder.append_json_object(json.as_object().unwrap()).unwrap();
assert_eq!(builder.len(), 2); assert_eq!(builder.documents_count(), 2);
let vector = builder.into_inner().unwrap();
builder.finish().unwrap(); let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
cursor.set_position(0); let index = cursor.documents_batch_index().clone();
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3); assert_eq!(index.len(), 3);
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2); assert_eq!(document.iter().count(), 2);
let (index, document) = reader.next_document_with_index().unwrap().unwrap(); let document = cursor.next_document().unwrap().unwrap();
assert_eq!(index.len(), 3);
assert_eq!(document.iter().count(), 3); assert_eq!(document.iter().count(), 3);
assert!(reader.next_document_with_index().unwrap().is_none()); assert!(cursor.next_document().unwrap().is_none());
}
#[test]
fn add_documents_seq_json() {
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let json = serde_json::json!([{
"id": 1,
"field": "hello!",
},{
"blabla": false,
"field": "hello!",
"id": 1,
}
]);
builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
assert_eq!(builder.len(), 2);
builder.finish().unwrap();
cursor.set_position(0);
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3);
assert_eq!(document.iter().count(), 2);
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 3);
assert_eq!(document.iter().count(), 3);
assert!(reader.next_document_with_index().unwrap().is_none());
} }
#[test] #[test]
fn add_documents_csv() { fn add_documents_csv() {
let mut cursor = Cursor::new(Vec::new()); let csv_content = "id:number,field:string\n1,hello!\n2,blabla";
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let csv = "id:number,field:string\n1,hello!\n2,blabla"; let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
assert_eq!(builder.documents_count(), 2);
let vector = builder.into_inner().unwrap();
let builder = let mut cursor =
DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
builder.finish().unwrap(); let index = cursor.documents_batch_index().clone();
cursor.set_position(0);
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
assert_eq!(index.len(), 2); assert_eq!(index.len(), 2);
let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2); assert_eq!(document.iter().count(), 2);
let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); let document = cursor.next_document().unwrap().unwrap();
assert_eq!(document.iter().count(), 2); assert_eq!(document.iter().count(), 2);
assert!(reader.next_document_with_index().unwrap().is_none()); assert!(cursor.next_document().unwrap().is_none());
} }
#[test] #[test]
fn simple_csv_document() { fn simple_csv_document() {
let documents = r#"city,country,pop let csv_content = r#"city,country,pop
"Boston","United States","4628910""#; "Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index); let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -320,22 +283,25 @@ mod test {
}) })
); );
assert!(reader.next_document_with_index().unwrap().is_none()); assert!(cursor.next_document().unwrap().is_none());
} }
#[test] #[test]
fn coma_in_field() { fn coma_in_field() {
let documents = r#"city,country,pop let csv_content = r#"city,country,pop
"Boston","United, States","4628910""#; "Boston","United, States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index);
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -349,17 +315,20 @@ mod test {
#[test] #[test]
fn quote_in_field() { fn quote_in_field() {
let documents = r#"city,country,pop let csv_content = r#"city,country,pop
"Boston","United"" States","4628910""#; "Boston","United"" States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index);
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -373,17 +342,20 @@ mod test {
#[test] #[test]
fn integer_in_field() { fn integer_in_field() {
let documents = r#"city,country,pop:number let csv_content = r#"city,country,pop:number
"Boston","United States","4628910""#; "Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index);
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -397,17 +369,20 @@ mod test {
#[test] #[test]
fn float_in_field() { fn float_in_field() {
let documents = r#"city,country,pop:number let csv_content = r#"city,country,pop:number
"Boston","United States","4628910.01""#; "Boston","United States","4628910.01""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index);
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -421,17 +396,20 @@ mod test {
#[test] #[test]
fn several_colon_in_header() { fn several_colon_in_header() {
let documents = r#"city:love:string,country:state,pop let csv_content = r#"city:love:string,country:state,pop
"Boston","United States","4628910""#; "Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index);
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -445,17 +423,20 @@ mod test {
#[test] #[test]
fn ending_by_colon_in_header() { fn ending_by_colon_in_header() {
let documents = r#"city:,country,pop let csv_content = r#"city:,country,pop
"Boston","United States","4628910""#; "Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index);
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -469,17 +450,20 @@ mod test {
#[test] #[test]
fn starting_by_colon_in_header() { fn starting_by_colon_in_header() {
let documents = r#":city,country,pop let csv_content = r#":city,country,pop
"Boston","United States","4628910""#; "Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index);
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -494,32 +478,36 @@ mod test {
#[ignore] #[ignore]
#[test] #[test]
fn starting_by_colon_in_header2() { fn starting_by_colon_in_header2() {
let documents = r#":string,country,pop let csv_content = r#":string,country,pop
"Boston","United States","4628910""#; "Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap();
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
assert!(reader.next_document_with_index().is_err()); let mut cursor =
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
assert!(cursor.next_document().is_err());
} }
#[test] #[test]
fn double_colon_in_header() { fn double_colon_in_header() {
let documents = r#"city::string,country,pop let csv_content = r#"city::string,country,pop
"Boston","United States","4628910""#; "Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) builder.append_csv(csv).unwrap();
.unwrap() let vector = builder.into_inner().unwrap();
.finish()
.unwrap(); let mut cursor =
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); let index = cursor.documents_batch_index().clone();
let val = obkv_to_value(&doc, index);
let doc = cursor.next_document().unwrap().unwrap();
let val = obkv_to_value(&doc, &index);
assert_eq!( assert_eq!(
val, val,
@ -533,34 +521,32 @@ mod test {
#[test] #[test]
fn bad_type_in_header() { fn bad_type_in_header() {
let documents = r#"city,country:number,pop let csv_content = r#"city,country:number,pop
"Boston","United States","4628910""#; "Boston","United States","4628910""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!( assert!(builder.append_csv(csv).is_err());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
} }
#[test] #[test]
fn bad_column_count1() { fn bad_column_count1() {
let documents = r#"city,country,pop let csv_content = r#"city,country,pop
"Boston","United States","4628910", "too much""#; "Boston","United States","4628910", "too much""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!( assert!(builder.append_csv(csv).is_err());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
} }
#[test] #[test]
fn bad_column_count2() { fn bad_column_count2() {
let documents = r#"city,country,pop let csv_content = r#"city,country,pop
"Boston","United States""#; "Boston","United States""#;
let csv = csv::Reader::from_reader(Cursor::new(csv_content));
let mut buf = Vec::new(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
assert!( assert!(builder.append_csv(csv).is_err());
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
);
} }
} }

View File

@ -159,7 +159,7 @@ mod test {
#[test] #[test]
fn create_documents_no_errors() { fn create_documents_no_errors() {
let json = json!({ let value = json!({
"number": 1, "number": 1,
"string": "this is a field", "string": "this is a field",
"array": ["an", "array"], "array": ["an", "array"],
@ -169,26 +169,17 @@ mod test {
"bool": true "bool": true
}); });
let json = serde_json::to_vec(&json).unwrap(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(value.as_object().unwrap()).unwrap();
let mut v = Vec::new(); let vector = builder.into_inner().unwrap();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(json)).unwrap();
builder.finish().unwrap();
let mut documents = let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
assert_eq!(documents.index().iter().count(), 5); assert_eq!(documents.documents_batch_index().iter().count(), 5);
let reader = documents.next_document().unwrap().unwrap();
let reader = documents.next_document_with_index().unwrap().unwrap(); assert_eq!(reader.iter().count(), 5);
assert!(documents.next_document().unwrap().is_none());
assert_eq!(reader.1.iter().count(), 5);
assert!(documents.next_document_with_index().unwrap().is_none());
} }
#[test] #[test]
@ -200,101 +191,55 @@ mod test {
"toto": false, "toto": false,
}); });
let doc1 = serde_json::to_vec(&doc1).unwrap(); let mut builder = DocumentsBatchBuilder::new(Vec::new());
let doc2 = serde_json::to_vec(&doc2).unwrap(); builder.append_json_object(doc1.as_object().unwrap()).unwrap();
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
let mut v = Vec::new(); let vector = builder.into_inner().unwrap();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(doc1)).unwrap();
builder.extend_from_json(Cursor::new(doc2)).unwrap();
builder.finish().unwrap();
let mut documents = let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor();
assert_eq!(documents.documents_batch_index().iter().count(), 2);
assert_eq!(documents.index().iter().count(), 2); let reader = documents.next_document().unwrap().unwrap();
assert_eq!(reader.iter().count(), 1);
let reader = documents.next_document_with_index().unwrap().unwrap(); assert!(documents.next_document().unwrap().is_some());
assert!(documents.next_document().unwrap().is_none());
assert_eq!(reader.1.iter().count(), 1);
assert!(documents.next_document_with_index().unwrap().is_some());
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_documents_array() {
let docs = json!([
{ "toto": false },
{ "tata": "hello" },
]);
let docs = serde_json::to_vec(&docs).unwrap();
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(Cursor::new(docs)).unwrap();
builder.finish().unwrap();
let mut documents =
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
assert_eq!(documents.index().iter().count(), 2);
let reader = documents.next_document_with_index().unwrap().unwrap();
assert_eq!(reader.1.iter().count(), 1);
assert!(documents.next_document_with_index().unwrap().is_some());
assert!(documents.next_document_with_index().unwrap().is_none());
}
#[test]
fn add_invalid_document_format() {
let mut v = Vec::new();
let mut cursor = io::Cursor::new(&mut v);
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let docs = json!([[
{ "toto": false },
{ "tata": "hello" },
]]);
let docs = serde_json::to_vec(&docs).unwrap();
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
let docs = json!("hello");
let docs = serde_json::to_vec(&docs).unwrap();
assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
} }
#[test] #[test]
fn test_nested() { fn test_nested() {
let mut docs = documents!([{ let docs_reader = documents!([{
"hello": { "hello": {
"toto": ["hello"] "toto": ["hello"]
} }
}]); }]);
let (_index, doc) = docs.next_document_with_index().unwrap().unwrap(); let mut cursor = docs_reader.into_cursor();
let doc = cursor.next_document().unwrap().unwrap();
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
assert_eq!(nested, json!({ "toto": ["hello"] })); assert_eq!(nested, json!({ "toto": ["hello"] }));
} }
#[test] #[test]
fn out_of_order_fields() { fn out_of_order_json_fields() {
let _documents = documents!([ let _documents = documents!([
{"id": 1,"b": 0}, {"id": 1,"b": 0},
{"id": 2,"a": 0,"b": 0}, {"id": 2,"a": 0,"b": 0},
]); ]);
} }
#[test]
fn out_of_order_csv_fields() {
let csv1_content = "id:number,b\n1,0";
let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
let csv2_content = "id:number,a,b\n2,0,0";
let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content));
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv1).unwrap();
builder.append_csv(csv2).unwrap();
let vector = builder.into_inner().unwrap();
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
}
} }

View File

@ -35,7 +35,7 @@ mod test {
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::{json, Value}; use serde_json::{json, Value};
use crate::documents::{DocumentBatchBuilder, DocumentBatchReader}; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
use crate::index::Index; use crate::index::Index;
use crate::update::{ use crate::update::{
@ -43,14 +43,11 @@ mod test {
}; };
use crate::{DocumentId, FieldId, BEU32}; use crate::{DocumentId, FieldId, BEU32};
static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents); static JSON: Lazy<Vec<u8>> = Lazy::new(|| {
fn generate_documents() -> Vec<u8> {
let mut rng = rand::thread_rng(); let mut rng = rand::thread_rng();
let num_docs = rng.gen_range(10..30); let num_docs = rng.gen_range(10..30);
let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentsBatchBuilder::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let txts = ["Toto", "Titi", "Tata"]; let txts = ["Toto", "Titi", "Tata"];
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>(); let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
let cat_ints = (1..10).collect::<Vec<_>>(); let cat_ints = (1..10).collect::<Vec<_>>();
@ -63,7 +60,7 @@ mod test {
let mut sample_ints = cat_ints.clone(); let mut sample_ints = cat_ints.clone();
sample_ints.shuffle(&mut rng); sample_ints.shuffle(&mut rng);
let doc = json!({ let json = json!({
"id": i, "id": i,
"txt": txt, "txt": txt,
"cat-int": rng.gen_range(0..3), "cat-int": rng.gen_range(0..3),
@ -71,13 +68,16 @@ mod test {
"cat-ints": sample_ints[..(rng.gen_range(0..3))], "cat-ints": sample_ints[..(rng.gen_range(0..3))],
}); });
let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); let object = match json {
builder.extend_from_json(doc).unwrap(); Value::Object(object) => object,
_ => panic!(),
};
builder.append_json_object(&object).unwrap();
} }
builder.finish().unwrap(); builder.into_inner().unwrap()
cursor.into_inner() });
}
/// Returns a temporary index populated with random test documents, the FieldId for the /// Returns a temporary index populated with random test documents, the FieldId for the
/// distinct attribute, and the RoaringBitmap with the document ids. /// distinct attribute, and the RoaringBitmap with the document ids.
@ -101,7 +101,8 @@ mod test {
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
let reader = let reader =
crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice()))
.unwrap();
addition.add_documents(reader).unwrap(); addition.add_documents(reader).unwrap();
addition.execute().unwrap(); addition.execute().unwrap();
@ -109,8 +110,8 @@ mod test {
let fields_map = index.fields_ids_map(&txn).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap();
let fid = fields_map.id(&distinct).unwrap(); let fid = fields_map.id(&distinct).unwrap();
let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap();
let map = (0..documents.len() as u32).collect(); let map = (0..documents.documents_count() as u32).collect();
txn.commit().unwrap(); txn.commit().unwrap();

View File

@ -25,7 +25,7 @@ pub use self::helpers::{
}; };
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput}; pub use self::transform::{Transform, TransformOutput};
use crate::documents::DocumentBatchReader; use crate::documents::DocumentsBatchReader;
pub use crate::update::index_documents::helpers::CursorClonableMmap; pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{ use crate::update::{
self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
@ -121,7 +121,7 @@ where
/// builder, and the builder must be discarded. /// builder, and the builder must be discarded.
/// ///
/// Returns the number of documents added to the builder. /// Returns the number of documents added to the builder.
pub fn add_documents<R>(&mut self, reader: DocumentBatchReader<R>) -> Result<u64> pub fn add_documents<R>(&mut self, reader: DocumentsBatchReader<R>) -> Result<u64>
where where
R: Read + Seek, R: Read + Seek,
{ {
@ -590,9 +590,8 @@ mod tests {
use maplit::hashset; use maplit::hashset;
use super::*; use super::*;
use crate::documents::DocumentBatchBuilder; use crate::documents::DocumentsBatchBuilder;
use crate::update::DeleteDocuments; use crate::update::DeleteDocuments;
use crate::HashMap;
#[test] #[test]
fn simple_document_replacement() { fn simple_document_replacement() {
@ -1252,21 +1251,17 @@ mod tests {
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut big_object = HashMap::new(); let mut big_object = serde_json::Map::new();
big_object.insert(S("id"), "wow"); big_object.insert(S("id"), serde_json::Value::from("wow"));
for i in 0..1000 { for i in 0..1000 {
let key = i.to_string(); let key = i.to_string();
big_object.insert(key, "I am a text!"); big_object.insert(key, serde_json::Value::from("I am a text!"));
} }
let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(&big_object).unwrap();
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let vector = builder.into_inner().unwrap();
let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap()); let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
builder.extend_from_json(big_object).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let config = IndexerConfig::default(); let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default(); let indexing_config = IndexDocumentsConfig::default();
@ -1288,23 +1283,19 @@ mod tests {
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut big_object = HashMap::new(); let mut big_object = serde_json::Map::new();
big_object.insert(S("id"), "wow"); big_object.insert(S("id"), serde_json::Value::from("wow"));
let content: String = (0..=u16::MAX) let content: String = (0..=u16::MAX)
.into_iter() .into_iter()
.map(|p| p.to_string()) .map(|p| p.to_string())
.reduce(|a, b| a + " " + b.as_ref()) .reduce(|a, b| a + " " + b.as_ref())
.unwrap(); .unwrap();
big_object.insert("content".to_string(), &content); big_object.insert("content".to_string(), serde_json::Value::from(content));
let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(&big_object).unwrap();
let big_object = serde_json::to_string(&big_object).unwrap(); let vector = builder.into_inner().unwrap();
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let config = IndexerConfig::default(); let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default(); let indexing_config = IndexDocumentsConfig::default();
@ -1843,18 +1834,20 @@ mod tests {
// Create 200 documents with a long text // Create 200 documents with a long text
let content = { let content = {
let documents: Vec<_> = (0..200i32) let documents_iter = (0..200i32)
.into_iter() .into_iter()
.map(|i| serde_json::json!({ "id": i, "script": script })) .map(|i| serde_json::json!({ "id": i, "script": script }))
.collect(); .filter_map(|json| match json {
serde_json::Value::Object(object) => Some(object),
_ => None,
});
let mut writer = std::io::Cursor::new(Vec::new()); let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); for object in documents_iter {
let documents = serde_json::to_vec(&documents).unwrap(); builder.append_json_object(&object).unwrap();
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); }
builder.finish().unwrap(); let vector = builder.into_inner().unwrap();
writer.set_position(0); crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap()
crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
}; };
// Index those 200 long documents // Index those 200 long documents

View File

@ -14,7 +14,7 @@ use smartstring::SmartString;
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
use super::{IndexDocumentsMethod, IndexerConfig}; use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError}; use crate::error::{Error, InternalError, UserError};
use crate::index::db_name; use crate::index::db_name;
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
@ -152,7 +152,7 @@ impl<'a, 'i> Transform<'a, 'i> {
pub fn read_documents<R, F>( pub fn read_documents<R, F>(
&mut self, &mut self,
mut reader: DocumentBatchReader<R>, reader: DocumentsBatchReader<R>,
wtxn: &mut heed::RwTxn, wtxn: &mut heed::RwTxn,
progress_callback: F, progress_callback: F,
) -> Result<usize> ) -> Result<usize>
@ -160,7 +160,8 @@ impl<'a, 'i> Transform<'a, 'i> {
R: Read + Seek, R: Read + Seek,
F: Fn(UpdateIndexingStep) + Sync, F: Fn(UpdateIndexingStep) + Sync,
{ {
let fields_index = reader.index(); let mut cursor = reader.into_cursor();
let fields_index = cursor.documents_batch_index();
let external_documents_ids = self.index.external_documents_ids(wtxn)?; let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
@ -186,7 +187,8 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0; let mut documents_count = 0;
let mut external_id_buffer = Vec::new(); let mut external_id_buffer = Vec::new();
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
while let Some((addition_index, document)) = reader.next_document_with_index()? { let addition_index = cursor.documents_batch_index().clone();
while let Some(document) = cursor.next_document()? {
let mut field_buffer_cache = drop_and_reuse(field_buffer); let mut field_buffer_cache = drop_and_reuse(field_buffer);
if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::RemapDocumentAddition { progress_callback(UpdateIndexingStep::RemapDocumentAddition {
@ -840,7 +842,7 @@ fn update_primary_key<'a>(
None => { None => {
let mut json = Map::new(); let mut json = Map::new();
for (key, value) in document.iter() { for (key, value) in document.iter() {
let key = addition_index.name(key).cloned(); let key = addition_index.name(key).map(ToString::to_string);
let value = serde_json::from_slice::<Value>(&value).ok(); let value = serde_json::from_slice::<Value>(&value).ok();
if let Some((k, v)) = key.zip(value) { if let Some((k, v)) = key.zip(value) {

View File

@ -3,9 +3,10 @@ use std::io::Cursor;
use big_s::S; use big_s::S;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::hashset; use maplit::hashset;
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{FacetDistribution, Index}; use milli::{FacetDistribution, Index};
use serde_json::{Deserializer, Map, Value};
#[test] #[test]
fn test_facet_distribution_with_no_facet_values() { fn test_facet_distribution_with_no_facet_values() {
@ -30,35 +31,30 @@ fn test_facet_distribution_with_no_facet_values() {
let mut builder = let mut builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let reader = Cursor::new( let reader = Cursor::new(
r#"[ r#"{
{
"id": 123, "id": 123,
"title": "What a week, hu...", "title": "What a week, hu...",
"genres": [], "genres": [],
"tags": ["blue"] "tags": ["blue"]
}, }
{ {
"id": 345, "id": 345,
"title": "I am the pig!", "title": "I am the pig!",
"tags": ["red"] "tags": ["red"]
} }"#,
]"#,
); );
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() { for result in Deserializer::from_reader(reader).into_iter::<Map<String, Value>>() {
let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); let object = result.unwrap();
documents_builder.extend_from_json(doc).unwrap(); documents_builder.append_json_object(&object).unwrap();
} }
documents_builder.finish().unwrap(); let vector = documents_builder.into_inner().unwrap();
cursor.set_position(0);
// index documents // index documents
let content = DocumentBatchReader::from_reader(cursor).unwrap(); let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
builder.add_documents(content).unwrap(); builder.add_documents(content).unwrap();
builder.execute().unwrap(); builder.execute().unwrap();

View File

@ -6,10 +6,11 @@ use big_s::S;
use either::{Either, Left, Right}; use either::{Either, Left, Right};
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use maplit::{hashmap, hashset}; use maplit::{hashmap, hashset};
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use milli::{AscDesc, Criterion, DocumentId, Index, Member};
use serde::Deserialize; use serde::Deserialize;
use serde_json::{Deserializer, Map, Value};
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
mod distinct; mod distinct;
@ -62,21 +63,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let mut builder = let mut builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let reader = Cursor::new(CONTENT.as_bytes()); let reader = Cursor::new(CONTENT.as_bytes());
for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() { for result in Deserializer::from_reader(reader).into_iter::<Map<String, Value>>() {
let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); let object = result.unwrap();
documents_builder.extend_from_json(doc).unwrap(); documents_builder.append_json_object(&object).unwrap();
} }
documents_builder.finish().unwrap(); let vector = documents_builder.into_inner().unwrap();
cursor.set_position(0);
// index documents // index documents
let content = DocumentBatchReader::from_reader(cursor).unwrap(); let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
builder.add_documents(content).unwrap(); builder.add_documents(content).unwrap();
builder.execute().unwrap(); builder.execute().unwrap();

View File

@ -5,7 +5,7 @@ use big_s::S;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use itertools::Itertools; use itertools::Itertools;
use maplit::hashset; use maplit::hashset;
use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult};
use rand::Rng; use rand::Rng;
@ -393,8 +393,7 @@ fn criteria_ascdesc() {
let mut builder = let mut builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
let mut cursor = Cursor::new(Vec::new()); let mut batch_builder = DocumentsBatchBuilder::new(Vec::new());
let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
let mut rng = rand::thread_rng(); let mut rng = rand::thread_rng();
@ -412,16 +411,17 @@ fn criteria_ascdesc() {
"age": age, "age": age,
}); });
let json = Cursor::new(serde_json::to_vec(&json).unwrap()); let object = match json {
batch_builder.extend_from_json(json).unwrap(); serde_json::Value::Object(object) => object,
_ => panic!(),
};
batch_builder.append_json_object(&object).unwrap();
}); });
batch_builder.finish().unwrap(); let vector = batch_builder.into_inner().unwrap();
cursor.set_position(0);
let reader = DocumentBatchReader::from_reader(cursor).unwrap();
let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
builder.add_documents(reader).unwrap(); builder.add_documents(reader).unwrap();
builder.execute().unwrap(); builder.execute().unwrap();

View File

@ -106,26 +106,23 @@ fn test_typo_disabled_on_word() {
options.map_size(4096 * 100); options.map_size(4096 * 100);
let index = Index::new(options, tmp.path()).unwrap(); let index = Index::new(options, tmp.path()).unwrap();
let documents = json!([ let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new());
{ let doc1 = json!({
"id": 1usize, "id": 1usize,
"data": "zealand", "data": "zealand",
}, });
{
let doc2 = json!({
"id": 2usize, "id": 2usize,
"data": "zearand", "data": "zearand",
}, });
]);
let mut writer = std::io::Cursor::new(Vec::new()); builder.append_json_object(doc1.as_object().unwrap()).unwrap();
let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); builder.append_json_object(doc2.as_object().unwrap()).unwrap();
let documents = serde_json::to_vec(&documents).unwrap(); let vector = builder.into_inner().unwrap();
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
builder.finish().unwrap();
writer.set_position(0); let documents =
milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap();
let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
let mut txn = index.write_txn().unwrap(); let mut txn = index.write_txn().unwrap();
let config = IndexerConfig::default(); let config = IndexerConfig::default();