mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 13:40:31 +01:00
add csv builder tests
This commit is contained in:
parent
53c79e85f2
commit
430e9b13d3
@ -54,12 +54,7 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
|||||||
/// metadata at the end of the file, and write the metadata offset at the beginning on the
|
/// metadata at the end of the file, and write the metadata offset at the beginning on the
|
||||||
/// file.
|
/// file.
|
||||||
pub fn finish(self) -> Result<(), Error> {
|
pub fn finish(self) -> Result<(), Error> {
|
||||||
let Self {
|
let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
|
||||||
inner: ByteCounter { mut writer, count: offset },
|
|
||||||
index,
|
|
||||||
count,
|
|
||||||
..
|
|
||||||
} = self;
|
|
||||||
|
|
||||||
let meta = DocumentsMetadata { count, index };
|
let meta = DocumentsMetadata { count, index };
|
||||||
|
|
||||||
@ -73,7 +68,6 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Extends the builder with json documents from a reader.
|
/// Extends the builder with json documents from a reader.
|
||||||
pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
|
pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
|
||||||
let mut de = serde_json::Deserializer::from_reader(reader);
|
let mut de = serde_json::Deserializer::from_reader(reader);
|
||||||
@ -95,7 +89,6 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
|||||||
/// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform
|
/// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform
|
||||||
/// optimisations, and extending from another CSV is not allowed.
|
/// optimisations, and extending from another CSV is not allowed.
|
||||||
pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
|
pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
|
||||||
|
|
||||||
let mut this = Self::new(writer)?;
|
let mut this = Self::new(writer)?;
|
||||||
// Ensure that this is the first and only addition made with this builder
|
// Ensure that this is the first and only addition made with this builder
|
||||||
debug_assert!(this.index.is_empty());
|
debug_assert!(this.index.is_empty());
|
||||||
@ -112,28 +105,24 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
|||||||
let records = records.into_records();
|
let records = records.into_records();
|
||||||
|
|
||||||
for record in records {
|
for record in records {
|
||||||
match record {
|
let record = record?;
|
||||||
Ok(record) => {
|
let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer));
|
||||||
let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer));
|
for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
|
||||||
for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
|
let value = match ty {
|
||||||
let value = match ty {
|
AllowedType::Number => value.parse::<f64>().map(Value::from)?,
|
||||||
AllowedType::Number => value.parse::<f64>().map(Value::from)?,
|
AllowedType::String => Value::String(value.to_string()),
|
||||||
AllowedType::String => Value::String(value.to_string()),
|
};
|
||||||
};
|
|
||||||
|
|
||||||
serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
|
serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
|
||||||
writer.insert(*fid, &this.value_buffer)?;
|
writer.insert(*fid, &this.value_buffer)?;
|
||||||
this.value_buffer.clear();
|
this.value_buffer.clear();
|
||||||
}
|
|
||||||
|
|
||||||
this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
|
|
||||||
this.inner.write_all(&this.obkv_buffer)?;
|
|
||||||
|
|
||||||
this.obkv_buffer.clear();
|
|
||||||
this.count += 1;
|
|
||||||
},
|
|
||||||
Err(_) => panic!(),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
|
||||||
|
this.inner.write_all(&this.obkv_buffer)?;
|
||||||
|
|
||||||
|
this.obkv_buffer.clear();
|
||||||
|
this.count += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(this)
|
Ok(this)
|
||||||
@ -162,10 +151,25 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) {
|
|||||||
mod test {
|
mod test {
|
||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
|
||||||
|
use serde_json::{json, Map};
|
||||||
|
|
||||||
use crate::documents::DocumentBatchReader;
|
use crate::documents::DocumentBatchReader;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
|
||||||
|
let mut map = Map::new();
|
||||||
|
|
||||||
|
for (fid, value) in obkv.iter() {
|
||||||
|
let field_name = index.name(fid).unwrap().clone();
|
||||||
|
let value: Value = serde_json::from_slice(value).unwrap();
|
||||||
|
|
||||||
|
map.insert(field_name, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
Value::Object(map)
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn add_single_documents_json() {
|
fn add_single_documents_json() {
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
@ -247,7 +251,8 @@ mod test {
|
|||||||
|
|
||||||
let csv = "id:number,field:string\n1,hello!\n2,blabla";
|
let csv = "id:number,field:string\n1,hello!\n2,blabla";
|
||||||
|
|
||||||
let builder = DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
|
let builder =
|
||||||
|
DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
|
||||||
cursor.set_position(0);
|
cursor.set_position(0);
|
||||||
@ -263,4 +268,265 @@ mod test {
|
|||||||
|
|
||||||
assert!(reader.next_document_with_index().unwrap().is_none());
|
assert!(reader.next_document_with_index().unwrap().is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn simple_csv_document() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
|
assert!(reader.next_document_with_index().unwrap().is_none());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn coma_in_field() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United, States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United, States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn quote_in_field() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United"" States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United\" States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn integer_in_field() {
|
||||||
|
let documents = r#"city,country,pop:number
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": 4628910.0,
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn float_in_field() {
|
||||||
|
let documents = r#"city,country,pop:number
|
||||||
|
"Boston","United States","4628910.01""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": 4628910.01,
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn several_colon_in_header() {
|
||||||
|
let documents = r#"city:love:string,country:state,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city:love": "Boston",
|
||||||
|
"country:state": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn ending_by_colon_in_header() {
|
||||||
|
let documents = r#"city:,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city:": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn starting_by_colon_in_header() {
|
||||||
|
let documents = r#":city,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
":city": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[ignore]
|
||||||
|
#[test]
|
||||||
|
fn starting_by_colon_in_header2() {
|
||||||
|
let documents = r#":string,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
|
||||||
|
assert!(reader.next_document_with_index().is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn double_colon_in_header() {
|
||||||
|
let documents = r#"city::string,country,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
|
||||||
|
.unwrap()
|
||||||
|
.finish()
|
||||||
|
.unwrap();
|
||||||
|
let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
|
||||||
|
let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
|
||||||
|
let val = obkv_to_value(&doc, index);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
val,
|
||||||
|
json!({
|
||||||
|
"city:": "Boston",
|
||||||
|
"country": "United States",
|
||||||
|
"pop": "4628910",
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bad_type_in_header() {
|
||||||
|
let documents = r#"city,country:number,pop
|
||||||
|
"Boston","United States","4628910""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bad_column_count1() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United States","4628910", "too much""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn bad_column_count2() {
|
||||||
|
let documents = r#"city,country,pop
|
||||||
|
"Boston","United States""#;
|
||||||
|
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -47,7 +47,7 @@ impl DocumentsBatchIndex {
|
|||||||
self.0.iter()
|
self.0.iter()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_id(&self, id: FieldId) -> Option<&String> {
|
pub fn name(&self, id: FieldId) -> Option<&String> {
|
||||||
self.0.get_by_left(&id)
|
self.0.get_by_left(&id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -179,7 +179,7 @@ impl Transform<'_, '_> {
|
|||||||
if !self.autogenerate_docids {
|
if !self.autogenerate_docids {
|
||||||
let mut json = Map::new();
|
let mut json = Map::new();
|
||||||
for (key, value) in document.iter() {
|
for (key, value) in document.iter() {
|
||||||
let key = addition_index.get_id(key).cloned();
|
let key = addition_index.name(key).cloned();
|
||||||
let value = serde_json::from_slice::<Value>(&value).ok();
|
let value = serde_json::from_slice::<Value>(&value).ok();
|
||||||
|
|
||||||
if let Some((k, v)) = key.zip(value) {
|
if let Some((k, v)) = key.zip(value) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user