//! `meilisearch-lib/src/document_formats.rs`
//!
//! Readers that turn CSV, NDJSON and JSON payloads into obkv document batches,
//! and the error type reported when a payload cannot be read.

use std::borrow::Borrow;
use std::fmt::{self, Debug, Display};
use std::io::{self, BufRead, BufReader, BufWriter, Cursor, Read, Seek, Write};
use meilisearch_error::{internal_error, Code, ErrorCode};
use milli::documents::DocumentBatchBuilder;
/// Shorthand for results whose error type is [`DocumentFormatError`].
type Result<T> = std::result::Result<T, DocumentFormatError>;
/// The format of a document payload handled by the readers in this module.
///
/// The [`Display`](fmt::Display) output is the lowercase format name; it is
/// interpolated into the messages produced by [`DocumentFormatError`].
#[derive(Debug)]
pub enum PayloadType {
    /// Displayed as `ndjson`.
    Ndjson,
    /// Displayed as `json`.
    Json,
    /// Displayed as `csv`.
    Csv,
}

impl fmt::Display for PayloadType {
    /// Writes the lowercase name of the payload format.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            PayloadType::Ndjson => "ndjson",
            PayloadType::Json => "json",
            PayloadType::Csv => "csv",
        };
        f.write_str(name)
    }
}
2022-03-04 03:46:59 +01:00
#[derive(Debug)]
2021-09-28 11:59:55 +02:00
pub enum DocumentFormatError {
Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
2022-03-04 03:46:59 +01:00
MalformedPayload(Box<milli::documents::Error>, PayloadType),
}
impl Display for DocumentFormatError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Internal(e) => write!(f, "An internal error has occurred: `{}`.", e),
2022-03-04 03:46:59 +01:00
Self::MalformedPayload(me, b) => match me.borrow() {
2022-03-25 13:53:28 +01:00
milli::documents::Error::JsonError(se) => {
// https://github.com/meilisearch/meilisearch/issues/2107
// The user input maybe insanely long. We need to truncate it.
2022-03-25 14:36:11 +01:00
let mut serde_msg = se.to_string();
2022-03-25 13:53:28 +01:00
let prefix = r#"invalid type: string ""#;
if serde_msg.starts_with(prefix) {
let start_idx = prefix.len();
if let Some(end_idx) = serde_msg.rfind("\"") {
if end_idx - start_idx > 100 {
serde_msg.replace_range(start_idx + 50..end_idx - 50, " ... ");
}
} else {
serde_msg = String::from("");
}
}
write!(
2022-03-04 03:46:59 +01:00
f,
2022-03-25 14:31:23 +01:00
"The `{}` payload provided is malformed. `Couldn't serialize document value: {}`.",
b,serde_msg
2022-03-25 13:53:28 +01:00
)
}
_ => write!(f, "The `{}` payload provided is malformed: `{}`.", b, me),
2022-03-04 03:46:59 +01:00
},
}
}
2021-09-28 11:59:55 +02:00
}
2022-03-04 03:46:59 +01:00
impl std::error::Error for DocumentFormatError {}
2021-09-28 11:59:55 +02:00
impl From<(PayloadType, milli::documents::Error)> for DocumentFormatError {
fn from((ty, error): (PayloadType, milli::documents::Error)) -> Self {
match error {
milli::documents::Error::Io(e) => Self::Internal(Box::new(e)),
e => Self::MalformedPayload(Box::new(e), ty),
}
}
}
2021-09-30 10:35:24 +02:00
impl ErrorCode for DocumentFormatError {
fn error_code(&self) -> Code {
match self {
DocumentFormatError::Internal(_) => Code::Internal,
DocumentFormatError::MalformedPayload(_, _) => Code::MalformedPayload,
}
}
}
// NOTE(review): presumably generates `From<io::Error> for DocumentFormatError`
// (wrapping the error as `Internal`). `read_ndjson` applies `?` to an
// `io::Result` and needs such a conversion — confirm against the
// `internal_error!` definition in `meilisearch_error`.
internal_error!(DocumentFormatError: io::Error);
2021-09-28 11:59:55 +02:00
/// reads csv from input and write an obkv batch to writer.
pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
let writer = BufWriter::new(writer);
2021-10-26 19:36:48 +02:00
let builder =
DocumentBatchBuilder::from_csv(input, writer).map_err(|e| (PayloadType::Csv, e))?;
let count = builder.finish().map_err(|e| (PayloadType::Csv, e))?;
2021-09-28 22:58:48 +02:00
Ok(count)
2021-09-28 22:58:48 +02:00
}
/// reads jsonl from input and write an obkv batch to writer.
pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
let mut reader = BufReader::new(input);
let writer = BufWriter::new(writer);
let mut builder = DocumentBatchBuilder::new(writer).map_err(|e| (PayloadType::Ndjson, e))?;
let mut buf = String::new();
2021-09-28 11:59:55 +02:00
while reader.read_line(&mut buf)? > 0 {
// skip empty lines
if buf == "\n" {
buf.clear();
continue;
}
builder
.extend_from_json(Cursor::new(&buf.as_bytes()))
.map_err(|e| (PayloadType::Ndjson, e))?;
buf.clear();
2021-09-28 11:59:55 +02:00
}
let count = builder.finish().map_err(|e| (PayloadType::Ndjson, e))?;
2021-09-28 11:59:55 +02:00
Ok(count)
2021-09-28 11:59:55 +02:00
}
/// Reads a JSON payload from `input` and writes an obkv batch to `writer`.
///
/// The whole of `input` is handed to the batch builder in one call. Every
/// builder error is tagged with [`PayloadType::Json`] before being converted
/// into a [`DocumentFormatError`].
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
    let mut batch = DocumentBatchBuilder::new(BufWriter::new(writer))
        .map_err(|err| (PayloadType::Json, err))?;

    batch
        .extend_from_json(input)
        .map_err(|err| (PayloadType::Json, err))?;

    // Finishing the batch yields the count returned to the caller.
    batch
        .finish()
        .map_err(|err| (PayloadType::Json, err).into())
}