mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
fix issue 3037
This commit is contained in:
parent
914f8b118c
commit
38982d13fe
12
Cargo.lock
generated
12
Cargo.lock
generated
@ -2376,7 +2376,9 @@ dependencies = [
|
|||||||
"flate2",
|
"flate2",
|
||||||
"fst",
|
"fst",
|
||||||
"insta",
|
"insta",
|
||||||
|
"log",
|
||||||
"meili-snap",
|
"meili-snap",
|
||||||
|
"memmap",
|
||||||
"milli",
|
"milli",
|
||||||
"proptest",
|
"proptest",
|
||||||
"proptest-derive",
|
"proptest-derive",
|
||||||
@ -2396,6 +2398,16 @@ version = "2.5.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memmap"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memmap2"
|
name = "memmap2"
|
||||||
version = "0.5.7"
|
version = "0.5.7"
|
||||||
|
@ -1024,18 +1024,20 @@ impl IndexScheduler {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use std::io::{Seek, Write, BufWriter};
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
use file_store::File;
|
use file_store::File;
|
||||||
use meili_snap::snapshot;
|
use meili_snap::snapshot;
|
||||||
|
use meilisearch_types::document_formats::DocumentFormatError;
|
||||||
use meilisearch_types::milli::obkv_to_json;
|
use meilisearch_types::milli::obkv_to_json;
|
||||||
use meilisearch_types::milli::update::IndexDocumentsMethod::{
|
use meilisearch_types::milli::update::IndexDocumentsMethod::{
|
||||||
ReplaceDocuments, UpdateDocuments,
|
ReplaceDocuments, UpdateDocuments,
|
||||||
};
|
};
|
||||||
use meilisearch_types::tasks::IndexSwap;
|
use meilisearch_types::tasks::IndexSwap;
|
||||||
use meilisearch_types::VERSION_FILE_NAME;
|
use meilisearch_types::VERSION_FILE_NAME;
|
||||||
use tempfile::TempDir;
|
use tempfile::{TempDir, NamedTempFile};
|
||||||
use time::Duration;
|
use time::Duration;
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
@ -1128,6 +1130,15 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Adapting to the new json reading interface
|
||||||
|
pub fn read_json(bytes: &[u8], write: impl Write + Seek) -> std::result::Result<usize, DocumentFormatError> {
|
||||||
|
let temp_file = NamedTempFile::new().unwrap();
|
||||||
|
let mut buffer = BufWriter::new(temp_file.reopen().unwrap());
|
||||||
|
buffer.write(bytes).unwrap();
|
||||||
|
buffer.flush().unwrap();
|
||||||
|
meilisearch_types::document_formats::read_json(temp_file.as_file(), write)
|
||||||
|
}
|
||||||
|
|
||||||
/// Create an update file with the given file uuid.
|
/// Create an update file with the given file uuid.
|
||||||
///
|
///
|
||||||
/// The update file contains just one simple document whose id is given by `document_id`.
|
/// The update file contains just one simple document whose id is given by `document_id`.
|
||||||
@ -1147,7 +1158,7 @@ mod tests {
|
|||||||
|
|
||||||
let (_uuid, mut file) = index_scheduler.create_update_file_with_uuid(file_uuid).unwrap();
|
let (_uuid, mut file) = index_scheduler.create_update_file_with_uuid(file_uuid).unwrap();
|
||||||
let documents_count =
|
let documents_count =
|
||||||
meilisearch_types::document_formats::read_json(content.as_bytes(), file.as_file_mut())
|
read_json(content.as_bytes(), file.as_file_mut())
|
||||||
.unwrap() as u64;
|
.unwrap() as u64;
|
||||||
(file, documents_count)
|
(file, documents_count)
|
||||||
}
|
}
|
||||||
@ -1450,7 +1461,7 @@ mod tests {
|
|||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
||||||
let documents_count =
|
let documents_count =
|
||||||
meilisearch_types::document_formats::read_json(content.as_bytes(), file.as_file_mut())
|
read_json(content.as_bytes(), file.as_file_mut())
|
||||||
.unwrap() as u64;
|
.unwrap() as u64;
|
||||||
file.persist().unwrap();
|
file.persist().unwrap();
|
||||||
index_scheduler
|
index_scheduler
|
||||||
@ -1496,7 +1507,7 @@ mod tests {
|
|||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
||||||
let documents_count =
|
let documents_count =
|
||||||
meilisearch_types::document_formats::read_json(content.as_bytes(), file.as_file_mut())
|
read_json(content.as_bytes(), file.as_file_mut())
|
||||||
.unwrap() as u64;
|
.unwrap() as u64;
|
||||||
file.persist().unwrap();
|
file.persist().unwrap();
|
||||||
index_scheduler
|
index_scheduler
|
||||||
@ -1678,7 +1689,7 @@ mod tests {
|
|||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
||||||
let documents_count =
|
let documents_count =
|
||||||
meilisearch_types::document_formats::read_json(content.as_bytes(), file.as_file_mut())
|
read_json(content.as_bytes(), file.as_file_mut())
|
||||||
.unwrap() as u64;
|
.unwrap() as u64;
|
||||||
file.persist().unwrap();
|
file.persist().unwrap();
|
||||||
index_scheduler
|
index_scheduler
|
||||||
@ -1847,7 +1858,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -1902,7 +1913,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -1959,7 +1970,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -2016,7 +2027,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -2076,7 +2087,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -2505,7 +2516,7 @@ mod tests {
|
|||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
||||||
let documents_count =
|
let documents_count =
|
||||||
meilisearch_types::document_formats::read_json(content.as_bytes(), file.as_file_mut())
|
read_json(content.as_bytes(), file.as_file_mut())
|
||||||
.unwrap() as u64;
|
.unwrap() as u64;
|
||||||
file.persist().unwrap();
|
file.persist().unwrap();
|
||||||
index_scheduler
|
index_scheduler
|
||||||
@ -2547,7 +2558,7 @@ mod tests {
|
|||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
||||||
let documents_count =
|
let documents_count =
|
||||||
meilisearch_types::document_formats::read_json(content.as_bytes(), file.as_file_mut())
|
read_json(content.as_bytes(), file.as_file_mut())
|
||||||
.unwrap() as u64;
|
.unwrap() as u64;
|
||||||
file.persist().unwrap();
|
file.persist().unwrap();
|
||||||
index_scheduler
|
index_scheduler
|
||||||
@ -2596,7 +2607,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -2645,7 +2656,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -2708,7 +2719,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -2773,7 +2784,7 @@ mod tests {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -2845,7 +2856,7 @@ mod tests {
|
|||||||
let allow_index_creation = i % 2 != 0;
|
let allow_index_creation = i % 2 != 0;
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
@ -2905,7 +2916,7 @@ mod tests {
|
|||||||
let allow_index_creation = i % 2 != 0;
|
let allow_index_creation = i % 2 != 0;
|
||||||
|
|
||||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap();
|
||||||
let documents_count = meilisearch_types::document_formats::read_json(
|
let documents_count = read_json(
|
||||||
content.as_bytes(),
|
content.as_bytes(),
|
||||||
file.as_file_mut(),
|
file.as_file_mut(),
|
||||||
)
|
)
|
||||||
|
@ -95,6 +95,8 @@ pub enum PayloadError {
|
|||||||
MalformedPayload(serde_json::error::Error),
|
MalformedPayload(serde_json::error::Error),
|
||||||
#[error("A json payload is missing.")]
|
#[error("A json payload is missing.")]
|
||||||
MissingPayload,
|
MissingPayload,
|
||||||
|
#[error("Exception when accepting a playload to a temporary file")]
|
||||||
|
ReceivePayloadErr,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ErrorCode for PayloadError {
|
impl ErrorCode for PayloadError {
|
||||||
@ -126,6 +128,7 @@ impl ErrorCode for PayloadError {
|
|||||||
},
|
},
|
||||||
PayloadError::MissingPayload => Code::MissingPayload,
|
PayloadError::MissingPayload => Code::MissingPayload,
|
||||||
PayloadError::MalformedPayload(_) => Code::MalformedPayload,
|
PayloadError::MalformedPayload(_) => Code::MalformedPayload,
|
||||||
|
PayloadError::ReceivePayloadErr => Code::ReceivePayloadErr,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,12 @@
|
|||||||
use std::io::{Cursor, ErrorKind};
|
use std::io::{ErrorKind, BufWriter, Write};
|
||||||
|
|
||||||
use actix_web::http::header::CONTENT_TYPE;
|
use actix_web::http::header::CONTENT_TYPE;
|
||||||
use actix_web::web::Data;
|
use actix_web::web::Data;
|
||||||
use actix_web::{web, HttpMessage, HttpRequest, HttpResponse};
|
use actix_web::{web, HttpMessage, HttpRequest, HttpResponse};
|
||||||
use bstr::ByteSlice;
|
use bstr::ByteSlice;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use index_scheduler::IndexScheduler;
|
use index_scheduler::IndexScheduler;
|
||||||
use log::debug;
|
use log::{debug, error};
|
||||||
use meilisearch_types::document_formats::{read_csv, read_json, read_ndjson, PayloadType};
|
use meilisearch_types::document_formats::{read_csv, PayloadType, read_json, read_ndjson};
|
||||||
use meilisearch_types::error::ResponseError;
|
use meilisearch_types::error::ResponseError;
|
||||||
use meilisearch_types::heed::RoTxn;
|
use meilisearch_types::heed::RoTxn;
|
||||||
use meilisearch_types::index_uid::IndexUid;
|
use meilisearch_types::index_uid::IndexUid;
|
||||||
@ -20,9 +19,10 @@ use once_cell::sync::Lazy;
|
|||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use serde_cs::vec::CS;
|
use serde_cs::vec::CS;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
use tempfile::NamedTempFile;
|
||||||
use crate::analytics::Analytics;
|
use crate::analytics::Analytics;
|
||||||
use crate::error::MeilisearchHttpError;
|
use crate::error::MeilisearchHttpError;
|
||||||
|
use crate::error::PayloadError::ReceivePayloadErr;
|
||||||
use crate::extractors::authentication::policies::*;
|
use crate::extractors::authentication::policies::*;
|
||||||
use crate::extractors::authentication::GuardedData;
|
use crate::extractors::authentication::GuardedData;
|
||||||
use crate::extractors::payload::Payload;
|
use crate::extractors::payload::Payload;
|
||||||
@ -223,26 +223,51 @@ async fn document_addition(
|
|||||||
|
|
||||||
let (uuid, mut update_file) = index_scheduler.create_update_file()?;
|
let (uuid, mut update_file) = index_scheduler.create_update_file()?;
|
||||||
|
|
||||||
// TODO: this can be slow, maybe we should spawn a thread? But the payload isn't Send+Sync :weary:
|
let err: Result<SummarizedTaskView, MeilisearchHttpError> = Err(MeilisearchHttpError::Payload(ReceivePayloadErr));
|
||||||
// push the entire stream into a `Vec`.
|
|
||||||
// If someone sends us a never ending stream we're going to block the thread.
|
let temp_file = match NamedTempFile::new() {
|
||||||
// TODO: Maybe we should write it to a file to reduce the RAM consumption
|
Ok(temp_file) => temp_file,
|
||||||
// and then reread it to convert it to obkv?
|
Err(e) => {
|
||||||
let mut buffer = Vec::new();
|
error!("create a temporary file error: {}", e);
|
||||||
while let Some(bytes) = body.next().await {
|
return err;
|
||||||
buffer.extend_from_slice(&bytes?);
|
},
|
||||||
|
};
|
||||||
|
debug!("temp file path: {:?}", temp_file.as_ref());
|
||||||
|
let buffer_file = match temp_file.reopen() {
|
||||||
|
Ok(buffer_file) => buffer_file,
|
||||||
|
Err(e) => {
|
||||||
|
error!("reopen payload temporary file error: {}", e);
|
||||||
|
return err;
|
||||||
}
|
}
|
||||||
if buffer.is_empty() {
|
};
|
||||||
|
let mut buffer = BufWriter::new(buffer_file);
|
||||||
|
let mut buffer_write_size:usize = 0;
|
||||||
|
while let Some(bytes) = body.next().await {
|
||||||
|
match buffer.write(&bytes?) {
|
||||||
|
Ok(size) => buffer_write_size = buffer_write_size + size,
|
||||||
|
Err(e) => {
|
||||||
|
error!("bufWriter write error: {}", e);
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Err(e) = buffer.flush() {
|
||||||
|
error!("bufWriter flush error: {}", e);
|
||||||
|
return err
|
||||||
|
};
|
||||||
|
|
||||||
|
if buffer_write_size == 0 {
|
||||||
return Err(MeilisearchHttpError::MissingPayload(format));
|
return Err(MeilisearchHttpError::MissingPayload(format));
|
||||||
}
|
}
|
||||||
let reader = Cursor::new(buffer);
|
|
||||||
|
|
||||||
let documents_count =
|
let documents_count =
|
||||||
tokio::task::spawn_blocking(move || -> Result<_, MeilisearchHttpError> {
|
tokio::task::spawn_blocking(move || -> Result<_, MeilisearchHttpError> {
|
||||||
let documents_count = match format {
|
let documents_count = match format {
|
||||||
PayloadType::Json => read_json(reader, update_file.as_file_mut())?,
|
PayloadType::Json => read_json(temp_file.as_file(), update_file.as_file_mut())?,
|
||||||
PayloadType::Csv => read_csv(reader, update_file.as_file_mut())?,
|
PayloadType::Csv => read_csv(temp_file.as_file(), update_file.as_file_mut())?,
|
||||||
PayloadType::Ndjson => read_ndjson(reader, update_file.as_file_mut())?,
|
PayloadType::Ndjson => read_ndjson(temp_file.as_file(), update_file.as_file_mut())?,
|
||||||
};
|
};
|
||||||
// we NEED to persist the file here because we moved the `udpate_file` in another task.
|
// we NEED to persist the file here because we moved the `udpate_file` in another task.
|
||||||
update_file.persist()?;
|
update_file.persist()?;
|
||||||
|
@ -436,7 +436,7 @@ async fn error_add_malformed_ndjson_documents() {
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
response["message"],
|
response["message"],
|
||||||
json!(
|
json!(
|
||||||
r#"The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 2 column 2`."#
|
r#"The `ndjson` payload provided is malformed. `Couldn't serialize document value: trailing characters at line 2 column 1`."#
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
assert_eq!(response["code"], json!("malformed_payload"));
|
assert_eq!(response["code"], json!("malformed_payload"));
|
||||||
@ -456,7 +456,7 @@ async fn error_add_malformed_ndjson_documents() {
|
|||||||
assert_eq!(status_code, 400);
|
assert_eq!(status_code, 400);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
response["message"],
|
response["message"],
|
||||||
json!("The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 2 column 2`.")
|
json!("The `ndjson` payload provided is malformed. `Couldn't serialize document value: trailing characters at line 2 column 1`.")
|
||||||
);
|
);
|
||||||
assert_eq!(response["code"], json!("malformed_payload"));
|
assert_eq!(response["code"], json!("malformed_payload"));
|
||||||
assert_eq!(response["type"], json!("invalid_request"));
|
assert_eq!(response["type"], json!("invalid_request"));
|
||||||
|
@ -23,6 +23,8 @@ thiserror = "1.0.30"
|
|||||||
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||||
tokio = "1.0"
|
tokio = "1.0"
|
||||||
uuid = { version = "1.1.2", features = ["serde", "v4"] }
|
uuid = { version = "1.1.2", features = ["serde", "v4"] }
|
||||||
|
memmap = "0.7.0"
|
||||||
|
log = "0.4.17"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
insta = "1.19.1"
|
insta = "1.19.1"
|
||||||
|
@ -1,13 +1,16 @@
|
|||||||
use std::borrow::Borrow;
|
use std::borrow::Borrow;
|
||||||
use std::fmt::{self, Debug, Display};
|
use std::fmt::{self, Debug, Display};
|
||||||
use std::io::{self, BufReader, Read, Seek, Write};
|
use std::fs::File;
|
||||||
|
use std::io::{self, Seek, Write};
|
||||||
|
use std::marker::PhantomData;
|
||||||
use either::Either;
|
use either::Either;
|
||||||
|
use log::debug;
|
||||||
|
use memmap::MmapOptions;
|
||||||
use milli::documents::{DocumentsBatchBuilder, Error};
|
use milli::documents::{DocumentsBatchBuilder, Error};
|
||||||
use milli::Object;
|
use milli::Object;
|
||||||
use serde::Deserialize;
|
use serde::de::{Visitor, SeqAccess};
|
||||||
|
use serde::{Deserialize, Deserializer};
|
||||||
use serde_json::error::Category;
|
use serde_json::error::Category;
|
||||||
|
|
||||||
use crate::error::{Code, ErrorCode};
|
use crate::error::{Code, ErrorCode};
|
||||||
use crate::internal_error;
|
use crate::internal_error;
|
||||||
|
|
||||||
@ -99,10 +102,10 @@ impl ErrorCode for DocumentFormatError {
|
|||||||
internal_error!(DocumentFormatError: io::Error);
|
internal_error!(DocumentFormatError: io::Error);
|
||||||
|
|
||||||
/// Reads CSV from input and write an obkv batch to writer.
|
/// Reads CSV from input and write an obkv batch to writer.
|
||||||
pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
pub fn read_csv(file: &File, writer: impl Write + Seek) -> Result<usize> {
|
||||||
let mut builder = DocumentsBatchBuilder::new(writer);
|
let mut builder = DocumentsBatchBuilder::new(writer);
|
||||||
|
let mmap = unsafe { MmapOptions::new().map(file).unwrap()};
|
||||||
let csv = csv::Reader::from_reader(input);
|
let csv = csv::Reader::from_reader(mmap.as_ref());
|
||||||
builder.append_csv(csv).map_err(|e| (PayloadType::Csv, e))?;
|
builder.append_csv(csv).map_err(|e| (PayloadType::Csv, e))?;
|
||||||
|
|
||||||
let count = builder.documents_count();
|
let count = builder.documents_count();
|
||||||
@ -111,30 +114,30 @@ pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
|||||||
Ok(count as usize)
|
Ok(count as usize)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Reads JSON Lines from input and write an obkv batch to writer.
|
/// Reads JSON from temporary file and write an obkv batch to writer.
|
||||||
pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
pub fn read_json(file: &File, writer: impl Write + Seek) -> Result<usize> {
|
||||||
let mut builder = DocumentsBatchBuilder::new(writer);
|
read_json_inner(file, writer, PayloadType::Json)
|
||||||
let reader = BufReader::new(input);
|
|
||||||
|
|
||||||
for result in serde_json::Deserializer::from_reader(reader).into_iter() {
|
|
||||||
let object = result.map_err(Error::Json).map_err(|e| (PayloadType::Ndjson, e))?;
|
|
||||||
builder
|
|
||||||
.append_json_object(&object)
|
|
||||||
.map_err(Into::into)
|
|
||||||
.map_err(DocumentFormatError::Internal)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
let count = builder.documents_count();
|
|
||||||
let _ = builder.into_inner().map_err(Into::into).map_err(DocumentFormatError::Internal)?;
|
|
||||||
|
|
||||||
Ok(count as usize)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Reads JSON from input and write an obkv batch to writer.
|
/// Reads JSON from temporary file and write an obkv batch to writer.
|
||||||
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
pub fn read_ndjson(file: &File, writer: impl Write + Seek) -> Result<usize> {
|
||||||
let mut builder = DocumentsBatchBuilder::new(writer);
|
read_json_inner(file, writer, PayloadType::Ndjson)
|
||||||
let reader = BufReader::new(input);
|
}
|
||||||
|
|
||||||
|
/// Reads JSON from temporary file and write an obkv batch to writer.
|
||||||
|
fn read_json_inner(file: &File, writer: impl Write + Seek, payload_type: PayloadType) -> Result<usize> {
|
||||||
|
let mut builder = DocumentsBatchBuilder::new(writer);
|
||||||
|
let mmap = unsafe { MmapOptions::new().map(file).unwrap()};
|
||||||
|
let mut deserializer = serde_json::Deserializer::from_slice(&mmap);
|
||||||
|
|
||||||
|
match array_each(&mut deserializer, |obj: Object | {
|
||||||
|
builder
|
||||||
|
.append_json_object(&obj)
|
||||||
|
}) {
|
||||||
|
Ok(Ok(count)) => debug!("serde json array size: {}", count),
|
||||||
|
Ok(Err(e)) => return Err(DocumentFormatError::Internal(Box::new(e))),
|
||||||
|
Err(_e) => {
|
||||||
|
debug!("deserialize single json");
|
||||||
#[derive(Deserialize, Debug)]
|
#[derive(Deserialize, Debug)]
|
||||||
#[serde(transparent)]
|
#[serde(transparent)]
|
||||||
struct ArrayOrSingleObject {
|
struct ArrayOrSingleObject {
|
||||||
@ -143,7 +146,7 @@ pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let content: ArrayOrSingleObject =
|
let content: ArrayOrSingleObject =
|
||||||
serde_json::from_reader(reader).map_err(Error::Json).map_err(|e| (PayloadType::Json, e))?;
|
serde_json::from_reader(file).map_err(Error::Json).map_err(|e| (payload_type, e))?;
|
||||||
|
|
||||||
for object in content.inner.map_right(|o| vec![o]).into_inner() {
|
for object in content.inner.map_right(|o| vec![o]).into_inner() {
|
||||||
builder
|
builder
|
||||||
@ -151,9 +154,52 @@ pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
|||||||
.map_err(Into::into)
|
.map_err(Into::into)
|
||||||
.map_err(DocumentFormatError::Internal)?;
|
.map_err(DocumentFormatError::Internal)?;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let count = builder.documents_count();
|
let count = builder.documents_count();
|
||||||
let _ = builder.into_inner().map_err(Into::into).map_err(DocumentFormatError::Internal)?;
|
let _ = builder.into_inner().map_err(Into::into).map_err(DocumentFormatError::Internal)?;
|
||||||
|
|
||||||
Ok(count as usize)
|
Ok(count as usize)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* https://serde.rs/stream-array.html
|
||||||
|
* https://github.com/serde-rs/json/issues/160
|
||||||
|
*/
|
||||||
|
fn array_each<'de, D, T, F>(deserializer: D, f: F) -> std::result::Result<io::Result<u64>, D::Error>
|
||||||
|
where
|
||||||
|
D: Deserializer<'de>,
|
||||||
|
T: Deserialize<'de>,
|
||||||
|
F: FnMut(T) -> io::Result<()>,
|
||||||
|
{
|
||||||
|
struct SeqVisitor<T, F>(F, PhantomData<T>);
|
||||||
|
|
||||||
|
impl<'de, T, F> Visitor<'de> for SeqVisitor<T, F>
|
||||||
|
where
|
||||||
|
T: Deserialize<'de>,
|
||||||
|
F: FnMut(T) -> io::Result<()>,
|
||||||
|
{
|
||||||
|
type Value = io::Result<u64>;
|
||||||
|
|
||||||
|
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
formatter.write_str("a nonempty sequence")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn visit_seq<A>(mut self, mut seq: A) -> std::result::Result<io::Result<u64>, <A as SeqAccess<'de>>::Error>
|
||||||
|
where
|
||||||
|
A: SeqAccess<'de>,
|
||||||
|
{
|
||||||
|
let mut max: u64 = 0;
|
||||||
|
while let Some(value) = seq.next_element::<T>()? {
|
||||||
|
match self.0(value) {
|
||||||
|
Ok(()) => max = max + 1,
|
||||||
|
Err(e) => return Ok(Err(e)),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
Ok(Ok(max))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let visitor = SeqVisitor(f, PhantomData);
|
||||||
|
deserializer.deserialize_seq(visitor)
|
||||||
|
}
|
@ -164,6 +164,7 @@ pub enum Code {
|
|||||||
MissingContentType,
|
MissingContentType,
|
||||||
MalformedPayload,
|
MalformedPayload,
|
||||||
MissingPayload,
|
MissingPayload,
|
||||||
|
ReceivePayloadErr,
|
||||||
|
|
||||||
ApiKeyNotFound,
|
ApiKeyNotFound,
|
||||||
MissingParameter,
|
MissingParameter,
|
||||||
@ -303,6 +304,9 @@ impl Code {
|
|||||||
DuplicateIndexFound => {
|
DuplicateIndexFound => {
|
||||||
ErrCode::invalid("duplicate_index_found", StatusCode::BAD_REQUEST)
|
ErrCode::invalid("duplicate_index_found", StatusCode::BAD_REQUEST)
|
||||||
}
|
}
|
||||||
|
ReceivePayloadErr => {
|
||||||
|
ErrCode::internal("receive_payload_internal_exceptions", StatusCode::INTERNAL_SERVER_ERROR)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user