mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-13 00:18:56 +01:00
Refactor addition of documents: save update file as NDJson
This commit is contained in:
parent
669a3ff85f
commit
5572f0c2c8
8
Cargo.lock
generated
8
Cargo.lock
generated
@ -1124,7 +1124,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "filter-parser"
|
||||
version = "0.29.3"
|
||||
source = "git+https://github.com/meilisearch/milli?branch=use-grenad-for-documents-batches#f51c30f50aaeaaa068a00018d4c5fad7b9f264a2"
|
||||
source = "git+https://github.com/meilisearch/milli?branch=add-documents-ndjson#cf1c5588ea95f78160ec83de2e95161652cc10e2"
|
||||
dependencies = [
|
||||
"nom",
|
||||
"nom_locate",
|
||||
@ -1149,7 +1149,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "flatten-serde-json"
|
||||
version = "0.29.3"
|
||||
source = "git+https://github.com/meilisearch/milli?branch=use-grenad-for-documents-batches#f51c30f50aaeaaa068a00018d4c5fad7b9f264a2"
|
||||
source = "git+https://github.com/meilisearch/milli?branch=add-documents-ndjson#cf1c5588ea95f78160ec83de2e95161652cc10e2"
|
||||
dependencies = [
|
||||
"serde_json",
|
||||
]
|
||||
@ -1662,7 +1662,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "json-depth-checker"
|
||||
version = "0.29.3"
|
||||
source = "git+https://github.com/meilisearch/milli?branch=use-grenad-for-documents-batches#f51c30f50aaeaaa068a00018d4c5fad7b9f264a2"
|
||||
source = "git+https://github.com/meilisearch/milli?branch=add-documents-ndjson#cf1c5588ea95f78160ec83de2e95161652cc10e2"
|
||||
dependencies = [
|
||||
"serde_json",
|
||||
]
|
||||
@ -2190,7 +2190,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "milli"
|
||||
version = "0.29.3"
|
||||
source = "git+https://github.com/meilisearch/milli?branch=use-grenad-for-documents-batches#f51c30f50aaeaaa068a00018d4c5fad7b9f264a2"
|
||||
source = "git+https://github.com/meilisearch/milli?branch=add-documents-ndjson#cf1c5588ea95f78160ec83de2e95161652cc10e2"
|
||||
dependencies = [
|
||||
"bimap",
|
||||
"bincode",
|
||||
|
@ -8,7 +8,7 @@ base64 = "0.13.0"
|
||||
enum-iterator = "0.7.0"
|
||||
hmac = "0.12.1"
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
milli = { git = "https://github.com/meilisearch/milli", branch = "use-grenad-for-documents-batches" }
|
||||
milli = { git = "https://github.com/meilisearch/milli", branch = 'add-documents-ndjson' }
|
||||
rand = "0.8.4"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_json = { version = "1.0.79", features = ["preserve_order"] }
|
||||
|
@ -83,17 +83,19 @@ pub fn configure_data(
|
||||
web::JsonConfig::default()
|
||||
.content_type(|mime| mime == mime::APPLICATION_JSON)
|
||||
.error_handler(|err, req: &HttpRequest| match err {
|
||||
JsonPayloadError::ContentType => match req.headers().get(CONTENT_TYPE) {
|
||||
Some(content_type) => MeilisearchHttpError::InvalidContentType(
|
||||
content_type.to_str().unwrap_or("unknown").to_string(),
|
||||
vec![mime::APPLICATION_JSON.to_string()],
|
||||
)
|
||||
.into(),
|
||||
None => MeilisearchHttpError::MissingContentType(vec![
|
||||
mime::APPLICATION_JSON.to_string(),
|
||||
])
|
||||
.into(),
|
||||
},
|
||||
JsonPayloadError::ContentType => {
|
||||
match req.headers().get(CONTENT_TYPE) {
|
||||
Some(content_type) => MeilisearchHttpError::InvalidContentType(
|
||||
content_type.to_str().unwrap_or("unknown").to_string(),
|
||||
vec![mime::APPLICATION_JSON.to_string()],
|
||||
)
|
||||
.into(),
|
||||
None => MeilisearchHttpError::MissingContentType(vec![
|
||||
mime::APPLICATION_JSON.to_string(),
|
||||
])
|
||||
.into(),
|
||||
}
|
||||
}
|
||||
err => PayloadError::from(err).into(),
|
||||
}),
|
||||
)
|
||||
|
@ -326,7 +326,7 @@ async fn error_add_malformed_json_documents() {
|
||||
assert_eq!(
|
||||
response["message"],
|
||||
json!(
|
||||
r#"The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string "0123456789012345678901234567...890123456789012345678901234567890123456789", expected a sequence at line 1 column 102`."#
|
||||
r#"The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string "0123456789012345678901234567...7890123456789", expected a document, or a sequence of documents. at line 1 column 102`."#
|
||||
)
|
||||
);
|
||||
assert_eq!(response["code"], json!("malformed_payload"));
|
||||
@ -349,7 +349,7 @@ async fn error_add_malformed_json_documents() {
|
||||
assert_eq!(status_code, 400);
|
||||
assert_eq!(
|
||||
response["message"],
|
||||
json!("The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string \"0123456789012345678901234567...90123456789012345678901234567890123456789m\", expected a sequence at line 1 column 103`.")
|
||||
json!("The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string \"0123456789012345678901234567...890123456789m\", expected a document, or a sequence of documents. at line 1 column 103`.")
|
||||
);
|
||||
assert_eq!(response["code"], json!("malformed_payload"));
|
||||
assert_eq!(response["type"], json!("invalid_request"));
|
||||
|
@ -30,7 +30,7 @@ lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
milli = { git = "https://github.com/meilisearch/milli", branch = "use-grenad-for-documents-batches" }
|
||||
milli = { git = "https://github.com/meilisearch/milli", branch = 'add-documents-ndjson' }
|
||||
mime = "0.3.16"
|
||||
num_cpus = "1.13.1"
|
||||
obkv = "0.2.0"
|
||||
|
@ -124,17 +124,10 @@ pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
|
||||
let mut builder = DocumentsBatchBuilder::new(writer);
|
||||
let reader = BufReader::new(input);
|
||||
|
||||
let objects: Vec<_> = serde_json::from_reader(reader)
|
||||
.map_err(Error::Json)
|
||||
builder
|
||||
.append_json(reader)
|
||||
.map_err(|e| (PayloadType::Json, e))?;
|
||||
|
||||
for object in objects {
|
||||
builder
|
||||
.append_json_object(&object)
|
||||
.map_err(Into::into)
|
||||
.map_err(DocumentFormatError::Internal)?;
|
||||
}
|
||||
|
||||
let count = builder.documents_count();
|
||||
let _ = builder
|
||||
.into_inner()
|
||||
|
@ -16,6 +16,7 @@ pub enum DumpError {
|
||||
|
||||
internal_error!(
|
||||
DumpError: milli::heed::Error,
|
||||
IndexResolverError,
|
||||
std::io::Error,
|
||||
tokio::task::JoinError,
|
||||
tokio::sync::oneshot::error::RecvError,
|
||||
|
@ -1,7 +1,7 @@
|
||||
use meilisearch_auth::SearchRules;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::io::Cursor;
|
||||
use std::io::{BufWriter, Cursor};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@ -392,6 +392,7 @@ where
|
||||
}
|
||||
let (content_uuid, mut update_file) = self.update_file_store.new_update()?;
|
||||
let documents_count = tokio::task::spawn_blocking(move || -> Result<_> {
|
||||
let writer = BufWriter::new(&mut *update_file);
|
||||
// check if the payload is empty, and return an error
|
||||
if buffer.is_empty() {
|
||||
return Err(IndexControllerError::MissingPayload(format));
|
||||
@ -399,9 +400,9 @@ where
|
||||
|
||||
let reader = Cursor::new(buffer);
|
||||
let count = match format {
|
||||
DocumentAdditionFormat::Json => read_json(reader, &mut *update_file)?,
|
||||
DocumentAdditionFormat::Csv => read_csv(reader, &mut *update_file)?,
|
||||
DocumentAdditionFormat::Ndjson => read_ndjson(reader, &mut *update_file)?,
|
||||
DocumentAdditionFormat::Json => read_json(reader, writer)?,
|
||||
DocumentAdditionFormat::Csv => read_csv(reader, writer)?,
|
||||
DocumentAdditionFormat::Ndjson => read_ndjson(reader, writer)?,
|
||||
};
|
||||
|
||||
update_file.persist()?;
|
||||
|
@ -4,7 +4,6 @@ use std::ops::{Deref, DerefMut};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use milli::documents::DocumentsBatchReader;
|
||||
use serde_json::Map;
|
||||
use tempfile::{NamedTempFile, PersistError};
|
||||
use uuid::Uuid;
|
||||
|
||||
@ -151,23 +150,13 @@ mod store {
|
||||
let update_file = File::open(update_file_path)?;
|
||||
let mut dst_file = NamedTempFile::new_in(&dump_path)?;
|
||||
let mut document_cursor = DocumentsBatchReader::from_reader(update_file)?.into_cursor();
|
||||
let index = document_cursor.documents_batch_index().clone();
|
||||
|
||||
let mut document_buffer = Map::new();
|
||||
// TODO: we need to find a way to do this more efficiently. (create a custom serializer
|
||||
// for jsonl for example...)
|
||||
let mut dst_file_buf_writer = BufWriter::new(&mut dst_file);
|
||||
while let Some(document) = document_cursor.next_document()? {
|
||||
for (field_id, content) in document.iter() {
|
||||
if let Some(field_name) = index.name(field_id) {
|
||||
let content = serde_json::from_slice(content)?;
|
||||
document_buffer.insert(field_name.to_string(), content);
|
||||
}
|
||||
}
|
||||
|
||||
serde_json::to_writer(&mut dst_file, &document_buffer)?;
|
||||
dst_file.write_all(b"\n")?;
|
||||
document_buffer.clear();
|
||||
serde_json::to_writer(&mut dst_file_buf_writer, &document)?;
|
||||
dst_file_buf_writer.write_all(b"\n")?;
|
||||
}
|
||||
drop(dst_file_buf_writer);
|
||||
|
||||
dst_file.persist(dst)?;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user