Merge #2523
2523: Improve the tasks error reporting when processed in batches r=irevoire a=Kerollmops

This fixes #2478 by changing the behavior of the task handler when an error occurs in a batch of document additions or updates. When a task in a batch hits a user error, that task is now reported as failed with the right error message, and the remaining tasks in the batch keep being processed. A user error is, for example, an invalid geo field, or an invalid or missing document id.

fixes #2582, #2478

Co-authored-by: Kerollmops <clement@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
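Conceptually, the change replaces "one result for the whole batch" with one result per task. The sketch below is only a rough illustration of that pattern, not the actual meilisearch-lib code: the TaskOutcome enum, the process_batch helper, and the string-based errors are invented for the example.

    // Minimal sketch of per-task error reporting in a batch (illustrative only).
    #[derive(Debug)]
    enum TaskOutcome {
        Succeeded { indexed_documents: u64 },
        Failed { error: String },
    }

    fn process_batch(payloads: &[Result<u64, String>]) -> Vec<TaskOutcome> {
        let mut outcomes = Vec::with_capacity(payloads.len());
        for payload in payloads {
            // A user error (invalid geo field, bad document id, ...) only fails
            // its own task; the loop keeps going for the remaining tasks.
            let outcome = match payload {
                Ok(count) => TaskOutcome::Succeeded { indexed_documents: *count },
                Err(e) => TaskOutcome::Failed { error: e.clone() },
            };
            outcomes.push(outcome);
        }
        // In the real diff below, the batch is committed only if at least one
        // task succeeded (`results.iter().any(Result::is_ok)`).
        outcomes
    }

    fn main() {
        let batch = vec![Ok(30), Err("invalid geo field".to_string()), Ok(30)];
        for outcome in process_batch(&batch) {
            println!("{:?}", outcome);
        }
    }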
Commit b5f91b91c3

Cargo.lock (generated): 37 changed lines
@@ -644,9 +644,9 @@ dependencies = [
 
 [[package]]
 name = "charabia"
-version = "0.5.0"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a26a3df4d9c9231eb1e757fe6b1c66c471e0c2cd5410265e7c3109a726663c4"
+checksum = "2ed19edcd98f5bf6572f48d6f5982d595cb8718e47c6f0066d942b280575ff02"
 dependencies = [
  "character_converter",
  "cow-utils",
@@ -1123,8 +1123,8 @@ dependencies = [
 
 [[package]]
 name = "filter-parser"
-version = "0.31.2"
-source = "git+https://github.com/meilisearch/milli.git?tag=v0.31.2#132558bf6a4e434de2a48314c4a208dea295a992"
+version = "0.32.0"
+source = "git+https://github.com/meilisearch/milli.git?tag=v0.32.0#e1bc610d2722a8010216c45d5a32cbe3db18468e"
 dependencies = [
  "nom",
  "nom_locate",
@@ -1148,8 +1148,8 @@ dependencies = [
 
 [[package]]
 name = "flatten-serde-json"
-version = "0.31.2"
-source = "git+https://github.com/meilisearch/milli.git?tag=v0.31.2#132558bf6a4e434de2a48314c4a208dea295a992"
+version = "0.32.0"
+source = "git+https://github.com/meilisearch/milli.git?tag=v0.32.0#e1bc610d2722a8010216c45d5a32cbe3db18468e"
 dependencies = [
  "serde_json",
 ]
@@ -1661,8 +1661,8 @@ dependencies = [
 
 [[package]]
 name = "json-depth-checker"
-version = "0.31.2"
-source = "git+https://github.com/meilisearch/milli.git?tag=v0.31.2#132558bf6a4e434de2a48314c4a208dea295a992"
+version = "0.32.0"
+source = "git+https://github.com/meilisearch/milli.git?tag=v0.32.0#e1bc610d2722a8010216c45d5a32cbe3db18468e"
 dependencies = [
  "serde_json",
 ]
@@ -2013,7 +2013,7 @@ dependencies = [
  "sha2",
  "thiserror",
  "time 0.3.9",
- "uuid 1.1.2",
+ "uuid",
 ]
 
 [[package]]
@@ -2082,7 +2082,7 @@ dependencies = [
  "tokio",
  "tokio-stream",
  "urlencoding",
- "uuid 1.1.2",
+ "uuid",
  "vergen",
  "walkdir",
  "yaup",
@@ -2147,7 +2147,7 @@ dependencies = [
  "thiserror",
  "time 0.3.9",
  "tokio",
- "uuid 1.1.2",
+ "uuid",
  "walkdir",
  "whoami",
 ]
@@ -2189,8 +2189,8 @@ dependencies = [
 
 [[package]]
 name = "milli"
-version = "0.31.2"
-source = "git+https://github.com/meilisearch/milli.git?tag=v0.31.2#132558bf6a4e434de2a48314c4a208dea295a992"
+version = "0.32.0"
+source = "git+https://github.com/meilisearch/milli.git?tag=v0.32.0#e1bc610d2722a8010216c45d5a32cbe3db18468e"
 dependencies = [
  "bimap",
  "bincode",
@@ -2229,7 +2229,7 @@ dependencies = [
  "tempfile",
  "thiserror",
  "time 0.3.9",
- "uuid 0.8.2",
+ "uuid",
 ]
 
 [[package]]
@@ -3671,15 +3671,6 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1"
 
-[[package]]
-name = "uuid"
-version = "0.8.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
-dependencies = [
- "getrandom",
-]
-
 [[package]]
 name = "uuid"
 version = "1.1.2"
@@ -7,7 +7,7 @@ edition = "2021"
 enum-iterator = "0.7.0"
 hmac = "0.12.1"
 meilisearch-types = { path = "../meilisearch-types" }
-milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.31.2" }
+milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.32.0" }
 rand = "0.8.4"
 serde = { version = "1.0.136", features = ["derive"] }
 serde_json = { version = "1.0.79", features = ["preserve_order"] }
@@ -1,17 +0,0 @@
-use meilisearch_lib::heed::Env;
-use walkdir::WalkDir;
-
-pub trait EnvSizer {
-    fn size(&self) -> u64;
-}
-
-impl EnvSizer for Env {
-    fn size(&self) -> u64 {
-        WalkDir::new(self.path())
-            .into_iter()
-            .filter_map(|entry| entry.ok())
-            .filter_map(|entry| entry.metadata().ok())
-            .filter(|metadata| metadata.is_file())
-            .fold(0, |acc, m| acc + m.len())
-    }
-}
@@ -1,3 +0,0 @@
-mod env;
-
-pub use env::EnvSizer;
@@ -5,7 +5,6 @@ pub mod analytics;
 pub mod task;
 #[macro_use]
 pub mod extractors;
-pub mod helpers;
 pub mod option;
 pub mod routes;
 
@@ -30,9 +29,9 @@ pub static AUTOBATCHING_ENABLED: AtomicBool = AtomicBool::new(false);
 pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<MeiliSearch> {
     let mut meilisearch = MeiliSearch::builder();
 
-    // enable autobatching?
+    // disable autobatching?
     AUTOBATCHING_ENABLED.store(
-        opt.scheduler_options.enable_auto_batching,
+        !opt.scheduler_options.disable_auto_batching,
         std::sync::atomic::Ordering::Relaxed,
     );
 
@@ -231,7 +231,7 @@ pub struct TaskView {
     #[serde(serialize_with = "time::serde::rfc3339::option::serialize")]
     finished_at: Option<OffsetDateTime>,
     #[serde(skip_serializing_if = "Option::is_none")]
-    batch_uid: Option<Option<BatchId>>,
+    batch_uid: Option<BatchId>,
 }
 
 impl From<Task> for TaskView {
@@ -380,15 +380,15 @@ impl From<Task> for TaskView {
 
         let duration = finished_at.zip(started_at).map(|(tf, ts)| (tf - ts));
 
-        let batch_uid = if AUTOBATCHING_ENABLED.load(std::sync::atomic::Ordering::Relaxed) {
-            let id = events.iter().find_map(|e| match e {
-                TaskEvent::Batched { batch_id, .. } => Some(*batch_id),
-                _ => None,
-            });
-            Some(id)
-        } else {
-            None
-        };
+        let batch_uid = AUTOBATCHING_ENABLED
+            .load(std::sync::atomic::Ordering::Relaxed)
+            .then(|| {
+                events.iter().find_map(|e| match e {
+                    TaskEvent::Batched { batch_id, .. } => Some(*batch_id),
+                    _ => None,
+                })
+            })
+            .flatten();
 
         Self {
             uid: id,
@@ -1,5 +1,6 @@
 use crate::common::{GetAllDocumentsOptions, Server};
 use actix_web::test;
+
 use meilisearch_http::{analytics, create_app};
 use serde_json::{json, Value};
 use time::{format_description::well_known::Rfc3339, OffsetDateTime};
@@ -326,7 +327,7 @@ async fn error_add_malformed_json_documents() {
     assert_eq!(
         response["message"],
         json!(
-            r#"The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string "0123456789012345678901234567...890123456789", expected a documents, or a sequence of documents. at line 1 column 102`."#
+            r#"The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string "0123456789012345678901234567...890123456789012345678901234567890123456789", expected a sequence at line 1 column 102`."#
         )
     );
     assert_eq!(response["code"], json!("malformed_payload"));
@@ -349,9 +350,7 @@ async fn error_add_malformed_json_documents() {
     assert_eq!(status_code, 400);
     assert_eq!(
         response["message"],
-        json!(
-            r#"The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string "0123456789012345678901234567...90123456789m", expected a documents, or a sequence of documents. at line 1 column 103`."#
-        )
+        json!("The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string \"0123456789012345678901234567...90123456789012345678901234567890123456789m\", expected a sequence at line 1 column 103`.")
    );
     assert_eq!(response["code"], json!("malformed_payload"));
     assert_eq!(response["type"], json!("invalid_request"));
@@ -388,7 +387,7 @@ async fn error_add_malformed_ndjson_documents() {
     assert_eq!(
         response["message"],
         json!(
-            r#"The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 1 column 2`."#
+            r#"The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 2 column 2`."#
        )
    );
     assert_eq!(response["code"], json!("malformed_payload"));
@@ -411,9 +410,7 @@ async fn error_add_malformed_ndjson_documents() {
     assert_eq!(status_code, 400);
     assert_eq!(
         response["message"],
-        json!(
-            r#"The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 1 column 2`."#
-        )
+        json!("The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 2 column 2`.")
     );
     assert_eq!(response["code"], json!("malformed_payload"));
     assert_eq!(response["type"], json!("invalid_request"));
@@ -1020,7 +1017,7 @@ async fn add_documents_invalid_geo_field() {
     index.wait_task(2).await;
     let (response, code) = index.get_task(2).await;
     assert_eq!(code, 200);
-    assert_eq!(response["status"], "succeeded");
+    assert_eq!(response["status"], "failed");
 }
 
 #[actix_rt::test]
@@ -1099,3 +1096,62 @@ async fn add_documents_with_primary_key_twice() {
     let (response, _code) = index.get_task(1).await;
     assert_eq!(response["status"], "succeeded");
 }
+
+#[actix_rt::test]
+async fn batch_several_documents_addition() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let mut documents: Vec<_> = (0..150usize)
+        .into_iter()
+        .map(|id| {
+            json!(
+                {
+                    "id": id,
+                    "title": "foo",
+                    "desc": "bar"
+                }
+            )
+        })
+        .collect();
+
+    documents[100] = json!({"title": "error", "desc": "error"});
+
+    // enqueue batch of documents
+    let mut waiter = Vec::new();
+    for chunk in documents.chunks(30) {
+        waiter.push(index.add_documents(json!(chunk), Some("id")));
+    }
+
+    // wait first batch of documents to finish
+    futures::future::join_all(waiter).await;
+    index.wait_task(4).await;
+
+    // run a second completely failing batch
+    documents[40] = json!({"title": "error", "desc": "error"});
+    documents[70] = json!({"title": "error", "desc": "error"});
+    documents[130] = json!({"title": "error", "desc": "error"});
+    let mut waiter = Vec::new();
+    for chunk in documents.chunks(30) {
+        waiter.push(index.add_documents(json!(chunk), Some("id")));
+    }
+    // wait second batch of documents to finish
+    futures::future::join_all(waiter).await;
+    index.wait_task(9).await;
+
+    let (response, _code) = index.filtered_tasks(&[], &["failed"]).await;
+
+    // Check if only the 6th task failed
+    println!("{}", &response);
+    assert_eq!(response["results"].as_array().unwrap().len(), 5);
+
+    // Check if there are exactly 120 documents (150 - 30) in the index;
+    let (response, code) = index
+        .get_all_documents(GetAllDocumentsOptions {
+            limit: Some(200),
+            ..Default::default()
+        })
+        .await;
+    assert_eq!(code, 200, "failed with `{}`", response);
+    assert_eq!(response["results"].as_array().unwrap().len(), 120);
+}
@@ -708,9 +708,7 @@ async fn faceting_max_values_per_facet() {
             }),
             |response, code| {
                 assert_eq!(code, 200, "{}", response);
-                let numbers = dbg!(&response)["facetDistribution"]["number"]
-                    .as_object()
-                    .unwrap();
+                let numbers = &response["facetDistribution"]["number"].as_object().unwrap();
                 assert_eq!(numbers.len(), 10_000);
             },
         )
@@ -28,7 +28,7 @@ lazy_static = "1.4.0"
 log = "0.4.14"
 meilisearch-auth = { path = "../meilisearch-auth" }
 meilisearch-types = { path = "../meilisearch-types" }
-milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.31.2" }
+milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.32.0" }
 mime = "0.3.16"
 num_cpus = "1.13.1"
 obkv = "0.2.0"
@@ -1,10 +1,10 @@
 use std::borrow::Borrow;
 use std::fmt::{self, Debug, Display};
-use std::io::{self, BufRead, BufReader, BufWriter, Cursor, Read, Seek, Write};
+use std::io::{self, BufReader, Read, Seek, Write};
 
 use meilisearch_types::error::{Code, ErrorCode};
 use meilisearch_types::internal_error;
-use milli::documents::DocumentBatchBuilder;
+use milli::documents::{DocumentsBatchBuilder, Error};
 
 type Result<T> = std::result::Result<T, DocumentFormatError>;
 
@@ -18,9 +18,9 @@ pub enum PayloadType {
 impl fmt::Display for PayloadType {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            PayloadType::Ndjson => write!(f, "ndjson"),
-            PayloadType::Json => write!(f, "json"),
-            PayloadType::Csv => write!(f, "csv"),
+            PayloadType::Ndjson => f.write_str("ndjson"),
+            PayloadType::Json => f.write_str("json"),
+            PayloadType::Csv => f.write_str("csv"),
         }
     }
 }
@@ -28,7 +28,7 @@ impl fmt::Display for PayloadType {
 #[derive(Debug)]
 pub enum DocumentFormatError {
     Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
-    MalformedPayload(Box<milli::documents::Error>, PayloadType),
+    MalformedPayload(Error, PayloadType),
 }
 
 impl Display for DocumentFormatError {
@@ -36,7 +36,7 @@ impl Display for DocumentFormatError {
         match self {
             Self::Internal(e) => write!(f, "An internal error has occurred: `{}`.", e),
             Self::MalformedPayload(me, b) => match me.borrow() {
-                milli::documents::Error::JsonError(se) => {
+                Error::Json(se) => {
                     // https://github.com/meilisearch/meilisearch/issues/2107
                     // The user input maybe insanely long. We need to truncate it.
                     let mut serde_msg = se.to_string();
@@ -59,11 +59,11 @@ impl Display for DocumentFormatError {
 
 impl std::error::Error for DocumentFormatError {}
 
-impl From<(PayloadType, milli::documents::Error)> for DocumentFormatError {
-    fn from((ty, error): (PayloadType, milli::documents::Error)) -> Self {
+impl From<(PayloadType, Error)> for DocumentFormatError {
+    fn from((ty, error): (PayloadType, Error)) -> Self {
         match error {
-            milli::documents::Error::Io(e) => Self::Internal(Box::new(e)),
-            e => Self::MalformedPayload(Box::new(e), ty),
+            Error::Io(e) => Self::Internal(Box::new(e)),
+            e => Self::MalformedPayload(e, ty),
         }
     }
 }
@@ -79,51 +79,67 @@ impl ErrorCode for DocumentFormatError {
 
 internal_error!(DocumentFormatError: io::Error);
 
-/// reads csv from input and write an obkv batch to writer.
+/// Reads CSV from input and write an obkv batch to writer.
 pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
-    let writer = BufWriter::new(writer);
-    let builder =
-        DocumentBatchBuilder::from_csv(input, writer).map_err(|e| (PayloadType::Csv, e))?;
-
-    let count = builder.finish().map_err(|e| (PayloadType::Csv, e))?;
-
-    Ok(count)
+    let mut builder = DocumentsBatchBuilder::new(writer);
+
+    let csv = csv::Reader::from_reader(input);
+    builder.append_csv(csv).map_err(|e| (PayloadType::Csv, e))?;
+
+    let count = builder.documents_count();
+    let _ = builder
+        .into_inner()
+        .map_err(Into::into)
+        .map_err(DocumentFormatError::Internal)?;
+
+    Ok(count as usize)
 }
 
-/// reads jsonl from input and write an obkv batch to writer.
+/// Reads JSON Lines from input and write an obkv batch to writer.
 pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
-    let mut reader = BufReader::new(input);
-    let writer = BufWriter::new(writer);
-
-    let mut builder = DocumentBatchBuilder::new(writer).map_err(|e| (PayloadType::Ndjson, e))?;
-    let mut buf = String::new();
-
-    while reader.read_line(&mut buf)? > 0 {
-        // skip empty lines
-        if buf == "\n" {
-            buf.clear();
-            continue;
-        }
-        builder
-            .extend_from_json(Cursor::new(&buf.as_bytes()))
-            .map_err(|e| (PayloadType::Ndjson, e))?;
-        buf.clear();
-    }
-
-    let count = builder.finish().map_err(|e| (PayloadType::Ndjson, e))?;
-
-    Ok(count)
-}
-
-/// reads json from input and write an obkv batch to writer.
-pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
-    let writer = BufWriter::new(writer);
-    let mut builder = DocumentBatchBuilder::new(writer).map_err(|e| (PayloadType::Json, e))?;
-    builder
-        .extend_from_json(input)
-        .map_err(|e| (PayloadType::Json, e))?;
-
-    let count = builder.finish().map_err(|e| (PayloadType::Json, e))?;
-
-    Ok(count)
+    let mut builder = DocumentsBatchBuilder::new(writer);
+    let reader = BufReader::new(input);
+
+    for result in serde_json::Deserializer::from_reader(reader).into_iter() {
+        let object = result
+            .map_err(Error::Json)
+            .map_err(|e| (PayloadType::Ndjson, e))?;
+        builder
+            .append_json_object(&object)
+            .map_err(Into::into)
+            .map_err(DocumentFormatError::Internal)?;
+    }
+
+    let count = builder.documents_count();
+    let _ = builder
+        .into_inner()
+        .map_err(Into::into)
+        .map_err(DocumentFormatError::Internal)?;
+
+    Ok(count as usize)
+}
+
+/// Reads JSON from input and write an obkv batch to writer.
+pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
+    let mut builder = DocumentsBatchBuilder::new(writer);
+    let reader = BufReader::new(input);
+
+    let objects: Vec<_> = serde_json::from_reader(reader)
+        .map_err(Error::Json)
+        .map_err(|e| (PayloadType::Json, e))?;
+
+    for object in objects {
+        builder
+            .append_json_object(&object)
+            .map_err(Into::into)
+            .map_err(DocumentFormatError::Internal)?;
+    }
+
+    let count = builder.documents_count();
+    let _ = builder
+        .into_inner()
+        .map_err(Into::into)
+        .map_err(DocumentFormatError::Internal)?;
+
+    Ok(count as usize)
 }
@@ -11,7 +11,7 @@ pub enum DumpError {
     #[error("An internal error has occurred. `{0}`.")]
     Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
     #[error("{0}")]
-    IndexResolver(#[from] IndexResolverError),
+    IndexResolver(Box<IndexResolverError>),
 }
 
 internal_error!(
@@ -26,6 +26,12 @@ internal_error!(
     TaskError
 );
 
+impl From<IndexResolverError> for DumpError {
+    fn from(e: IndexResolverError) -> Self {
+        Self::IndexResolver(Box::new(e))
+    }
+}
+
 impl ErrorCode for DumpError {
     fn error_code(&self) -> Code {
         match self {
@@ -25,6 +25,7 @@ impl ErrorCode for MilliError<'_> {
                 // TODO: wait for spec for new error codes.
                 UserError::SerdeJson(_)
                 | UserError::DocumentLimitReached
+                | UserError::AccessingSoftDeletedDocument { .. }
                 | UserError::UnknownInternalDocumentId { .. } => Code::Internal,
                 UserError::InvalidStoreFile => Code::InvalidStore,
                 UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
@@ -32,7 +33,9 @@ impl ErrorCode for MilliError<'_> {
                 UserError::AttributeLimitReached => Code::MaxFieldsLimitExceeded,
                 UserError::InvalidFilter(_) => Code::Filter,
                 UserError::MissingDocumentId { .. } => Code::MissingDocumentId,
-                UserError::InvalidDocumentId { .. } => Code::InvalidDocumentId,
+                UserError::InvalidDocumentId { .. } | UserError::TooManyDocumentIds { .. } => {
+                    Code::InvalidDocumentId
+                }
                 UserError::MissingPrimaryKey => Code::MissingPrimaryKey,
                 UserError::PrimaryKeyCannotBeChanged(_) => Code::PrimaryKeyAlreadyPresent,
                 UserError::SortRankingRuleMissing => Code::Sort,
@@ -4,7 +4,7 @@ use std::path::Path;
 
 use anyhow::Context;
 use indexmap::IndexMap;
-use milli::documents::DocumentBatchReader;
+use milli::documents::DocumentsBatchReader;
 use milli::heed::{EnvOpenOptions, RoTxn};
 use milli::update::{IndexDocumentsConfig, IndexerConfig};
 use serde::{Deserialize, Serialize};
@@ -135,19 +135,20 @@ impl Index {
         if !empty {
             tmp_doc_file.seek(SeekFrom::Start(0))?;
 
-            let documents_reader = DocumentBatchReader::from_reader(tmp_doc_file)?;
+            let documents_reader = DocumentsBatchReader::from_reader(tmp_doc_file)?;
 
             //If the document file is empty, we don't perform the document addition, to prevent
             //a primary key error to be thrown.
             let config = IndexDocumentsConfig::default();
-            let mut builder = milli::update::IndexDocuments::new(
+            let builder = milli::update::IndexDocuments::new(
                 &mut txn,
                 &index,
                 indexer_config,
                 config,
                 |_| (),
            )?;
-            builder.add_documents(documents_reader)?;
+            let (builder, user_error) = builder.add_documents(documents_reader)?;
+            user_error?;
             builder.execute()?;
        }
 
@@ -40,6 +40,12 @@ impl ErrorCode for IndexError {
     }
 }
 
+impl From<milli::UserError> for IndexError {
+    fn from(error: milli::UserError) -> IndexError {
+        IndexError::Milli(error.into())
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub enum FacetError {
     #[error("Invalid syntax for the filter parameter: `expected {}, found: {1}`.", .0.join(", "))]
@@ -4,7 +4,6 @@ use std::marker::PhantomData;
 use std::ops::Deref;
 use std::path::Path;
 use std::sync::Arc;
-use walkdir::WalkDir;
 
 use fst::IntoStreamer;
 use milli::heed::{CompactionOption, EnvOpenOptions, RoTxn};
@@ -14,6 +13,7 @@ use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value};
 use time::OffsetDateTime;
 use uuid::Uuid;
+use walkdir::WalkDir;
 
 use crate::index::search::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
 
@@ -245,11 +245,8 @@ impl Index {
         let fields_ids_map = self.fields_ids_map(&txn)?;
         let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
 
-        let iter = self.all_documents(&txn)?.skip(offset).take(limit);
-
         let mut documents = Vec::new();
-        for entry in iter {
+        for entry in self.all_documents(&txn)?.skip(offset).take(limit) {
             let (_id, obkv) = entry?;
             let document = obkv_to_json(&all_fields, &fields_ids_map, obkv)?;
             let document = match &attributes_to_retrieve {
@@ -302,7 +299,7 @@ impl Index {
     }
 
     pub fn size(&self) -> u64 {
-        WalkDir::new(self.inner.path())
+        WalkDir::new(self.path())
             .into_iter()
             .filter_map(|entry| entry.ok())
             .filter_map(|entry| entry.metadata().ok())
@@ -24,12 +24,12 @@ pub use test::MockIndex as Index;
 /// code for unit testing, in places where an index would normally be used.
 #[cfg(test)]
 pub mod test {
-    use std::path::Path;
-    use std::path::PathBuf;
+    use std::path::{Path, PathBuf};
     use std::sync::Arc;
 
-    use milli::update::IndexerConfig;
-    use milli::update::{DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod};
+    use milli::update::{
+        DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod, IndexerConfig,
+    };
     use nelson::Mocker;
     use uuid::Uuid;
 
@@ -162,7 +162,7 @@ pub mod test {
         primary_key: Option<String>,
         file_store: UpdateFileStore,
         contents: impl Iterator<Item = Uuid>,
-    ) -> Result<DocumentAdditionResult> {
+    ) -> Result<Vec<Result<DocumentAdditionResult>>> {
         match self {
             MockIndex::Real(index) => {
                 index.update_documents(method, primary_key, file_store, contents)
@@ -3,7 +3,7 @@ use std::marker::PhantomData;
 use std::num::NonZeroUsize;
 
 use log::{debug, info, trace};
-use milli::documents::DocumentBatchReader;
+use milli::documents::DocumentsBatchReader;
 use milli::update::{
     DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
     Setting,
@@ -11,7 +11,7 @@ use milli::update::{
 use serde::{Deserialize, Serialize, Serializer};
 use uuid::Uuid;
 
-use super::error::Result;
+use super::error::{IndexError, Result};
 use super::index::{Index, IndexMeta};
 use crate::update_file_store::UpdateFileStore;
 
@@ -299,7 +299,7 @@ impl Index {
         primary_key: Option<String>,
         file_store: UpdateFileStore,
         contents: impl IntoIterator<Item = Uuid>,
-    ) -> Result<DocumentAdditionResult> {
+    ) -> Result<Vec<Result<DocumentAdditionResult>>> {
         trace!("performing document addition");
         let mut txn = self.write_txn()?;
 
@@ -323,19 +323,34 @@ impl Index {
             indexing_callback,
         )?;
 
+        let mut results = Vec::new();
         for content_uuid in contents.into_iter() {
             let content_file = file_store.get_update(content_uuid)?;
-            let reader = DocumentBatchReader::from_reader(content_file)?;
-            builder.add_documents(reader)?;
+            let reader = DocumentsBatchReader::from_reader(content_file)?;
+            let (new_builder, user_result) = builder.add_documents(reader)?;
+            builder = new_builder;
+
+            let user_result = match user_result {
+                Ok(count) => {
+                    let addition = DocumentAdditionResult {
+                        indexed_documents: count,
+                        number_of_documents: count,
+                    };
+                    info!("document addition done: {:?}", addition);
+                    Ok(addition)
+                }
+                Err(e) => Err(IndexError::from(e)),
+            };
+
+            results.push(user_result);
         }
 
-        let addition = builder.execute()?;
-        txn.commit()?;
-
-        info!("document addition done: {:?}", addition);
-
-        Ok(addition)
+        if results.iter().any(Result::is_ok) {
+            let _addition = builder.execute()?;
+            txn.commit()?;
+        }
+
+        Ok(results)
     }
 
     pub fn update_settings(&self, settings: &Settings<Checked>) -> Result<()> {
@@ -150,27 +150,36 @@ mod real {
                     })
                     .await;
 
-                let event = match result {
-                    Ok(Ok(result)) => TaskEvent::Succeeded {
-                        timestamp: OffsetDateTime::now_utc(),
-                        result: TaskResult::DocumentAddition {
-                            indexed_documents: result.indexed_documents,
-                        },
-                    },
-                    Ok(Err(e)) => TaskEvent::Failed {
-                        timestamp: OffsetDateTime::now_utc(),
-                        error: e.into(),
-                    },
-                    Err(e) => TaskEvent::Failed {
-                        timestamp: OffsetDateTime::now_utc(),
-                        error: IndexResolverError::from(e).into(),
-                    },
-                };
-
-                for task in tasks.iter_mut() {
-                    task.events.push(event.clone());
-                }
-            }
+                match result {
+                    Ok(Ok(results)) => {
+                        for (task, result) in tasks.iter_mut().zip(results) {
+                            let event = match result {
+                                Ok(addition) => {
+                                    TaskEvent::succeeded(TaskResult::DocumentAddition {
+                                        indexed_documents: addition.indexed_documents,
+                                    })
+                                }
+                                Err(error) => {
+                                    TaskEvent::failed(IndexResolverError::from(error))
+                                }
+                            };
+                            task.events.push(event);
+                        }
+                    }
+                    Ok(Err(e)) => {
+                        let event = TaskEvent::failed(e);
+                        for task in tasks.iter_mut() {
+                            task.events.push(event.clone());
+                        }
+                    }
+                    Err(e) => {
+                        let event = TaskEvent::failed(IndexResolverError::from(e));
+                        for task in tasks.iter_mut() {
+                            task.events.push(event.clone());
+                        }
+                    }
+                }
+            }
             _ => panic!("invalid batch!"),
         }
     }
@@ -41,27 +41,10 @@ pub struct IndexerOpts {
 
 #[derive(Debug, Clone, Parser, Default, Serialize)]
 pub struct SchedulerConfig {
-    /// enable the autobatching experimental feature
-    #[clap(long, hide = true)]
-    pub enable_auto_batching: bool,
-
-    // The maximum number of updates of the same type that can be batched together.
-    // If unspecified, this is unlimited. A value of 0 is interpreted as 1.
-    #[clap(long, requires = "enable-auto-batching", hide = true)]
-    pub max_batch_size: Option<usize>,
-
-    // The maximum number of documents in a document batch. Since batches must contain at least one
-    // update for the scheduler to make progress, the number of documents in a batch will be at
-    // least the number of documents of its first update.
-    #[clap(long, requires = "enable-auto-batching", hide = true)]
-    pub max_documents_per_batch: Option<usize>,
-
-    /// Debounce duration in seconds
-    ///
-    /// When a new task is enqueued, the scheduler waits for `debounce_duration_sec` seconds for new updates before
-    /// starting to process a batch of updates.
-    #[clap(long, requires = "enable-auto-batching", hide = true)]
-    pub debounce_duration_sec: Option<u64>,
+    /// The engine will disable task auto-batching,
+    /// and will sequencialy compute each task one by one.
+    #[clap(long, env = "DISABLE_AUTO_BATCHING")]
+    pub disable_auto_batching: bool,
 }
 
 impl TryFrom<&IndexerOpts> for IndexerConfig {
@@ -3,7 +3,6 @@ use std::collections::{hash_map::Entry, BinaryHeap, HashMap, VecDeque};
 use std::ops::{Deref, DerefMut};
 use std::slice;
 use std::sync::Arc;
-use std::time::Duration;
 
 use atomic_refcell::AtomicRefCell;
 use milli::update::IndexDocumentsMethod;
@@ -248,17 +247,10 @@ impl Scheduler {
     pub fn new(
         store: TaskStore,
         performers: Vec<Arc<dyn BatchHandler + Sync + Send + 'static>>,
-        mut config: SchedulerConfig,
+        config: SchedulerConfig,
     ) -> Result<Arc<RwLock<Self>>> {
         let (notifier, rcv) = watch::channel(());
 
-        let debounce_time = config.debounce_duration_sec;
-
-        // Disable autobatching
-        if !config.enable_auto_batching {
-            config.max_batch_size = Some(1);
-        }
-
         let this = Self {
             snapshots: VecDeque::new(),
             tasks: TaskQueue::default(),
@@ -275,12 +267,7 @@ impl Scheduler {
 
         let this = Arc::new(RwLock::new(this));
 
-        let update_loop = UpdateLoop::new(
-            this.clone(),
-            performers,
-            debounce_time.filter(|&v| v > 0).map(Duration::from_secs),
-            rcv,
-        );
+        let update_loop = UpdateLoop::new(this.clone(), performers, rcv);
 
         tokio::task::spawn_local(update_loop.run());
 
@@ -497,27 +484,17 @@ fn make_batch(tasks: &mut TaskQueue, config: &SchedulerConfig) -> Processing {
         match list.peek() {
            Some(pending) if pending.kind == kind => {
                 // We always need to process at least one task for the scheduler to make progress.
-                if task_list.len() >= config.max_batch_size.unwrap_or(usize::MAX).max(1)
-                {
+                if config.disable_auto_batching && !task_list.is_empty() {
                     break;
                 }
                 let pending = list.pop().unwrap();
                 task_list.push(pending.id);
 
-                // We add the number of documents to the count if we are scheduling document additions and
-                // stop adding if we already have enough.
-                //
-                // We check that bound only after adding the current task to the batch, so that a batch contains at least one task.
+                // We add the number of documents to the count if we are scheduling document additions.
                 match pending.kind {
                     TaskType::DocumentUpdate { number }
                     | TaskType::DocumentAddition { number } => {
                         doc_count += number;
-
-                        if doc_count
-                            >= config.max_documents_per_batch.unwrap_or(usize::MAX)
-                        {
-                            break;
-                        }
                     }
                     _ => (),
                 }
@@ -1,9 +1,7 @@
 use std::sync::Arc;
-use std::time::Duration;
 
 use time::OffsetDateTime;
 use tokio::sync::{watch, RwLock};
-use tokio::time::interval_at;
 
 use super::batch::Batch;
 use super::error::Result;
@@ -17,20 +15,17 @@ pub struct UpdateLoop {
     performers: Vec<Arc<dyn BatchHandler + Send + Sync + 'static>>,
 
     notifier: Option<watch::Receiver<()>>,
-    debounce_duration: Option<Duration>,
 }
 
 impl UpdateLoop {
     pub fn new(
         scheduler: Arc<RwLock<Scheduler>>,
         performers: Vec<Arc<dyn BatchHandler + Send + Sync + 'static>>,
-        debuf_duration: Option<Duration>,
         notifier: watch::Receiver<()>,
     ) -> Self {
         Self {
            scheduler,
            performers,
-            debounce_duration: debuf_duration,
            notifier: Some(notifier),
        }
    }
@@ -43,11 +38,6 @@ impl UpdateLoop {
                 break;
             }
 
-            if let Some(t) = self.debounce_duration {
-                let mut interval = interval_at(tokio::time::Instant::now() + t, t);
-                interval.tick().await;
-            };
-
             if let Err(e) = self.process_next_batch().await {
                 log::error!("an error occurred while processing an update batch: {}", e);
             }
@@ -3,7 +3,7 @@ use std::io::{self, BufReader, BufWriter, Write};
 use std::ops::{Deref, DerefMut};
 use std::path::{Path, PathBuf};
 
-use milli::documents::DocumentBatchReader;
+use milli::documents::DocumentsBatchReader;
 use serde_json::Map;
 use tempfile::{NamedTempFile, PersistError};
 use uuid::Uuid;
@@ -44,7 +44,8 @@ into_update_store_error!(
     PersistError,
     io::Error,
     serde_json::Error,
-    milli::documents::Error
+    milli::documents::Error,
+    milli::documents::DocumentsBatchCursorError
 );
 
 impl UpdateFile {
@@ -149,12 +150,13 @@ mod store {
 
             let update_file = File::open(update_file_path)?;
             let mut dst_file = NamedTempFile::new_in(&dump_path)?;
-            let mut document_reader = DocumentBatchReader::from_reader(update_file)?;
+            let (mut document_cursor, index) =
+                DocumentsBatchReader::from_reader(update_file)?.into_cursor_and_fields_index();
 
             let mut document_buffer = Map::new();
             // TODO: we need to find a way to do this more efficiently. (create a custom serializer
            // for jsonl for example...)
-            while let Some((index, document)) = document_reader.next_document_with_index()? {
+            while let Some(document) = document_cursor.next_document()? {
                 for (field_id, content) in document.iter() {
                     if let Some(field_name) = index.name(field_id) {
                         let content = serde_json::from_slice(content)?;
@@ -49,7 +49,7 @@ fn contained_in(selector: &str, key: &str) -> bool {
 /// map_leaf_values(
 ///     value.as_object_mut().unwrap(),
 ///     ["jean.race.name"],
-///     |key, value| match (value, dbg!(key)) {
+///     |key, value| match (value, key) {
 ///         (Value::String(name), "jean.race.name") => *name = "patou".to_string(),
 ///         _ => unreachable!(),
 ///     },
@@ -729,7 +729,7 @@ mod tests {
         map_leaf_values(
             value.as_object_mut().unwrap(),
             ["jean.race.name"],
-            |key, value| match (value, dbg!(key)) {
+            |key, value| match (value, key) {
                 (Value::String(name), "jean.race.name") => *name = S("patou"),
                 _ => unreachable!(),
             },