MeiliSearch/meilisearch/src/lib.rs

548 lines
21 KiB
Rust
Raw Normal View History

#![allow(rustdoc::private_intra_doc_links)]
2021-06-15 17:55:27 +02:00
#[macro_use]
2020-12-12 13:32:06 +01:00
pub mod error;
pub mod analytics;
2021-06-24 14:22:12 +02:00
#[macro_use]
2021-06-23 14:56:02 +02:00
pub mod extractors;
pub mod metrics;
pub mod middleware;
2021-03-15 18:11:10 +01:00
pub mod option;
pub mod routes;
pub mod search;
pub mod search_queue;
2022-10-20 18:00:07 +02:00
use std::fs::File;
use std::io::{BufReader, BufWriter};
use std::num::NonZeroUsize;
2022-10-20 18:00:07 +02:00
use std::path::Path;
2024-07-08 20:58:27 +02:00
use std::str::FromStr;
2022-10-20 18:00:07 +02:00
use std::sync::Arc;
use std::thread::{self, available_parallelism};
use std::time::Duration;
2021-09-28 18:10:09 +02:00
use actix_cors::Cors;
use actix_http::body::MessageBody;
2022-10-20 18:00:07 +02:00
use actix_web::dev::{ServiceFactory, ServiceResponse};
use actix_web::error::JsonPayloadError;
use actix_web::http::header::{CONTENT_TYPE, USER_AGENT};
2022-10-20 18:00:07 +02:00
use actix_web::web::Data;
use actix_web::{web, HttpRequest};
use analytics::Analytics;
2022-10-16 01:39:01 +02:00
use anyhow::bail;
use error::PayloadError;
2021-06-24 16:25:52 +02:00
use extractors::payload::PayloadConfig;
use index_scheduler::{IndexScheduler, IndexSchedulerOptions};
use meilisearch_auth::AuthController;
2022-10-20 18:00:07 +02:00
use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use meilisearch_types::milli::update::{IndexDocumentsConfig, IndexDocumentsMethod};
use meilisearch_types::settings::apply_settings_to_builder;
use meilisearch_types::tasks::KindWithContent;
2022-10-25 15:51:15 +02:00
use meilisearch_types::versioning::{check_version_file, create_version_file};
use meilisearch_types::{compression, milli, VERSION_FILE_NAME};
2022-10-20 18:00:07 +02:00
pub use option::Opt;
use option::ScheduleSnapshot;
use search_queue::SearchQueue;
2024-02-08 13:49:13 +01:00
use tracing::{error, info_span};
2024-01-30 16:31:42 +01:00
use tracing_subscriber::filter::Targets;
2022-10-20 18:00:07 +02:00
use crate::error::MeilisearchHttpError;
2021-09-20 15:31:03 +02:00
/// Default number of simultaneously opened indexes.
///
/// This value is used when dynamic computation of how many indexes can be opened at once was skipped (e.g., in tests).
///
/// Lower for Windows that dedicates a smaller virtual address space to processes.
2023-01-11 17:34:46 +01:00
///
/// The value was chosen this way:
///
/// - Windows provides a small virtual address space of about 10TiB to processes.
/// - The chosen value allows for indexes to use the default map size of 2TiB safely.
2023-01-11 17:34:46 +01:00
#[cfg(windows)]
const DEFAULT_INDEX_COUNT: usize = 4;
2023-01-11 17:34:46 +01:00
/// Default number of simultaneously opened indexes.
///
/// This value is used when dynamic computation of how many indexes can be opened at once was skipped (e.g., in tests).
///
2023-01-11 17:34:46 +01:00
/// The higher, the better for avoiding reopening indexes.
///
/// The value was chosen this way:
///
/// - Opening an index consumes a file descriptor.
/// - The default on many unices is about 256 file descriptors for a process.
/// - 100 is a little bit less than half this value.
/// - The chosen value allows for indexes to use the default map size of 2TiB safely.
2023-01-11 17:34:46 +01:00
#[cfg(not(windows))]
const DEFAULT_INDEX_COUNT: usize = 20;
2023-01-11 17:34:46 +01:00
2022-10-16 01:39:01 +02:00
/// Check if a db is empty. It does not provide any information on the
/// validity of the data in it.
/// We consider a database as non empty when it's a non empty directory.
fn is_empty_db(db_path: impl AsRef<Path>) -> bool {
let db_path = db_path.as_ref();
if !db_path.exists() {
true
// if we encounter an error or if the db is a file we consider the db non empty
} else if let Ok(dir) = db_path.read_dir() {
dir.count() == 0
} else {
true
}
}
2024-01-29 17:56:43 +01:00
/// The handle used to update the logs at runtime. Must be accessible from the `main.rs` and the `route/logs.rs`.
pub type LogRouteHandle =
2024-01-29 18:45:55 +01:00
tracing_subscriber::reload::Handle<LogRouteType, tracing_subscriber::Registry>;
2024-01-30 16:31:42 +01:00
2024-01-29 18:45:55 +01:00
pub type LogRouteType = tracing_subscriber::filter::Filtered<
Option<Box<dyn tracing_subscriber::Layer<tracing_subscriber::Registry> + Send + Sync>>,
2024-01-30 16:31:42 +01:00
Targets,
2024-01-29 18:45:55 +01:00
tracing_subscriber::Registry,
>;
2024-01-29 17:56:43 +01:00
pub type SubscriberForSecondLayer = tracing_subscriber::layer::Layered<
tracing_subscriber::reload::Layer<LogRouteType, tracing_subscriber::Registry>,
tracing_subscriber::Registry,
>;
pub type LogStderrHandle =
tracing_subscriber::reload::Handle<LogStderrType, SubscriberForSecondLayer>;
pub type LogStderrType = tracing_subscriber::filter::Filtered<
Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>,
Targets,
SubscriberForSecondLayer,
>;
pub fn create_app(
index_scheduler: Data<IndexScheduler>,
auth_controller: Data<AuthController>,
opt: Opt,
logs: (LogRouteHandle, LogStderrHandle),
analytics: Arc<dyn Analytics>,
enable_dashboard: bool,
) -> actix_web::App<
impl ServiceFactory<
actix_web::dev::ServiceRequest,
Config = (),
Response = ServiceResponse<impl MessageBody>,
Error = actix_web::Error,
InitError = (),
>,
> {
let app = actix_web::App::new()
.configure(|s| {
configure_data(
s,
index_scheduler.clone(),
auth_controller.clone(),
&opt,
2024-01-29 17:56:43 +01:00
logs,
analytics.clone(),
)
})
.configure(routes::configure)
.configure(|s| dashboard(s, enable_dashboard));
2022-10-22 16:35:42 +02:00
let app = app.wrap(middleware::RouteMetrics);
2022-10-22 16:35:42 +02:00
app.wrap(
Cors::default()
.send_wildcard()
.allow_any_header()
.allow_any_origin()
.allow_any_method()
.max_age(86_400), // 24h
)
2024-02-08 13:49:13 +01:00
.wrap(tracing_actix_web::TracingLogger::<AwebTracingLogger>::new())
.wrap(actix_web::middleware::Compress::default())
.wrap(actix_web::middleware::NormalizePath::new(actix_web::middleware::TrailingSlash::Trim))
}
2024-02-08 13:49:13 +01:00
struct AwebTracingLogger;
impl tracing_actix_web::RootSpanBuilder for AwebTracingLogger {
fn on_request_start(request: &actix_web::dev::ServiceRequest) -> tracing::Span {
use tracing::field::Empty;
let conn_info = request.connection_info();
let headers = request.headers();
let user_agent = headers
.get(USER_AGENT)
2024-02-08 13:49:13 +01:00
.map(|value| String::from_utf8_lossy(value.as_bytes()).into_owned())
.unwrap_or_default();
info_span!("HTTP request", method = %request.method(), host = conn_info.host(), route = %request.path(), query_parameters = %request.query_string(), %user_agent, status_code = Empty, error = Empty)
}
fn on_request_end<B: MessageBody>(
span: tracing::Span,
outcome: &Result<ServiceResponse<B>, actix_web::Error>,
) {
match &outcome {
Ok(response) => {
let code: i32 = response.response().status().as_u16().into();
span.record("status_code", code);
if let Some(error) = response.response().error() {
// use the status code already constructed for the outgoing HTTP response
span.record("error", &tracing::field::display(error.as_response_error()));
}
}
Err(error) => {
let code: i32 = error.error_response().status().as_u16().into();
span.record("status_code", code);
span.record("error", &tracing::field::display(error.as_response_error()));
}
};
}
}
enum OnFailure {
RemoveDb,
KeepDb,
}
2022-10-16 01:39:01 +02:00
pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc<IndexScheduler>, Arc<AuthController>)> {
2022-10-25 15:51:15 +02:00
let empty_db = is_empty_db(&opt.db_path);
let (index_scheduler, auth_controller) = if let Some(ref snapshot_path) = opt.import_snapshot {
let snapshot_path_exists = snapshot_path.exists();
// the db is empty and the snapshot exists, import it
if empty_db && snapshot_path_exists {
match compression::from_tar_gz(snapshot_path, &opt.db_path) {
2022-12-13 17:25:49 +01:00
Ok(()) => open_or_create_database_unchecked(opt, OnFailure::RemoveDb)?,
Err(e) => {
std::fs::remove_dir_all(&opt.db_path)?;
return Err(e);
}
}
// the db already exists and we should not ignore the snapshot => throw an error
} else if !empty_db && !opt.ignore_snapshot_if_db_exists {
bail!(
"database already exists at {:?}, try to delete it or rename it",
opt.db_path.canonicalize().unwrap_or_else(|_| opt.db_path.to_owned())
)
// the snapshot doesn't exist and we can't ignore it => throw an error
} else if !snapshot_path_exists && !opt.ignore_missing_snapshot {
bail!("snapshot doesn't exist at {}", snapshot_path.display())
// the snapshot and the db exist, and we can ignore the snapshot because of the ignore_snapshot_if_db_exists flag
} else {
2022-12-13 17:25:49 +01:00
open_or_create_database(opt, empty_db)?
}
2022-10-16 01:39:01 +02:00
} else if let Some(ref path) = opt.import_dump {
let src_path_exists = path.exists();
// the db is empty and the dump exists, import it
2022-10-16 01:39:01 +02:00
if empty_db && src_path_exists {
Bring back `release-v0.30.0` into `release-v0.30.0-temp` (final: into `main`) (#3145) * Fix error code of the "duplicate index found" error * Use the content of the ProcessingTasks in the tasks cancelation system * Change the missing_filters error code into missing_task_filters * WIP Introduce the invalid_task_uid error code * Use more precise error codes/message for the task routes + Allow star operator in delete/cancel tasks + rename originalQuery to originalFilters + Display error/canceled_by in task view even when they are = null + Rename task filter fields by using their plural forms + Prepare an error code for canceledBy filter + Only return global tasks if the API key action `index.*` is there * Add canceledBy task filter * Update tests following task API changes * Rename original_query to original_filters everywhere * Update more insta-snap tests * Make clippy happy They're a happy clip now. * Make rustfmt happy >:-( * Fix Index name parsing error message to fit the specification * Bump milli version to 0.35.1 * Fix the new error messages * fix the error messages and add tests * rename the error codes for the sake of consistency * refactor the way we send the cli informations + add the analytics for the config file and ssl usage * Apply suggestions from code review Co-authored-by: Clément Renault <clement@meilisearch.com> * add a comment over the new infos structure * reformat, sorry @kero * Store analytics for the documents deletions * Add analytics on all the settings * Spawn threads with names * Spawn rayon threads with names * update the distinct attributes to the spec update * update the analytics on the search route * implements the analytics on the health and version routes * Fix task details serialization * Add the question mark to the task deletion query filter * Add the question mark to the task cancelation query filter * Fix tests * add analytics on the task route * Add all the missing fields of the new task query type * Create a new analytics for the task deletion * Create a new analytics for the task creation * batch the tasks seen events * Update the finite pagination analytics * add the analytics of the swap-indexes route * Stop removing the DB when failing to read it * Rename originalFilters into originalFilters * Rename matchedDocuments into providedIds * Add `workflow_dispatch` to flaky.yml * Bump grenad to 0.4.4 * Bump milli to version v0.37.0 * Don't multiply total memory returned by sysinfo anymore sysinfo now returns bytes rather than KB * Add a dispatch to the publish binaries workflow * Fix publish release CI * Don't use gold but the default linker * Always display details for the indexDeletion task * Fix the insta tests * refactorize the whole test suite 1. Make a call to assert_internally_consistent automatically when snapshoting the scheduler. There is no point in snapshoting something broken and expect the dumb humans to notice. 2. Replace every possible call to assert_internally_consistent by a snapshot of the scheduler. It takes as many lines and ensure we never change something without noticing in any tests ever. 3. Name every snapshots: it's easier to debug when something goes wrong and easier to review in general. 4. Stop skipping breakpoints, it's too easy to miss something. Now you must explicitely show which path is the scheduler supposed to use. 5. Add a timeout on the channel.recv, it eases the process of writing tests, now when something file you get a failure instead of a deadlock. * rebase on release-v0.30 * makes clippy happy * update the snapshots after a rebase * try to remove the flakyness of the failing test * Add more analytics on the ranking rules positions * Update the dump test to check for the dumpUid dumpCreation task details * send the ranking rules as a string because amplitude is too dumb to process an array as a single value * Display a null dumpUid until we computed the dump itself on disk * Update tests * Check if the master key is missing before returning an error Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com> Co-authored-by: bors[bot] <26634292+bors[bot]@users.noreply.github.com> Co-authored-by: Kerollmops <clement@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com> Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2022-11-28 16:27:41 +01:00
let (mut index_scheduler, mut auth_controller) =
2022-12-13 17:25:49 +01:00
open_or_create_database_unchecked(opt, OnFailure::RemoveDb)?;
match import_dump(&opt.db_path, path, &mut index_scheduler, &mut auth_controller) {
Ok(()) => (index_scheduler, auth_controller),
Err(e) => {
std::fs::remove_dir_all(&opt.db_path)?;
2022-10-22 16:35:42 +02:00
return Err(e);
}
}
// the db already exists and we should not ignore the dump option => throw an error
2022-10-16 01:39:01 +02:00
} else if !empty_db && !opt.ignore_dump_if_db_exists {
bail!(
"database already exists at {:?}, try to delete it or rename it",
2022-10-20 18:00:07 +02:00
opt.db_path.canonicalize().unwrap_or_else(|_| opt.db_path.to_owned())
2022-10-16 01:39:01 +02:00
)
// the dump doesn't exist and we can't ignore it => throw an error
2022-10-16 01:39:01 +02:00
} else if !src_path_exists && !opt.ignore_missing_dump {
bail!("dump doesn't exist at {:?}", path)
// the dump and the db exist and we can ignore the dump because of the ignore_dump_if_db_exists flag
// or, the dump is missing but we can ignore that because of the ignore_missing_dump flag
2022-10-16 01:39:01 +02:00
} else {
2022-12-13 17:25:49 +01:00
open_or_create_database(opt, empty_db)?
2022-10-16 01:39:01 +02:00
}
} else {
2022-12-13 17:25:49 +01:00
open_or_create_database(opt, empty_db)?
2022-10-16 01:39:01 +02:00
};
2022-09-27 16:33:37 +02:00
// We create a loop in a thread that registers snapshotCreation tasks
let index_scheduler = Arc::new(index_scheduler);
let auth_controller = Arc::new(auth_controller);
if let ScheduleSnapshot::Enabled(snapshot_delay) = opt.schedule_snapshot {
let snapshot_delay = Duration::from_secs(snapshot_delay);
let index_scheduler = index_scheduler.clone();
Bring back `release-v0.30.0` into `release-v0.30.0-temp` (final: into `main`) (#3145) * Fix error code of the "duplicate index found" error * Use the content of the ProcessingTasks in the tasks cancelation system * Change the missing_filters error code into missing_task_filters * WIP Introduce the invalid_task_uid error code * Use more precise error codes/message for the task routes + Allow star operator in delete/cancel tasks + rename originalQuery to originalFilters + Display error/canceled_by in task view even when they are = null + Rename task filter fields by using their plural forms + Prepare an error code for canceledBy filter + Only return global tasks if the API key action `index.*` is there * Add canceledBy task filter * Update tests following task API changes * Rename original_query to original_filters everywhere * Update more insta-snap tests * Make clippy happy They're a happy clip now. * Make rustfmt happy >:-( * Fix Index name parsing error message to fit the specification * Bump milli version to 0.35.1 * Fix the new error messages * fix the error messages and add tests * rename the error codes for the sake of consistency * refactor the way we send the cli informations + add the analytics for the config file and ssl usage * Apply suggestions from code review Co-authored-by: Clément Renault <clement@meilisearch.com> * add a comment over the new infos structure * reformat, sorry @kero * Store analytics for the documents deletions * Add analytics on all the settings * Spawn threads with names * Spawn rayon threads with names * update the distinct attributes to the spec update * update the analytics on the search route * implements the analytics on the health and version routes * Fix task details serialization * Add the question mark to the task deletion query filter * Add the question mark to the task cancelation query filter * Fix tests * add analytics on the task route * Add all the missing fields of the new task query type * Create a new analytics for the task deletion * Create a new analytics for the task creation * batch the tasks seen events * Update the finite pagination analytics * add the analytics of the swap-indexes route * Stop removing the DB when failing to read it * Rename originalFilters into originalFilters * Rename matchedDocuments into providedIds * Add `workflow_dispatch` to flaky.yml * Bump grenad to 0.4.4 * Bump milli to version v0.37.0 * Don't multiply total memory returned by sysinfo anymore sysinfo now returns bytes rather than KB * Add a dispatch to the publish binaries workflow * Fix publish release CI * Don't use gold but the default linker * Always display details for the indexDeletion task * Fix the insta tests * refactorize the whole test suite 1. Make a call to assert_internally_consistent automatically when snapshoting the scheduler. There is no point in snapshoting something broken and expect the dumb humans to notice. 2. Replace every possible call to assert_internally_consistent by a snapshot of the scheduler. It takes as many lines and ensure we never change something without noticing in any tests ever. 3. Name every snapshots: it's easier to debug when something goes wrong and easier to review in general. 4. Stop skipping breakpoints, it's too easy to miss something. Now you must explicitely show which path is the scheduler supposed to use. 5. Add a timeout on the channel.recv, it eases the process of writing tests, now when something file you get a failure instead of a deadlock. * rebase on release-v0.30 * makes clippy happy * update the snapshots after a rebase * try to remove the flakyness of the failing test * Add more analytics on the ranking rules positions * Update the dump test to check for the dumpUid dumpCreation task details * send the ranking rules as a string because amplitude is too dumb to process an array as a single value * Display a null dumpUid until we computed the dump itself on disk * Update tests * Check if the master key is missing before returning an error Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com> Co-authored-by: bors[bot] <26634292+bors[bot]@users.noreply.github.com> Co-authored-by: Kerollmops <clement@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com> Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2022-11-28 16:27:41 +01:00
thread::Builder::new()
.name(String::from("register-snapshot-tasks"))
.spawn(move || loop {
thread::sleep(snapshot_delay);
2024-02-21 11:21:26 +01:00
if let Err(e) =
index_scheduler.register(KindWithContent::SnapshotCreation, None, false)
{
Bring back `release-v0.30.0` into `release-v0.30.0-temp` (final: into `main`) (#3145) * Fix error code of the "duplicate index found" error * Use the content of the ProcessingTasks in the tasks cancelation system * Change the missing_filters error code into missing_task_filters * WIP Introduce the invalid_task_uid error code * Use more precise error codes/message for the task routes + Allow star operator in delete/cancel tasks + rename originalQuery to originalFilters + Display error/canceled_by in task view even when they are = null + Rename task filter fields by using their plural forms + Prepare an error code for canceledBy filter + Only return global tasks if the API key action `index.*` is there * Add canceledBy task filter * Update tests following task API changes * Rename original_query to original_filters everywhere * Update more insta-snap tests * Make clippy happy They're a happy clip now. * Make rustfmt happy >:-( * Fix Index name parsing error message to fit the specification * Bump milli version to 0.35.1 * Fix the new error messages * fix the error messages and add tests * rename the error codes for the sake of consistency * refactor the way we send the cli informations + add the analytics for the config file and ssl usage * Apply suggestions from code review Co-authored-by: Clément Renault <clement@meilisearch.com> * add a comment over the new infos structure * reformat, sorry @kero * Store analytics for the documents deletions * Add analytics on all the settings * Spawn threads with names * Spawn rayon threads with names * update the distinct attributes to the spec update * update the analytics on the search route * implements the analytics on the health and version routes * Fix task details serialization * Add the question mark to the task deletion query filter * Add the question mark to the task cancelation query filter * Fix tests * add analytics on the task route * Add all the missing fields of the new task query type * Create a new analytics for the task deletion * Create a new analytics for the task creation * batch the tasks seen events * Update the finite pagination analytics * add the analytics of the swap-indexes route * Stop removing the DB when failing to read it * Rename originalFilters into originalFilters * Rename matchedDocuments into providedIds * Add `workflow_dispatch` to flaky.yml * Bump grenad to 0.4.4 * Bump milli to version v0.37.0 * Don't multiply total memory returned by sysinfo anymore sysinfo now returns bytes rather than KB * Add a dispatch to the publish binaries workflow * Fix publish release CI * Don't use gold but the default linker * Always display details for the indexDeletion task * Fix the insta tests * refactorize the whole test suite 1. Make a call to assert_internally_consistent automatically when snapshoting the scheduler. There is no point in snapshoting something broken and expect the dumb humans to notice. 2. Replace every possible call to assert_internally_consistent by a snapshot of the scheduler. It takes as many lines and ensure we never change something without noticing in any tests ever. 3. Name every snapshots: it's easier to debug when something goes wrong and easier to review in general. 4. Stop skipping breakpoints, it's too easy to miss something. Now you must explicitely show which path is the scheduler supposed to use. 5. Add a timeout on the channel.recv, it eases the process of writing tests, now when something file you get a failure instead of a deadlock. * rebase on release-v0.30 * makes clippy happy * update the snapshots after a rebase * try to remove the flakyness of the failing test * Add more analytics on the ranking rules positions * Update the dump test to check for the dumpUid dumpCreation task details * send the ranking rules as a string because amplitude is too dumb to process an array as a single value * Display a null dumpUid until we computed the dump itself on disk * Update tests * Check if the master key is missing before returning an error Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com> Co-authored-by: bors[bot] <26634292+bors[bot]@users.noreply.github.com> Co-authored-by: Kerollmops <clement@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com> Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2022-11-28 16:27:41 +01:00
error!("Error while registering snapshot: {}", e);
}
})
.unwrap();
2021-09-28 18:10:09 +02:00
}
2022-10-16 01:39:01 +02:00
Ok((index_scheduler, auth_controller))
}
2022-12-13 17:25:49 +01:00
/// Try to start the IndexScheduler and AuthController without checking the VERSION file or anything.
fn open_or_create_database_unchecked(
opt: &Opt,
on_failure: OnFailure,
) -> anyhow::Result<(IndexScheduler, AuthController)> {
// we don't want to create anything in the data.ms yet, thus we
// wrap our two builders in a closure that'll be executed later.
let auth_controller = AuthController::new(&opt.db_path, &opt.master_key);
let instance_features = opt.to_instance_features();
let index_scheduler_builder = || -> anyhow::Result<_> {
Ok(IndexScheduler::new(IndexSchedulerOptions {
version_file_path: opt.db_path.join(VERSION_FILE_NAME),
auth_path: opt.db_path.join("auth"),
tasks_path: opt.db_path.join("tasks"),
update_file_path: opt.db_path.join("update_files"),
indexes_path: opt.db_path.join("indexes"),
snapshots_path: opt.snapshot_dir.clone(),
2022-12-14 20:02:39 +01:00
dumps_path: opt.dump_dir.clone(),
2023-11-28 16:28:11 +01:00
webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()),
webhook_authorization_header: opt.task_webhook_authorization_header.clone(),
2024-07-08 20:58:27 +02:00
task_db_size: opt.max_task_db_size.as_u64() as usize,
index_base_map_size: opt.max_index_size.as_u64() as usize,
enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
indexer_config: (&opt.indexer_options).try_into()?,
autobatching_enabled: true,
2024-02-21 14:33:40 +01:00
cleanup_enabled: !opt.experimental_replication_parameters,
max_number_of_tasks: 1_000_000,
max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
2024-07-08 20:58:27 +02:00
index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize,
2023-01-11 17:34:46 +01:00
index_count: DEFAULT_INDEX_COUNT,
instance_features,
})?)
};
match (
index_scheduler_builder(),
auth_controller.map_err(anyhow::Error::from),
create_version_file(&opt.db_path).map_err(anyhow::Error::from),
) {
(Ok(i), Ok(a), Ok(())) => Ok((i, a)),
(Err(e), _, _) | (_, Err(e), _) | (_, _, Err(e)) => {
if matches!(on_failure, OnFailure::RemoveDb) {
std::fs::remove_dir_all(&opt.db_path)?;
}
Err(e)
}
}
}
2022-12-13 17:25:49 +01:00
/// Ensure you're in a valid state and open the IndexScheduler + AuthController for you.
fn open_or_create_database(
opt: &Opt,
empty_db: bool,
) -> anyhow::Result<(IndexScheduler, AuthController)> {
if !empty_db {
check_version_file(&opt.db_path)?;
}
2022-12-13 17:25:49 +01:00
open_or_create_database_unchecked(opt, OnFailure::KeepDb)
}
2022-10-16 01:39:01 +02:00
fn import_dump(
db_path: &Path,
dump_path: &Path,
index_scheduler: &mut IndexScheduler,
auth: &mut AuthController,
) -> Result<(), anyhow::Error> {
let reader = File::open(dump_path)?;
let mut dump_reader = dump::DumpReader::open(reader)?;
if let Some(date) = dump_reader.date() {
tracing::info!(
2024-02-07 17:55:40 +01:00
version = ?dump_reader.version(), // TODO: get the meilisearch version instead of the dump version
%date,
"Importing a dump of meilisearch"
2022-10-16 01:39:01 +02:00
);
} else {
tracing::info!(
2024-02-07 17:55:40 +01:00
version = ?dump_reader.version(), // TODO: get the meilisearch version instead of the dump version
"Importing a dump of meilisearch",
2022-10-16 01:39:01 +02:00
);
}
let instance_uid = dump_reader.instance_uid()?;
// 1. Import the instance-uid.
if let Some(ref instance_uid) = instance_uid {
// we don't want to panic if there is an error with the instance-uid.
2022-10-20 18:00:07 +02:00
let _ = std::fs::write(db_path.join("instance-uid"), instance_uid.to_string().as_bytes());
2022-10-16 01:39:01 +02:00
};
// 2. Import the `Key`s.
let mut keys = Vec::new();
auth.raw_delete_all_keys()?;
for key in dump_reader.keys()? {
2022-10-16 01:39:01 +02:00
let key = key?;
auth.raw_insert_key(key.clone())?;
keys.push(key);
}
2023-06-26 12:24:55 +02:00
// 3. Import the runtime features.
let features = dump_reader.features()?.unwrap_or_default();
index_scheduler.put_runtime_features(features)?;
2022-10-22 16:35:42 +02:00
let indexer_config = index_scheduler.indexer_config();
2022-10-16 01:39:01 +02:00
// /!\ The tasks must be imported AFTER importing the indexes or else the scheduler might
// try to process tasks while we're trying to import the indexes.
2022-10-16 01:39:01 +02:00
2023-06-26 12:24:55 +02:00
// 4. Import the indexes.
2022-10-16 01:39:01 +02:00
for index_reader in dump_reader.indexes()? {
let mut index_reader = index_reader?;
let metadata = index_reader.metadata();
tracing::info!("Importing index `{}`.", metadata.uid);
2022-12-22 11:46:17 +01:00
let date = Some((metadata.created_at, metadata.updated_at));
2022-12-21 14:28:00 +01:00
let index = index_scheduler.create_raw_index(&metadata.uid, date)?;
2022-10-16 01:39:01 +02:00
let mut wtxn = index.write_txn()?;
let mut builder = milli::update::Settings::new(&mut wtxn, &index, indexer_config);
2023-06-26 12:24:55 +02:00
// 4.1 Import the primary key if there is one.
2022-10-16 01:39:01 +02:00
if let Some(ref primary_key) = metadata.primary_key {
builder.set_primary_key(primary_key.to_string());
}
2023-06-26 12:24:55 +02:00
// 4.2 Import the settings.
tracing::info!("Importing the settings.");
2022-10-16 01:39:01 +02:00
let settings = index_reader.settings()?;
apply_settings_to_builder(&settings, &mut builder);
builder
.execute(|indexing_step| tracing::debug!("update: {:?}", indexing_step), || false)?;
2022-10-16 01:39:01 +02:00
2023-06-26 12:24:55 +02:00
// 4.3 Import the documents.
// 4.3.1 We need to recreate the grenad+obkv format accepted by the index.
tracing::info!("Importing the documents.");
let file = tempfile::tempfile()?;
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
2022-10-16 01:39:01 +02:00
for document in index_reader.documents()? {
builder.append_json_object(&document?)?;
}
// This flush the content of the batch builder.
let file = builder.into_inner()?.into_inner()?;
2022-10-16 01:39:01 +02:00
2023-06-26 12:24:55 +02:00
// 4.3.2 We feed it to the milli index.
2022-10-16 01:39:01 +02:00
let reader = BufReader::new(file);
let reader = DocumentsBatchReader::from_reader(reader)?;
let embedder_configs = index.embedding_configs(&wtxn)?;
let embedders = index_scheduler.embedders(embedder_configs)?;
2022-10-16 01:39:01 +02:00
let builder = milli::update::IndexDocuments::new(
&mut wtxn,
&index,
indexer_config,
IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
},
|indexing_step| tracing::trace!("update: {:?}", indexing_step),
|| false,
2022-10-16 01:39:01 +02:00
)?;
let builder = builder.with_embedders(embedders);
2022-10-16 01:39:01 +02:00
let (builder, user_result) = builder.add_documents(reader)?;
2024-02-07 17:55:40 +01:00
let user_result = user_result?;
tracing::info!(documents_found = user_result, "{} documents found.", user_result);
2022-10-16 01:39:01 +02:00
builder.execute()?;
wtxn.commit()?;
tracing::info!("All documents successfully imported.");
2022-10-16 01:39:01 +02:00
}
let mut index_scheduler_dump = index_scheduler.register_dumped_task()?;
2023-06-26 12:24:55 +02:00
// 5. Import the tasks.
for ret in dump_reader.tasks()? {
let (task, file) = ret?;
index_scheduler_dump.register_dumped_task(task, file)?;
}
Ok(index_scheduler_dump.finish()?)
2021-09-28 18:10:09 +02:00
}
pub fn configure_data(
config: &mut web::ServiceConfig,
2022-09-27 16:33:37 +02:00
index_scheduler: Data<IndexScheduler>,
auth: Data<AuthController>,
opt: &Opt,
(logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle),
analytics: Arc<dyn Analytics>,
) {
let search_queue = SearchQueue::new(
opt.experimental_search_queue_size,
available_parallelism().unwrap_or(NonZeroUsize::new(2).unwrap()),
);
2024-07-08 20:58:27 +02:00
let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize;
2021-06-23 13:21:48 +02:00
config
2022-09-27 16:33:37 +02:00
.app_data(index_scheduler)
.app_data(auth)
.app_data(web::Data::new(search_queue))
.app_data(web::Data::from(analytics))
.app_data(web::Data::new(logs_route))
.app_data(web::Data::new(logs_stderr))
.app_data(web::Data::new(opt.clone()))
2021-06-23 13:21:48 +02:00
.app_data(
web::JsonConfig::default()
.limit(http_payload_size_limit)
.content_type(|mime| mime == mime::APPLICATION_JSON)
.error_handler(|err, req: &HttpRequest| match err {
JsonPayloadError::ContentType => match req.headers().get(CONTENT_TYPE) {
Some(content_type) => MeilisearchHttpError::InvalidContentType(
content_type.to_str().unwrap_or("unknown").to_string(),
vec![mime::APPLICATION_JSON.to_string()],
)
.into(),
None => MeilisearchHttpError::MissingContentType(vec![
mime::APPLICATION_JSON.to_string(),
])
.into(),
},
err => PayloadError::from(err).into(),
}),
2021-06-23 13:21:48 +02:00
)
2021-06-23 13:58:22 +02:00
.app_data(PayloadConfig::new(http_payload_size_limit))
2021-06-23 13:21:48 +02:00
.app_data(
web::QueryConfig::default().error_handler(|err, _req| PayloadError::from(err).into()),
2021-06-23 13:21:48 +02:00
);
}
2021-03-10 11:56:51 +01:00
2021-06-23 13:21:48 +02:00
#[cfg(feature = "mini-dashboard")]
pub fn dashboard(config: &mut web::ServiceConfig, enable_frontend: bool) {
2021-06-23 13:55:16 +02:00
use actix_web::HttpResponse;
use static_files::Resource;
2021-04-21 13:49:21 +02:00
2021-06-23 14:48:33 +02:00
mod generated {
2021-06-23 13:21:48 +02:00
include!(concat!(env!("OUT_DIR"), "/generated.rs"));
}
2021-04-21 13:49:21 +02:00
2021-06-23 13:21:48 +02:00
if enable_frontend {
2021-06-23 14:48:33 +02:00
let generated = generated::generate();
2021-06-24 16:25:52 +02:00
// Generate routes for mini-dashboard assets
for (path, resource) in generated.into_iter() {
2022-10-20 18:00:07 +02:00
let Resource { mime_type, data, .. } = resource;
2021-06-24 16:25:52 +02:00
// Redirect index.html to /
if path == "index.html" {
2022-02-26 00:28:55 +01:00
config.service(web::resource("/").route(web::get().to(move || async move {
HttpResponse::Ok().content_type(mime_type).body(data)
})));
2021-06-24 16:25:52 +02:00
} else {
2022-02-26 00:28:55 +01:00
config.service(web::resource(path).route(web::get().to(move || async move {
HttpResponse::Ok().content_type(mime_type).body(data)
})));
2021-06-23 13:21:48 +02:00
}
2021-06-24 16:25:52 +02:00
}
2021-06-23 13:21:48 +02:00
} else {
2021-06-24 19:02:28 +02:00
config.service(web::resource("/").route(web::get().to(routes::running)));
2021-06-23 13:21:48 +02:00
}
}
#[cfg(not(feature = "mini-dashboard"))]
pub fn dashboard(config: &mut web::ServiceConfig, _enable_frontend: bool) {
2021-06-24 19:02:28 +02:00
config.service(web::resource("/").route(web::get().to(routes::running)));
2021-06-23 13:21:48 +02:00
}