From e74c3b692abbd64531bf11dd997f28dfe053d4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 12 Jun 2025 16:23:48 +0200 Subject: [PATCH 01/36] Introduce a new route to export documents and enqueue the export task --- crates/dump/src/lib.rs | 14 +++ crates/index-scheduler/src/dump.rs | 14 +++ crates/index-scheduler/src/insta_snapshot.rs | 3 + crates/index-scheduler/src/processing.rs | 9 ++ .../src/scheduler/autobatcher.rs | 1 + .../src/scheduler/create_batch.rs | 29 ++++- .../src/scheduler/process_batch.rs | 24 +++- crates/index-scheduler/src/utils.rs | 9 ++ crates/meilisearch-types/src/error.rs | 5 + crates/meilisearch-types/src/keys.rs | 5 + crates/meilisearch-types/src/task_view.rs | 45 ++++++++ crates/meilisearch-types/src/tasks.rs | 47 +++++++- crates/meilisearch/src/routes/export.rs | 105 ++++++++++++++++++ crates/meilisearch/src/routes/mod.rs | 3 + 14 files changed, 303 insertions(+), 10 deletions(-) create mode 100644 crates/meilisearch/src/routes/export.rs diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 285818a87..29007e9ce 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -141,6 +141,12 @@ pub enum KindDump { instance_uid: Option, }, SnapshotCreation, + Export { + url: String, + indexes: Vec, + skip_embeddings: bool, + api_key: Option, + }, UpgradeDatabase { from: (u32, u32, u32), }, @@ -213,6 +219,14 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, + KindWithContent::Export { url, indexes, skip_embeddings, api_key } => { + KindDump::Export { + url, + indexes: indexes.into_iter().map(|pattern| pattern.to_string()).collect(), + skip_embeddings, + api_key, + } + } KindWithContent::UpgradeDatabase { from: version } => { KindDump::UpgradeDatabase { from: version } } diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index ca26e50c8..457d80597 100644 --- 
a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -4,6 +4,7 @@ use std::io; use dump::{KindDump, TaskDump, UpdateFile}; use meilisearch_types::batches::{Batch, BatchId}; use meilisearch_types::heed::RwTxn; +use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use roaring::RoaringBitmap; @@ -211,6 +212,19 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, + KindDump::Export { url, indexes, skip_embeddings, api_key } => { + KindWithContent::Export { + url, + indexes: indexes + .into_iter() + .map(|index| { + IndexUidPattern::try_from(index).map_err(|_| Error::CorruptedDump) + }) + .collect::, Error>>()?, + skip_embeddings, + api_key, + } + } KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index d01548319..d1db77b2f 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,6 +289,9 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } + Details::Export { url, api_key, exported_documents, skip_embeddings } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, exported_documents: {exported_documents:?}, skip_embeddings: {skip_embeddings:?} }}") + } Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") } diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index f23b811e5..5d4ac11c3 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -175,8 +175,17 @@ make_enum_progress! { } } +make_enum_progress! 
{ + pub enum Export { + EnsuringCorrectnessOfTheTarget, + ExportTheSettings, + ExportTheDocuments, + } +} + make_atomic_progress!(Task alias AtomicTaskStep => "task" ); make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); +make_atomic_progress!(Index alias AtomicIndexStep => "index" ); make_atomic_progress!(Batch alias AtomicBatchStep => "batch" ); make_atomic_progress!(UpdateFile alias AtomicUpdateFileStep => "update file" ); diff --git a/crates/index-scheduler/src/scheduler/autobatcher.rs b/crates/index-scheduler/src/scheduler/autobatcher.rs index b57983291..b3f7d2743 100644 --- a/crates/index-scheduler/src/scheduler/autobatcher.rs +++ b/crates/index-scheduler/src/scheduler/autobatcher.rs @@ -71,6 +71,7 @@ impl From for AutobatchKind { KindWithContent::TaskCancelation { .. } | KindWithContent::TaskDeletion { .. } | KindWithContent::DumpCreation { .. } + | KindWithContent::Export { .. } | KindWithContent::UpgradeDatabase { .. } | KindWithContent::SnapshotCreation => { panic!("The autobatcher should never be called with tasks that don't apply to an index.") diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index e3763881b..7a6fa4a9b 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -47,6 +47,9 @@ pub(crate) enum Batch { IndexSwap { task: Task, }, + Export { + task: Task, + }, UpgradeDatabase { tasks: Vec, }, @@ -103,6 +106,7 @@ impl Batch { Batch::TaskCancelation { task, .. } | Batch::Dump(task) | Batch::IndexCreation { task, .. } + | Batch::Export { task } | Batch::IndexUpdate { task, .. } => { RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() } @@ -142,6 +146,7 @@ impl Batch { | TaskDeletions(_) | SnapshotCreation(_) | Dump(_) + | Export { .. } | UpgradeDatabase { .. } | IndexSwap { .. } => None, IndexOperation { op, .. 
} => Some(op.index_uid()), @@ -167,6 +172,7 @@ impl fmt::Display for Batch { Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?, Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?, Batch::IndexSwap { .. } => f.write_str("IndexSwap")?, + Batch::Export { .. } => f.write_str("Export")?, Batch::UpgradeDatabase { .. } => f.write_str("UpgradeDatabase")?, }; match index_uid { @@ -426,9 +432,10 @@ impl IndexScheduler { /// 0. We get the *last* task to cancel. /// 1. We get the tasks to upgrade. /// 2. We get the *next* task to delete. - /// 3. We get the *next* snapshot to process. - /// 4. We get the *next* dump to process. - /// 5. We get the *next* tasks to process for a specific index. + /// 3. We get the *next* export to process. + /// 4. We get the *next* snapshot to process. + /// 5. We get the *next* dump to process. + /// 6. We get the *next* tasks to process for a specific index. #[tracing::instrument(level = "trace", skip(self, rtxn), target = "indexing::scheduler")] pub(crate) fn create_next_batch( &self, @@ -500,7 +507,17 @@ impl IndexScheduler { return Ok(Some((Batch::TaskDeletions(tasks), current_batch))); } - // 3. we batch the snapshot. + // 3. we batch the export. + let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? & enqueued; + if !to_export.is_empty() { + let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_export)?; + current_batch.processing(&mut tasks); + let task = tasks.pop().expect("There must be only one export task"); + current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export }); + return Ok(Some((Batch::Export { task }, current_batch))); + } + + // 4. we batch the snapshot. let to_snapshot = self.queue.tasks.get_kind(rtxn, Kind::SnapshotCreation)? & enqueued; if !to_snapshot.is_empty() { let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_snapshot)?; @@ -510,7 +527,7 @@ impl IndexScheduler { return Ok(Some((Batch::SnapshotCreation(tasks), current_batch))); } - // 4. 
we batch the dumps. + // 5. we batch the dumps. let to_dump = self.queue.tasks.get_kind(rtxn, Kind::DumpCreation)? & enqueued; if let Some(to_dump) = to_dump.min() { let mut task = @@ -523,7 +540,7 @@ impl IndexScheduler { return Ok(Some((Batch::Dump(task), current_batch))); } - // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. + // 6. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; let mut task = self.queue.tasks.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index c349f90ad..1f6c4eb2c 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -1,6 +1,7 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; +use std::time::Duration; use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; @@ -13,9 +14,9 @@ use roaring::RoaringBitmap; use super::create_batch::Batch; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, FinalizingIndexStep, - InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, - UpdateIndexProgress, + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, Export, + FinalizingIndexStep, InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, + TaskDeletionProgress, UpdateIndexProgress, }; use crate::utils::{ self, remove_n_tasks_datetime_earlier_than, remove_task_datetime, swap_index_uid_in_task, @@ -361,6 +362,23 @@ impl IndexScheduler { task.status = Status::Succeeded; Ok((vec![task], ProcessBatchInfo::default())) } + 
Batch::Export { mut task } => { + progress.update_progress(Export::EnsuringCorrectnessOfTheTarget); + + // TODO send check requests with the API Key + + let mut wtxn = self.env.write_txn()?; + let KindWithContent::Export { url, indexes, skip_embeddings, api_key } = &task.kind + else { + unreachable!() + }; + + eprintln!("Exporting data to {}...", url); + std::thread::sleep(Duration::from_secs(30)); + + task.status = Status::Succeeded; + Ok((vec![task], ProcessBatchInfo::default())) + } Batch::UpgradeDatabase { mut tasks } => { let KindWithContent::UpgradeDatabase { from } = tasks.last().unwrap().kind else { unreachable!(); diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 67e8fc090..7fe44d1c1 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -273,6 +273,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { K::TaskCancelation { .. } | K::TaskDeletion { .. } | K::DumpCreation { .. } + | K::Export { .. } // TODO I have patterns, not index uids | K::UpgradeDatabase { .. 
} | K::SnapshotCreation => (), }; @@ -600,6 +601,14 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } + Details::Export { + url: _, + api_key: _, + exported_documents: _, + skip_embeddings: _, + } => { + assert_eq!(kind.as_kind(), Kind::Export); + } Details::UpgradeDatabase { from: _, to: _ } => { assert_eq!(kind.as_kind(), Kind::UpgradeDatabase); } diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index d2500b7e1..22c668d59 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -389,6 +389,11 @@ InvalidDocumentEditionContext , InvalidRequest , BAD_REQU InvalidDocumentEditionFunctionFilter , InvalidRequest , BAD_REQUEST ; EditDocumentsByFunctionError , InvalidRequest , BAD_REQUEST ; InvalidSettingsIndexChat , InvalidRequest , BAD_REQUEST ; +// Export +InvalidExportUrl , InvalidRequest , BAD_REQUEST ; +InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; +InvalidExportSkipEmbeddings , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ; diff --git a/crates/meilisearch-types/src/keys.rs b/crates/meilisearch-types/src/keys.rs index df2810727..3ba31c2cb 100644 --- a/crates/meilisearch-types/src/keys.rs +++ b/crates/meilisearch-types/src/keys.rs @@ -317,6 +317,9 @@ pub enum Action { #[serde(rename = "experimental.update")] #[deserr(rename = "experimental.update")] ExperimentalFeaturesUpdate, + #[serde(rename = "export")] + #[deserr(rename = "export")] + Export, #[serde(rename = "network.get")] #[deserr(rename = "network.get")] NetworkGet, @@ -438,6 +441,8 @@ pub mod actions { pub const EXPERIMENTAL_FEATURES_GET: u8 = ExperimentalFeaturesGet.repr(); pub const EXPERIMENTAL_FEATURES_UPDATE: 
u8 = ExperimentalFeaturesUpdate.repr(); + pub const EXPORT: u8 = Export.repr(); + pub const NETWORK_GET: u8 = NetworkGet.repr(); pub const NETWORK_UPDATE: u8 = NetworkUpdate.repr(); diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 86a00426b..06fda0835 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + use milli::Object; use serde::{Deserialize, Serialize}; use time::{Duration, OffsetDateTime}; @@ -118,6 +120,15 @@ pub struct DetailsView { pub upgrade_from: Option, #[serde(skip_serializing_if = "Option::is_none")] pub upgrade_to: Option, + // exporting + #[serde(skip_serializing_if = "Option::is_none")] + pub url: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub api_key: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub exported_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub skip_embeddings: Option, } impl DetailsView { @@ -238,6 +249,37 @@ impl DetailsView { Some(left) } }, + url: match (self.url.clone(), other.url.clone()) { + (None, None) => None, + (None, Some(url)) | (Some(url), None) => Some(url), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, + api_key: match (self.api_key.clone(), other.api_key.clone()) { + (None, None) => None, + (None, Some(key)) | (Some(key), None) => Some(key), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. 
+ (Some(left), Some(_right)) => Some(left), + }, + exported_documents: match ( + self.exported_documents.clone(), + other.exported_documents.clone(), + ) { + (None, None) => None, + (None, Some(exp)) | (Some(exp), None) => Some(exp), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, + skip_embeddings: match (self.skip_embeddings, other.skip_embeddings) { + (None, None) => None, + (None, Some(skip)) | (Some(skip), None) => Some(skip), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, // We want the earliest version upgrade_from: match (self.upgrade_from.clone(), other.upgrade_from.clone()) { (None, None) => None, @@ -327,6 +369,9 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } + Details::Export { url, api_key, exported_documents, skip_embeddings } => { + DetailsView { exported_documents: Some(exported_documents), ..Default::default() } + } Details::UpgradeDatabase { from, to } => DetailsView { upgrade_from: Some(format!("v{}.{}.{}", from.0, from.1, from.2)), upgrade_to: Some(format!("v{}.{}.{}", to.0, to.1, to.2)), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 95c52d9a6..e31e6062b 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -1,5 +1,5 @@ use core::fmt; -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::fmt::{Display, Write}; use std::str::FromStr; @@ -14,6 +14,7 @@ use uuid::Uuid; use crate::batches::BatchId; use crate::error::ResponseError; +use crate::index_uid_pattern::IndexUidPattern; use crate::keys::Key; use crate::settings::{Settings, Unchecked}; use crate::{versioning, InstanceUid}; @@ -50,6 +51,7 @@ impl Task { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } + | Export { .. } | UpgradeDatabase { .. } | IndexSwap { .. } => None, DocumentAdditionOrUpdate { index_uid, .. } @@ -86,6 +88,7 @@ impl Task { | KindWithContent::TaskDeletion { .. } | KindWithContent::DumpCreation { .. } | KindWithContent::SnapshotCreation + | KindWithContent::Export { .. } | KindWithContent::UpgradeDatabase { .. } => None, } } @@ -152,6 +155,12 @@ pub enum KindWithContent { instance_uid: Option, }, SnapshotCreation, + Export { + url: String, + api_key: Option, + indexes: Vec, + skip_embeddings: bool, + }, UpgradeDatabase { from: (u32, u32, u32), }, @@ -180,6 +189,7 @@ impl KindWithContent { KindWithContent::TaskDeletion { .. } => Kind::TaskDeletion, KindWithContent::DumpCreation { .. 
} => Kind::DumpCreation, KindWithContent::SnapshotCreation => Kind::SnapshotCreation, + KindWithContent::Export { .. } => Kind::Export, KindWithContent::UpgradeDatabase { .. } => Kind::UpgradeDatabase, } } @@ -192,6 +202,7 @@ impl KindWithContent { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } + | Export { .. } // TODO Should I resolve the index names? | UpgradeDatabase { .. } => vec![], DocumentAdditionOrUpdate { index_uid, .. } | DocumentEdition { index_uid, .. } @@ -269,6 +280,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: Default::default(), + skip_embeddings: *skip_embeddings, + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -335,6 +354,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: Default::default(), + skip_embeddings: skip_embeddings.clone(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -383,6 +410,14 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: BTreeMap::default(), + skip_embeddings: skip_embeddings.clone(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -499,6 +534,7 @@ pub enum Kind { TaskDeletion, DumpCreation, SnapshotCreation, + Export, UpgradeDatabase, } @@ -516,6 +552,7 @@ impl Kind { | Kind::TaskCancelation | Kind::TaskDeletion | Kind::DumpCreation + | Kind::Export | Kind::UpgradeDatabase | Kind::SnapshotCreation => false, } @@ -536,6 +573,7 @@ impl Display for Kind { Kind::TaskDeletion => write!(f, "taskDeletion"), Kind::DumpCreation => write!(f, "dumpCreation"), Kind::SnapshotCreation => write!(f, "snapshotCreation"), + Kind::Export => write!(f, "export"), Kind::UpgradeDatabase => write!(f, "upgradeDatabase"), } } @@ -643,6 +681,12 @@ pub enum Details { IndexSwap { swaps: Vec, }, + Export { + url: String, + api_key: Option, + exported_documents: BTreeMap, + skip_embeddings: bool, + }, UpgradeDatabase { from: (u32, u32, u32), to: (u32, u32, u32), @@ -667,6 +711,7 @@ impl Details { Self::SettingsUpdate { .. } | Self::IndexInfo { .. } | Self::Dump { .. } + | Self::Export { .. } | Self::UpgradeDatabase { .. } | Self::IndexSwap { .. 
} => (), } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs new file mode 100644 index 000000000..666799273 --- /dev/null +++ b/crates/meilisearch/src/routes/export.rs @@ -0,0 +1,105 @@ +use actix_web::web::{self, Data}; +use actix_web::{HttpRequest, HttpResponse}; +use deserr::actix_web::AwebJson; +use deserr::Deserr; +use index_scheduler::IndexScheduler; +use meilisearch_types::deserr::DeserrJsonError; +use meilisearch_types::error::deserr_codes::*; +use meilisearch_types::error::ResponseError; +use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::keys::actions; +use meilisearch_types::tasks::KindWithContent; +use serde::Serialize; +use tracing::debug; +use utoipa::{OpenApi, ToSchema}; + +use crate::analytics::Analytics; +use crate::extractors::authentication::policies::ActionPolicy; +use crate::extractors::authentication::GuardedData; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; +use crate::Opt; + +#[derive(OpenApi)] +#[openapi( + paths(export), + tags(( + name = "Export", + description = "The `/export` route allows you to trigger an export process to a remote Meilisearch instance.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/export"), + )), +)] +pub struct ExportApi; + +pub fn configure(cfg: &mut web::ServiceConfig) { + cfg.service(web::resource("").route(web::post().to(export))); +} + +#[utoipa::path( + get, + path = "", + tag = "Export", + security(("Bearer" = ["export", "*"])), + responses( + (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( + { + "indexes": ["movie", "steam-*"], + "skip_embeddings": true, + "apiKey": "meilisearch-api-key" + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +async fn export( + index_scheduler: GuardedData, Data>, + export: AwebJson, + req: HttpRequest, + opt: web::Data, + _analytics: Data, +) -> Result { + // TODO make it experimental? + // index_scheduler.features().check_network("Using the /network route")?; + + let export = export.into_inner(); + debug!(returns = ?export, "Trigger export"); + + let Export { url, api_key, indexes, skip_embeddings } = export; + let task = KindWithContent::Export { url, api_key, indexes, skip_embeddings }; + let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; + let task: SummarizedTaskView = + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); + + Ok(HttpResponse::Ok().json(task)) +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct Export { + #[schema(value_type = Option, example = json!("https://ms-1234.heaven.meilisearch.com"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub url: String, + #[schema(value_type = Option, example = json!("1234abcd"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub api_key: Option, + #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub indexes: Vec, + #[schema(value_type = Option, example = json!("true"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub skip_embeddings: bool, +} diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index cc62e43c3..748cd5d83 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ 
b/crates/meilisearch/src/routes/mod.rs @@ -54,6 +54,7 @@ mod api_key; pub mod batches; pub mod chats; mod dump; +mod export; pub mod features; pub mod indexes; mod logs; @@ -84,6 +85,7 @@ mod tasks_test; (path = "/multi-search", api = multi_search::MultiSearchApi), (path = "/swap-indexes", api = swap_indexes::SwapIndexesApi), (path = "/experimental-features", api = features::ExperimentalFeaturesApi), + (path = "/export", api = export::ExportApi), (path = "/network", api = network::NetworkApi), ), paths(get_health, get_version, get_stats), @@ -115,6 +117,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/metrics").configure(metrics::configure)) .service(web::scope("/experimental-features").configure(features::configure)) .service(web::scope("/network").configure(network::configure)) + .service(web::scope("/export").configure(export::configure)) .service(web::scope("/chats").configure(chats::configure)); #[cfg(feature = "swagger")] From e023ee4b6b1a5f2a87f245579742dde43300f117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Jun 2025 11:39:53 +0200 Subject: [PATCH 02/36] Working first implementation --- crates/dump/src/lib.rs | 25 ++-- crates/index-scheduler/src/dump.rs | 27 ++-- crates/index-scheduler/src/error.rs | 4 + crates/index-scheduler/src/insta_snapshot.rs | 4 +- crates/index-scheduler/src/scheduler/mod.rs | 1 + .../src/scheduler/process_batch.rs | 45 ++++-- .../src/scheduler/process_export.rs | 141 ++++++++++++++++++ .../mod.rs => process_upgrade.rs} | 0 crates/index-scheduler/src/test_utils.rs | 1 + crates/index-scheduler/src/utils.rs | 7 +- crates/meilisearch-types/src/error.rs | 3 +- .../src/index_uid_pattern.rs | 2 +- crates/meilisearch-types/src/task_view.rs | 36 +++-- crates/meilisearch-types/src/tasks.rs | 71 +++++---- crates/meilisearch/src/routes/export.rs | 34 ++++- 15 files changed, 298 insertions(+), 103 deletions(-) create mode 100644 crates/index-scheduler/src/scheduler/process_export.rs 
rename crates/index-scheduler/src/scheduler/{process_upgrade/mod.rs => process_upgrade.rs} (100%) diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 29007e9ce..5c67d7a94 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -1,12 +1,16 @@ #![allow(clippy::type_complexity)] #![allow(clippy::wrong_self_convention)] +use std::collections::BTreeMap; + use meilisearch_types::batches::BatchId; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::Key; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::settings::Unchecked; -use meilisearch_types::tasks::{Details, IndexSwap, KindWithContent, Status, Task, TaskId}; +use meilisearch_types::tasks::{ + Details, ExportIndexSettings, IndexSwap, KindWithContent, Status, Task, TaskId, +}; use meilisearch_types::InstanceUid; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -143,9 +147,8 @@ pub enum KindDump { SnapshotCreation, Export { url: String, - indexes: Vec, - skip_embeddings: bool, api_key: Option, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -219,14 +222,14 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, - KindWithContent::Export { url, indexes, skip_embeddings, api_key } => { - KindDump::Export { - url, - indexes: indexes.into_iter().map(|pattern| pattern.to_string()).collect(), - skip_embeddings, - api_key, - } - } + KindWithContent::Export { url, api_key, indexes } => KindDump::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| (pattern.to_string(), settings)) + .collect(), + }, KindWithContent::UpgradeDatabase { from: version } => { KindDump::UpgradeDatabase { from: version } } diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index 457d80597..2a99a74aa 100644 --- a/crates/index-scheduler/src/dump.rs +++ 
b/crates/index-scheduler/src/dump.rs @@ -212,19 +212,20 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, - KindDump::Export { url, indexes, skip_embeddings, api_key } => { - KindWithContent::Export { - url, - indexes: indexes - .into_iter() - .map(|index| { - IndexUidPattern::try_from(index).map_err(|_| Error::CorruptedDump) - }) - .collect::, Error>>()?, - skip_embeddings, - api_key, - } - } + KindDump::Export { url, indexes, api_key } => KindWithContent::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| { + Ok(( + IndexUidPattern::try_from(pattern) + .map_err(|_| Error::CorruptedDump)?, + settings, + )) + }) + .collect::>()?, + }, KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index cb798b385..2020ac597 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -151,6 +151,8 @@ pub enum Error { CorruptedTaskQueue, #[error(transparent)] DatabaseUpgrade(Box), + #[error(transparent)] + Export(Box), #[error("Failed to rollback for index `{index}`: {rollback_outcome} ")] RollbackFailed { index: String, rollback_outcome: RollbackOutcome }, #[error(transparent)] @@ -221,6 +223,7 @@ impl Error { | Error::IoError(_) | Error::Persist(_) | Error::FeatureNotEnabled(_) + | Error::Export(_) | Error::Anyhow(_) => true, Error::CreateBatch(_) | Error::CorruptedTaskQueue @@ -294,6 +297,7 @@ impl ErrorCode for Error { Error::CorruptedTaskQueue => Code::Internal, Error::CorruptedDump => Code::Internal, Error::DatabaseUpgrade(_) => Code::Internal, + Error::Export(_) => Code::Internal, Error::RollbackFailed { .. } => Code::Internal, Error::UnrecoverableError(_) => Code::Internal, Error::IndexSchedulerVersionMismatch { .. 
} => Code::Internal, diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index d1db77b2f..138b591ff 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,8 +289,8 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } - Details::Export { url, api_key, exported_documents, skip_embeddings } => { - format!("{{ url: {url:?}, api_key: {api_key:?}, exported_documents: {exported_documents:?}, skip_embeddings: {skip_embeddings:?} }}") + Details::Export { url, api_key, indexes } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, indexes: {indexes:?} }}") } Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs index 0e258e27b..5ac591143 100644 --- a/crates/index-scheduler/src/scheduler/mod.rs +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -4,6 +4,7 @@ mod autobatcher_test; mod create_batch; mod process_batch; mod process_dump_creation; +mod process_export; mod process_index_operation; mod process_snapshot_creation; mod process_upgrade; diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 1f6c4eb2c..99278756d 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -1,7 +1,6 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; -use std::time::Duration; use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; @@ -14,9 +13,9 @@ use roaring::RoaringBitmap; use super::create_batch::Batch; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, 
DeleteIndexProgress, Export, - FinalizingIndexStep, InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, - TaskDeletionProgress, UpdateIndexProgress, + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, FinalizingIndexStep, + InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, + UpdateIndexProgress, }; use crate::utils::{ self, remove_n_tasks_datetime_earlier_than, remove_task_datetime, swap_index_uid_in_task, @@ -363,18 +362,32 @@ impl IndexScheduler { Ok((vec![task], ProcessBatchInfo::default())) } Batch::Export { mut task } => { - progress.update_progress(Export::EnsuringCorrectnessOfTheTarget); - - // TODO send check requests with the API Key - - let mut wtxn = self.env.write_txn()?; - let KindWithContent::Export { url, indexes, skip_embeddings, api_key } = &task.kind - else { + let KindWithContent::Export { url, indexes, api_key } = &task.kind else { unreachable!() }; - eprintln!("Exporting data to {}...", url); - std::thread::sleep(Duration::from_secs(30)); + let ret = catch_unwind(AssertUnwindSafe(|| { + self.process_export(url, indexes, api_key.as_deref(), progress) + })); + + match ret { + // TODO return the matched and exported documents + Ok(Ok(())) => (), + Ok(Err(Error::AbortedTask)) => return Err(Error::AbortedTask), + Ok(Err(e)) => return Err(Error::Export(Box::new(e))), + Err(e) => { + let msg = match e.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match e.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + return Err(Error::Export(Box::new(Error::ProcessBatchPanicked( + msg.to_string(), + )))); + } + } task.status = Status::Succeeded; Ok((vec![task], ProcessBatchInfo::default())) @@ -726,9 +739,11 @@ impl IndexScheduler { from.1, from.2 ); - match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let ret = catch_unwind(std::panic::AssertUnwindSafe(|| { self.process_rollback(from, progress) - })) { + })); + + match ret { 
Ok(Ok(())) => {} Ok(Err(err)) => return Err(Error::DatabaseUpgrade(Box::new(err))), Err(e) => { diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs new file mode 100644 index 000000000..e01ddf2e4 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -0,0 +1,141 @@ +use std::collections::BTreeMap; +use std::time::Duration; + +use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::{obkv_to_json, Filter}; +use meilisearch_types::settings::{self, SecretPolicy}; +use meilisearch_types::tasks::ExportIndexSettings; +use ureq::{json, Agent}; + +use crate::{Error, IndexScheduler, Result}; + +impl IndexScheduler { + pub(super) fn process_export( + &self, + url: &str, + indexes: &BTreeMap, + api_key: Option<&str>, + progress: Progress, + ) -> Result<()> { + #[cfg(test)] + self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; + + let indexes: Vec<_> = self + .index_names()? 
+ .into_iter() + .flat_map(|uid| { + indexes + .iter() + .find(|(pattern, _)| pattern.matches_str(&uid)) + .map(|(_pattern, settings)| (uid, settings)) + }) + .collect(); + + let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); + + for (i, (uid, settings)) in indexes.iter().enumerate() { + let must_stop_processing = self.scheduler.must_stop_processing.clone(); + if must_stop_processing.get() { + return Err(Error::AbortedTask); + } + + progress.update_progress(VariableNameStep::::new( + format!("Exporting index `{uid}`"), + i as u32, + indexes.len() as u32, + )); + + let ExportIndexSettings { skip_embeddings, filter } = settings; + let index = self.index(uid)?; + let index_rtxn = index.read_txn()?; + + // Send the primary key + let primary_key = index.primary_key(&index_rtxn).unwrap(); + // TODO implement retry logic + let mut request = agent.post(&format!("{url}/indexes")); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(&json!({ "uid": uid, "primaryKey": primary_key })).unwrap(); + + // Send the index settings + let settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // TODO implement retry logic + // improve error reporting (get error message) + let mut request = agent.patch(&format!("{url}/indexes/{uid}/settings")); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(settings).unwrap(); + + let filter = filter + .as_deref() + .map(Filter::from_str) + .transpose() + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? 
+ .flatten(); + + let filter_universe = filter + .map(|f| f.evaluate(&index_rtxn, &index)) + .transpose() + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let whole_universe = index + .documents_ids(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let universe = filter_universe.unwrap_or(whole_universe); + + let fields_ids_map = index.fields_ids_map(&index_rtxn)?; + let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index + .embedding_configs(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + let limit = 50 * 1024 * 1024; // 50 MiB + let mut buffer = Vec::new(); + let mut tmp_buffer = Vec::new(); + for docid in universe { + let document = index + .document(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + let value = obkv_to_json(&all_fields, &fields_ids_map, document) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + tmp_buffer.clear(); + serde_json::to_writer(&mut tmp_buffer, &value) + .map_err(meilisearch_types::milli::InternalError::from) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + if buffer.len() + tmp_buffer.len() > limit { + // TODO implement retry logic + post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + buffer.clear(); + } + buffer.extend_from_slice(&tmp_buffer); + } + + post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + } + + Ok(()) + } +} + +fn post_serialized_documents( + agent: &Agent, + url: &str, + uid: &str, + api_key: Option<&str>, + buffer: &[u8], +) -> Result { + let mut request = agent.post(&format!("{url}/indexes/{uid}/documents")); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_bytes(buffer) +} + +enum ExportIndex {} diff --git 
a/crates/index-scheduler/src/scheduler/process_upgrade/mod.rs b/crates/index-scheduler/src/scheduler/process_upgrade.rs similarity index 100% rename from crates/index-scheduler/src/scheduler/process_upgrade/mod.rs rename to crates/index-scheduler/src/scheduler/process_upgrade.rs diff --git a/crates/index-scheduler/src/test_utils.rs b/crates/index-scheduler/src/test_utils.rs index 5f206b55c..bfed7f53a 100644 --- a/crates/index-scheduler/src/test_utils.rs +++ b/crates/index-scheduler/src/test_utils.rs @@ -37,6 +37,7 @@ pub(crate) enum FailureLocation { InsideCreateBatch, InsideProcessBatch, PanicInsideProcessBatch, + ProcessExport, ProcessUpgrade, AcquiringWtxn, UpdatingTaskAfterProcessBatchSuccess { task_uid: u32 }, diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 7fe44d1c1..79571745b 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -601,12 +601,7 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } - Details::Export { - url: _, - api_key: _, - exported_documents: _, - skip_embeddings: _, - } => { + Details::Export { url: _, api_key: _, indexes: _ } => { assert_eq!(kind.as_kind(), Kind::Export); } Details::UpgradeDatabase { from: _, to: _ } => { diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 22c668d59..08ee803ef 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -393,7 +393,8 @@ InvalidSettingsIndexChat , InvalidRequest , BAD_REQU InvalidExportUrl , InvalidRequest , BAD_REQUEST ; InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; -InvalidExportSkipEmbeddings , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions 
UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ; diff --git a/crates/meilisearch-types/src/index_uid_pattern.rs b/crates/meilisearch-types/src/index_uid_pattern.rs index baf0249e2..f90fc7aee 100644 --- a/crates/meilisearch-types/src/index_uid_pattern.rs +++ b/crates/meilisearch-types/src/index_uid_pattern.rs @@ -12,7 +12,7 @@ use crate::index_uid::{IndexUid, IndexUidFormatError}; /// An index uid pattern is composed of only ascii alphanumeric characters, - and _, between 1 and 400 /// bytes long and optionally ending with a *. -#[derive(Serialize, Deserialize, Deserr, Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Serialize, Deserialize, Deserr, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[deserr(try_from(&String) = FromStr::from_str -> IndexUidPatternFormatError)] pub struct IndexUidPattern(String); diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 06fda0835..0a8d7b8fe 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -8,7 +8,9 @@ use utoipa::ToSchema; use crate::batches::BatchId; use crate::error::ResponseError; use crate::settings::{Settings, Unchecked}; -use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; +use crate::tasks::{ + serialize_duration, Details, DetailsExportIndexSettings, IndexSwap, Kind, Status, Task, TaskId, +}; #[derive(Debug, Clone, PartialEq, Serialize, ToSchema)] #[serde(rename_all = "camelCase")] @@ -126,9 +128,7 @@ pub struct DetailsView { #[serde(skip_serializing_if = "Option::is_none")] pub api_key: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub exported_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub skip_embeddings: Option, + pub indexes: Option>, } impl DetailsView { @@ -263,19 +263,9 @@ impl DetailsView { // So we return the first one 
we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), }, - exported_documents: match ( - self.exported_documents.clone(), - other.exported_documents.clone(), - ) { + indexes: match (self.indexes.clone(), other.indexes.clone()) { (None, None) => None, - (None, Some(exp)) | (Some(exp), None) => Some(exp), - // We should never be able to batch multiple exports at the same time. - // So we return the first one we encounter but that shouldn't be an issue anyway. - (Some(left), Some(_right)) => Some(left), - }, - skip_embeddings: match (self.skip_embeddings, other.skip_embeddings) { - (None, None) => None, - (None, Some(skip)) | (Some(skip), None) => Some(skip), + (None, Some(indexes)) | (Some(indexes), None) => Some(indexes), // We should never be able to batch multiple exports at the same time. // So we return the first one we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), @@ -369,9 +359,17 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } - Details::Export { url, api_key, exported_documents, skip_embeddings } => { - DetailsView { exported_documents: Some(exported_documents), ..Default::default() } - } + Details::Export { url, api_key, indexes } => DetailsView { + url: Some(url), + api_key, + indexes: Some( + indexes + .into_iter() + .map(|(pattern, settings)| (pattern.to_string(), settings)) + .collect(), + ), + ..Default::default() + }, Details::UpgradeDatabase { from, to } => DetailsView { upgrade_from: Some(format!("v{}.{}.{}", from.0, from.1, from.2)), upgrade_to: Some(format!("v{}.{}.{}", to.0, to.1, to.2)), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index e31e6062b..1f8f7e7cb 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -9,7 +9,7 @@ use milli::Object; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize, Serializer}; use time::{Duration, OffsetDateTime}; -use utoipa::ToSchema; +use utoipa::{schema, ToSchema}; use uuid::Uuid; use crate::batches::BatchId; @@ -158,8 +158,7 @@ pub enum KindWithContent { Export { url: String, api_key: Option, - indexes: Vec, - skip_embeddings: bool, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -172,6 +171,13 @@ pub struct IndexSwap { pub indexes: (String, String), } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct ExportIndexSettings { + pub skip_embeddings: bool, + pub filter: Option, +} + impl KindWithContent { pub fn as_kind(&self) -> Kind { match self { @@ -280,14 +286,11 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. 
} => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: Default::default(), - skip_embeddings: *skip_embeddings, - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -354,14 +357,11 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: Default::default(), - skip_embeddings: skip_embeddings.clone(), - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -410,14 +410,11 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: BTreeMap::default(), - skip_embeddings: skip_embeddings.clone(), - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -684,8 +681,7 @@ pub enum Details { Export { url: String, api_key: Option, - exported_documents: BTreeMap, - skip_embeddings: bool, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -693,6 +689,23 @@ pub enum Details { }, } +#[derive(Debug, PartialEq, Clone, Serialize, Deserialize, ToSchema)] +#[schema(rename_all = "camelCase")] +pub struct DetailsExportIndexSettings { + #[serde(flatten)] + settings: ExportIndexSettings, + #[serde(skip_serializing_if = "Option::is_none")] + matched_documents: Option, + #[serde(skip_serializing_if = "Option::is_none")] + exported_documents: Option, +} + +impl From for DetailsExportIndexSettings { + fn from(settings: ExportIndexSettings) -> Self { + DetailsExportIndexSettings { settings, matched_documents: None, exported_documents: None } + } +} + impl Details { pub fn to_failed(&self) -> Self { let mut details = self.clone(); diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 666799273..7029f0ebf 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + use actix_web::web::{self, Data}; use actix_web::{HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; @@ -8,7 +10,7 @@ use 
meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::actions; -use meilisearch_types::tasks::KindWithContent; +use meilisearch_types::tasks::{ExportIndexSettings as DbExportIndexSettings, KindWithContent}; use serde::Serialize; use tracing::debug; use utoipa::{OpenApi, ToSchema}; @@ -69,8 +71,17 @@ async fn export( let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); - let Export { url, api_key, indexes, skip_embeddings } = export; - let task = KindWithContent::Export { url, api_key, indexes, skip_embeddings }; + let Export { url, api_key, indexes } = export; + let task = KindWithContent::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, ExportIndexSettings { skip_embeddings, filter })| { + (pattern, DbExportIndexSettings { skip_embeddings, filter }) + }) + .collect(), + }; let uid = get_task_id(&req, &opt)?; let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = @@ -95,11 +106,22 @@ pub struct Export { #[deserr(default, error = DeserrJsonError)] pub api_key: Option, #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] - #[deserr(default, error = DeserrJsonError)] + #[deserr(default)] #[serde(default)] - pub indexes: Vec, + pub indexes: BTreeMap, +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct ExportIndexSettings { #[schema(value_type = Option, example = json!("true"))] #[serde(default)] - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub skip_embeddings: bool, + #[schema(value_type = Option, example = json!("genres = action"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub filter: Option, } From 
e8795d2608326dff111098d64ea25b646ff4361c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Jun 2025 12:43:24 +0200 Subject: [PATCH 03/36] Export embeddings --- .../src/scheduler/process_export.rs | 73 ++++++++++++++++++- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e01ddf2e4..1686472ab 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,13 +1,17 @@ use std::collections::BTreeMap; +use std::sync::atomic; use std::time::Duration; use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, Filter}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; use ureq::{json, Agent}; +use crate::processing::AtomicDocumentStep; use crate::{Error, IndexScheduler, Result}; impl IndexScheduler { @@ -92,19 +96,77 @@ impl IndexScheduler { .embedding_configs(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let total_documents = universe.len() as u32; + let (step, progress_step) = AtomicDocumentStep::new(total_documents); + progress.update_progress(progress_step); + let limit = 50 * 1024 * 1024; // 50 MiB let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); - for docid in universe { + for (i, docid) in universe.into_iter().enumerate() { let document = index .document(&index_rtxn, docid) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - let value = obkv_to_json(&all_fields, &fields_ids_map, document) + let mut document = obkv_to_json(&all_fields, 
&fields_ids_map, document) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // TODO definitely factorize this code + if !*skip_embeddings { + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&index_rtxn, std::iter::once(docid)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } + }, + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, + }; + vectors + .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); + } + } + } + tmp_buffer.clear(); - serde_json::to_writer(&mut tmp_buffer, &value) + serde_json::to_writer(&mut tmp_buffer, &document) .map_err(meilisearch_types::milli::InternalError::from) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; @@ -114,9 +176,14 @@ impl IndexScheduler { buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); + + if i % 100 == 0 { + step.fetch_add(100, atomic::Ordering::Relaxed); + } } post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + step.store(total_documents, atomic::Ordering::Relaxed); } Ok(()) From 
acb7c0a449462d682448d5362cc189ad6410d155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 11:35:47 +0200 Subject: [PATCH 04/36] Implement a retry strategy --- Cargo.lock | 1 + crates/index-scheduler/Cargo.toml | 1 + crates/index-scheduler/src/error.rs | 4 + .../src/scheduler/process_export.rs | 108 ++++++++++++++---- crates/meilisearch-types/src/settings.rs | 1 + 5 files changed, 91 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7455ff1b4..a883b749f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2997,6 +2997,7 @@ name = "index-scheduler" version = "1.15.2" dependencies = [ "anyhow", + "backoff", "big_s", "bincode", "bumpalo", diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index f4901b2f2..de0d01935 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -44,6 +44,7 @@ time = { version = "0.3.41", features = [ tracing = "0.1.41" ureq = "2.12.1" uuid = { version = "1.17.0", features = ["serde", "v4"] } +backoff = "0.4.0" [dev-dependencies] big_s = "1.0.2" diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index 2020ac597..60669ff2d 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -153,6 +153,8 @@ pub enum Error { DatabaseUpgrade(Box), #[error(transparent)] Export(Box), + #[error("Failed to export documents to remote server {code} ({type}): {message} <{link}>")] + FromRemoteWhenExporting { message: String, code: String, r#type: String, link: String }, #[error("Failed to rollback for index `{index}`: {rollback_outcome} ")] RollbackFailed { index: String, rollback_outcome: RollbackOutcome }, #[error(transparent)] @@ -214,6 +216,7 @@ impl Error { | Error::BatchNotFound(_) | Error::TaskDeletionWithEmptyQuery | Error::TaskCancelationWithEmptyQuery + | Error::FromRemoteWhenExporting { .. 
} | Error::AbortedTask | Error::Dump(_) | Error::Heed(_) @@ -285,6 +288,7 @@ impl ErrorCode for Error { Error::Dump(e) => e.error_code(), Error::Milli { error, .. } => error.error_code(), Error::ProcessBatchPanicked(_) => Code::Internal, + Error::FromRemoteWhenExporting { .. } => Code::Internal, Error::Heed(e) => e.error_code(), Error::HeedTransaction(e) => e.error_code(), Error::FileStore(e) => e.error_code(), diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 1686472ab..7501c260e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,14 +1,18 @@ use std::collections::BTreeMap; +use std::io; use std::sync::atomic; use std::time::Duration; +use backoff::ExponentialBackoff; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, Filter}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; +use serde::Deserialize; use ureq::{json, Agent}; use crate::processing::AtomicDocumentStep; @@ -17,7 +21,7 @@ use crate::{Error, IndexScheduler, Result}; impl IndexScheduler { pub(super) fn process_export( &self, - url: &str, + base_url: &str, indexes: &BTreeMap, api_key: Option<&str>, progress: Progress, @@ -56,24 +60,34 @@ impl IndexScheduler { // Send the primary key let primary_key = index.primary_key(&index_rtxn).unwrap(); - // TODO implement retry logic - let mut request = agent.post(&format!("{url}/indexes")); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); - } - 
request.send_json(&json!({ "uid": uid, "primaryKey": primary_key })).unwrap(); + let url = format!("{base_url}/indexes"); + retry(|| { + let mut request = agent.post(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let index_param = json!({ "uid": uid, "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; // Send the index settings - let settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // TODO implement retry logic - // improve error reporting (get error message) - let mut request = agent.patch(&format!("{url}/indexes/{uid}/settings")); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); + // Remove the experimental chat setting if not enabled + if self.features().check_chat_completions("exporting chat settings").is_err() { + settings.chat = Setting::NotSet; } - request.send_json(settings).unwrap(); + // Retry logic for sending settings + let url = format!("{base_url}/indexes/{uid}/settings"); + retry(|| { + let mut request = agent.patch(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(settings.clone()).map_err(into_backoff_error) + })?; + // TODO support JSON Value objects let filter = filter .as_deref() .map(Filter::from_str) @@ -171,8 +185,7 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; if buffer.len() + tmp_buffer.len() > limit { - // TODO implement retry logic - post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap(); buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); @@ -182,7 
+195,7 @@ impl IndexScheduler { } } - post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap(); step.store(total_documents, atomic::Ordering::Relaxed); } @@ -190,19 +203,66 @@ impl IndexScheduler { } } +fn retry(send_request: F) -> Result +where + F: Fn() -> Result>, +{ + match backoff::retry(ExponentialBackoff::default(), || send_request()) { + Ok(response) => Ok(response), + Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), + Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)), + } +} + fn post_serialized_documents( agent: &Agent, - url: &str, + base_url: &str, uid: &str, api_key: Option<&str>, buffer: &[u8], -) -> Result { - let mut request = agent.post(&format!("{url}/indexes/{uid}/documents")); - request = request.set("Content-Type", "application/x-ndjson"); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); +) -> Result { + let url = format!("{base_url}/indexes/{uid}/documents"); + retry(|| { + let mut request = agent.post(&url); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request.set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(buffer).map_err(into_backoff_error) + }) +} + +fn into_backoff_error(err: ureq::Error) -> backoff::Error { + match err { + // Those code status must trigger an automatic retry + // + ureq::Error::Status(408 | 429 | 500 | 502 | 503 | 504, _) => { + backoff::Error::Transient { err, retry_after: None } + } + ureq::Error::Status(_, _) => backoff::Error::Permanent(err), + ureq::Error::Transport(_) => backoff::Error::Transient { err, retry_after: None }, + } +} + +/// Converts a `ureq::Error` into an `Error`. 
+fn ureq_error_into_error(error: ureq::Error) -> Error { + #[derive(Deserialize)] + struct MeiliError { + message: String, + code: String, + r#type: String, + link: String, + } + + match error { + ureq::Error::Status(_, response) => match response.into_json() { + Ok(MeiliError { message, code, r#type, link }) => { + Error::FromRemoteWhenExporting { message, code, r#type, link } + } + Err(e) => io::Error::from(e).into(), + }, + ureq::Error::Transport(transport) => io::Error::new(io::ErrorKind::Other, transport).into(), } - request.send_bytes(buffer) } enum ExportIndex {} diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 1c225b355..295318f4b 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -968,6 +968,7 @@ pub fn settings( if let SecretPolicy::HideSecrets = secret_policy { settings.hide_secrets() } + Ok(settings) } From 7c448bcc003c99f125ad8e75dca590b71c984187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 14:53:50 +0200 Subject: [PATCH 05/36] Make clippy happy --- crates/meilisearch-types/src/tasks.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 1f8f7e7cb..3ef60cacf 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,7 +289,7 @@ impl KindWithContent { KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), @@ -360,7 +360,7 @@ impl KindWithContent { KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { 
url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, @@ -413,7 +413,7 @@ impl From<&KindWithContent> for Option
{ KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, From 3e2f4682137159745848bee46d637dbd35cc9cc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:34:05 +0200 Subject: [PATCH 06/36] Support task cancelation --- .../src/scheduler/process_export.rs | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 7501c260e..ceac18632 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -15,6 +15,7 @@ use meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; use ureq::{json, Agent}; +use super::MustStopProcessing; use crate::processing::AtomicDocumentStep; use crate::{Error, IndexScheduler, Result}; @@ -41,9 +42,8 @@ impl IndexScheduler { .collect(); let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); - + let must_stop_processing = self.scheduler.must_stop_processing.clone(); for (i, (uid, settings)) in indexes.iter().enumerate() { - let must_stop_processing = self.scheduler.must_stop_processing.clone(); if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -59,9 +59,9 @@ impl IndexScheduler { let index_rtxn = index.read_txn()?; // Send the primary key - let primary_key = index.primary_key(&index_rtxn).unwrap(); + let primary_key = index.primary_key(&index_rtxn)?; let url = format!("{base_url}/indexes"); - retry(|| { + retry(&must_stop_processing, || { let mut request = agent.post(&url); if let Some(api_key) = api_key { 
request = request.set("Authorization", &format!("Bearer {api_key}")); @@ -79,7 +79,7 @@ impl IndexScheduler { } // Retry logic for sending settings let url = format!("{base_url}/indexes/{uid}/settings"); - retry(|| { + retry(&must_stop_processing, || { let mut request = agent.patch(&url); if let Some(api_key) = api_key { request = request.set("Authorization", &format!("Bearer {api_key}")); @@ -115,6 +115,8 @@ impl IndexScheduler { progress.update_progress(progress_step); let limit = 50 * 1024 * 1024; // 50 MiB + let documents_url = format!("{base_url}/indexes/{uid}/documents"); + let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); for (i, docid) in universe.into_iter().enumerate() { @@ -185,7 +187,14 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; if buffer.len() + tmp_buffer.len() > limit { - post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap(); + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request.set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); @@ -195,7 +204,14 @@ impl IndexScheduler { } } - post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap(); + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request.set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; step.store(total_documents, atomic::Ordering::Relaxed); } @@ -203,10 +219,14 @@ impl IndexScheduler { } } -fn retry(send_request: F) -> Result +fn retry(must_stop_processing: &MustStopProcessing, send_request: F) -> Result 
where F: Fn() -> Result>, { + if must_stop_processing.get() { + return Err(Error::AbortedTask); + } + match backoff::retry(ExponentialBackoff::default(), || send_request()) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), @@ -214,24 +234,6 @@ where } } -fn post_serialized_documents( - agent: &Agent, - base_url: &str, - uid: &str, - api_key: Option<&str>, - buffer: &[u8], -) -> Result { - let url = format!("{base_url}/indexes/{uid}/documents"); - retry(|| { - let mut request = agent.post(&url); - request = request.set("Content-Type", "application/x-ndjson"); - if let Some(api_key) = api_key { - request = request.set("Authorization", &(format!("Bearer {api_key}"))); - } - request.send_bytes(buffer).map_err(into_backoff_error) - }) -} - fn into_backoff_error(err: ureq::Error) -> backoff::Error { match err { // Those code status must trigger an automatic retry From bc08cd0deb8805b126c64dc384b18d2ee203f508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:37:15 +0200 Subject: [PATCH 07/36] Make clippy happy again --- .../index-scheduler/src/scheduler/process_export.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index ceac18632..e10c468fc 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -59,7 +59,10 @@ impl IndexScheduler { let index_rtxn = index.read_txn()?; // Send the primary key - let primary_key = index.primary_key(&index_rtxn)?; + let primary_key = index + .primary_key(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let url = format!("{base_url}/indexes"); retry(&must_stop_processing, || { let mut request = agent.post(&url); @@ -108,7 +111,7 @@ impl IndexScheduler { let all_fields: Vec<_> = 
fields_ids_map.iter().map(|(id, _)| id).collect(); let embedding_configs = index .embedding_configs(&index_rtxn) - .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; let total_documents = universe.len() as u32; let (step, progress_step) = AtomicDocumentStep::new(total_documents); @@ -227,7 +230,7 @@ where return Err(Error::AbortedTask); } - match backoff::retry(ExponentialBackoff::default(), || send_request()) { + match backoff::retry(ExponentialBackoff::default(), send_request) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)), @@ -261,7 +264,7 @@ fn ureq_error_into_error(error: ureq::Error) -> Error { Ok(MeiliError { message, code, r#type, link }) => { Error::FromRemoteWhenExporting { message, code, r#type, link } } - Err(e) => io::Error::from(e).into(), + Err(e) => e.into(), }, ureq::Error::Transport(transport) => io::Error::new(io::ErrorKind::Other, transport).into(), } From 3329248a8448cc1ea8b2356dac803f38b8972287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:50:32 +0200 Subject: [PATCH 08/36] Support no pattern when exporting --- .../src/scheduler/process_export.rs | 89 +++++++++---------- crates/meilisearch-types/src/tasks.rs | 3 +- crates/meilisearch/src/routes/export.rs | 21 +++-- 3 files changed, 54 insertions(+), 59 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e10c468fc..5c65ca51e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -54,7 +54,7 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { skip_embeddings, filter } = settings; + let ExportIndexSettings { filter } = settings; let index = 
self.index(uid)?; let index_rtxn = index.read_txn()?; @@ -131,56 +131,53 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; // TODO definitely factorize this code - if !*skip_embeddings { - 'inject_vectors: { - let embeddings = index - .embeddings(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - if embeddings.is_empty() { - break 'inject_vectors; - } + if embeddings.is_empty() { + break 'inject_vectors; + } - let vectors = document - .entry(RESERVED_VECTORS_FIELD_NAME) - .or_insert(serde_json::Value::Object(Default::default())); + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); - let serde_json::Value::Object(vectors) = vectors else { - return Err(Error::from_milli( - meilisearch_types::milli::Error::UserError( - meilisearch_types::milli::UserError::InvalidVectorsMapType { - document_id: { - if let Ok(Some(Ok(index))) = index - .external_id_of(&index_rtxn, std::iter::once(docid)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={docid}") - } - }, - value: vectors.clone(), + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&index_rtxn, std::iter::once(docid)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } }, - ), - Some(uid.to_string()), - )); + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| 
conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, }; - - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - - let embeddings = ExplicitVectors { - embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - )), - regenerate: !user_provided, - }; - vectors - .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); - } + vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); } } diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 3ef60cacf..b5e2581fc 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -171,10 +171,9 @@ pub struct IndexSwap { pub indexes: (String, String), } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct ExportIndexSettings { - pub skip_embeddings: bool, pub filter: Option, } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 7029f0ebf..40ef20008 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -72,16 +72,19 @@ async fn export( debug!(returns = ?export, "Trigger export"); let Export { url, api_key, indexes } = export; - let task = KindWithContent::Export { - url, - api_key, - indexes: indexes + + let indexes = if indexes.is_empty() { + BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())]) + } else { + indexes .into_iter() - .map(|(pattern, ExportIndexSettings { skip_embeddings, filter })| { - (pattern, DbExportIndexSettings { 
skip_embeddings, filter }) + .map(|(pattern, ExportIndexSettings { filter })| { + (pattern, DbExportIndexSettings { filter }) }) - .collect(), + .collect() }; + + let task = KindWithContent::Export { url, api_key, indexes }; let uid = get_task_id(&req, &opt)?; let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = @@ -116,10 +119,6 @@ pub struct Export { #[serde(rename_all = "camelCase")] #[schema(rename_all = "camelCase")] pub struct ExportIndexSettings { - #[schema(value_type = Option, example = json!("true"))] - #[serde(default)] - #[deserr(default, error = DeserrJsonError)] - pub skip_embeddings: bool, #[schema(value_type = Option, example = json!("genres = action"))] #[serde(default)] #[deserr(default, error = DeserrJsonError)] From ee812b31c4ef73305fb417869e6ca0d89b856642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:56:26 +0200 Subject: [PATCH 09/36] Support JSON value as filters --- crates/index-scheduler/src/scheduler/process_export.rs | 5 ++--- crates/meilisearch-types/src/tasks.rs | 7 ++++--- crates/meilisearch/src/routes/export.rs | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 5c65ca51e..e6c09e58a 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -90,10 +90,9 @@ impl IndexScheduler { request.send_json(settings.clone()).map_err(into_backoff_error) })?; - // TODO support JSON Value objects let filter = filter - .as_deref() - .map(Filter::from_str) + .as_ref() + .map(Filter::from_json) .transpose() .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? 
.flatten(); diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index b5e2581fc..86951192c 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -8,6 +8,7 @@ use milli::update::IndexDocumentsMethod; use milli::Object; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize, Serializer}; +use serde_json::Value; use time::{Duration, OffsetDateTime}; use utoipa::{schema, ToSchema}; use uuid::Uuid; @@ -111,11 +112,11 @@ pub enum KindWithContent { }, DocumentDeletionByFilter { index_uid: String, - filter_expr: serde_json::Value, + filter_expr: Value, }, DocumentEdition { index_uid: String, - filter_expr: Option, + filter_expr: Option, context: Option, function: String, }, @@ -174,7 +175,7 @@ pub struct IndexSwap { #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct ExportIndexSettings { - pub filter: Option, + pub filter: Option, } impl KindWithContent { diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 40ef20008..de1fe2c38 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -12,6 +12,7 @@ use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::actions; use meilisearch_types::tasks::{ExportIndexSettings as DbExportIndexSettings, KindWithContent}; use serde::Serialize; +use serde_json::Value; use tracing::debug; use utoipa::{OpenApi, ToSchema}; @@ -122,5 +123,5 @@ pub struct ExportIndexSettings { #[schema(value_type = Option, example = json!("genres = action"))] #[serde(default)] #[deserr(default, error = DeserrJsonError)] - pub filter: Option, + pub filter: Option, } From 2d4f7c635eedc00e3ecf4c07cb5c14f300379103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 16:18:31 +0200 Subject: [PATCH 10/36] Make tests happy --- 
crates/index-scheduler/src/scheduler/test.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index 06bc14051..fb309f882 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -732,6 +732,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, From c6216517c7243809ae7b886eb8e07cecf34ab5b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 16:30:35 +0200 Subject: [PATCH 11/36] Parallelize document upload --- .../src/scheduler/process_export.rs | 189 ++++++++++-------- crates/index-scheduler/src/scheduler/test.rs | 3 + crates/milli/src/thread_pool_no_abort.rs | 18 +- .../src/update/index_documents/extract/mod.rs | 2 +- .../milli/src/update/index_documents/mod.rs | 1 + crates/milli/src/update/mod.rs | 2 +- 6 files changed, 133 insertions(+), 82 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e6c09e58a..3054c919b 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -7,9 +7,9 @@ use backoff::ExponentialBackoff; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; -use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; -use meilisearch_types::milli::{obkv_to_json, Filter}; +use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; use 
meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; @@ -112,6 +112,10 @@ impl IndexScheduler { .embedding_configs(&index_rtxn) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // We don't need to keep this one alive as we will + // spawn many threads to process the documents + drop(index_rtxn); + let total_documents = universe.len() as u32; let (step, progress_step) = AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); @@ -119,73 +123,107 @@ impl IndexScheduler { let limit = 50 * 1024 * 1024; // 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); - let mut buffer = Vec::new(); - let mut tmp_buffer = Vec::new(); - for (i, docid) in universe.into_iter().enumerate() { - let document = index - .document(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + request_threads() + .broadcast(|ctx| { + let index_rtxn = index + .read_txn() + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - let mut document = obkv_to_json(&all_fields, &fields_ids_map, document) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let mut buffer = Vec::new(); + let mut tmp_buffer = Vec::new(); + for (i, docid) in universe.iter().enumerate() { + if i % ctx.num_threads() != ctx.index() { + continue; + } - // TODO definitely factorize this code - 'inject_vectors: { - let embeddings = index - .embeddings(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let document = index + .document(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - if embeddings.is_empty() { - break 'inject_vectors; + let mut document = obkv_to_json(&all_fields, &fields_ids_map, document) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + // TODO definitely factorize this code + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, 
Some(uid.to_string())))?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + milli::Error::UserError( + milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of( + &index_rtxn, + std::iter::once(docid), + ) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } + }, + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(embeddings), + ), + regenerate: !user_provided, + }; + vectors.insert( + embedder_name, + serde_json::to_value(embeddings).unwrap(), + ); + } + } + + tmp_buffer.clear(); + serde_json::to_writer(&mut tmp_buffer, &document) + .map_err(milli::InternalError::from) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + if buffer.len() + tmp_buffer.len() > limit { + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request + .set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; + buffer.clear(); + } + buffer.extend_from_slice(&tmp_buffer); + + if i % 100 == 0 { + step.fetch_add(100, atomic::Ordering::Relaxed); + } } - let vectors = document - .entry(RESERVED_VECTORS_FIELD_NAME) - .or_insert(serde_json::Value::Object(Default::default())); - - let serde_json::Value::Object(vectors) = 
vectors else { - return Err(Error::from_milli( - meilisearch_types::milli::Error::UserError( - meilisearch_types::milli::UserError::InvalidVectorsMapType { - document_id: { - if let Ok(Some(Ok(index))) = index - .external_id_of(&index_rtxn, std::iter::once(docid)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={docid}") - } - }, - value: vectors.clone(), - }, - ), - Some(uid.to_string()), - )); - }; - - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - - let embeddings = ExplicitVectors { - embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - )), - regenerate: !user_provided, - }; - vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); - } - } - - tmp_buffer.clear(); - serde_json::to_writer(&mut tmp_buffer, &document) - .map_err(meilisearch_types::milli::InternalError::from) - .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - - if buffer.len() + tmp_buffer.len() > limit { retry(&must_stop_processing, || { let mut request = agent.post(&documents_url); request = request.set("Content-Type", "application/x-ndjson"); @@ -194,23 +232,16 @@ impl IndexScheduler { } request.send_bytes(&buffer).map_err(into_backoff_error) })?; - buffer.clear(); - } - buffer.extend_from_slice(&tmp_buffer); - if i % 100 == 0 { - step.fetch_add(100, atomic::Ordering::Relaxed); - } - } + Ok(()) + }) + .map_err(|e| { + Error::from_milli( + milli::Error::InternalError(InternalError::PanicInThreadPool(e)), + Some(uid.to_string()), + ) + })?; - retry(&must_stop_processing, || { - let mut request = agent.post(&documents_url); - request = request.set("Content-Type", "application/x-ndjson"); - if let Some(api_key) = api_key { - request = request.set("Authorization", &(format!("Bearer {api_key}"))); - } - 
request.send_bytes(&buffer).map_err(into_backoff_error) - })?; step.store(total_documents, atomic::Ordering::Relaxed); } diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index fb309f882..ee26165c7 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -766,6 +766,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, @@ -806,6 +807,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, @@ -847,6 +849,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, diff --git a/crates/milli/src/thread_pool_no_abort.rs b/crates/milli/src/thread_pool_no_abort.rs index 0c2fbb30d..66380ff36 100644 --- a/crates/milli/src/thread_pool_no_abort.rs +++ b/crates/milli/src/thread_pool_no_abort.rs @@ -1,7 +1,7 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; -use rayon::{ThreadPool, ThreadPoolBuilder}; +use rayon::{BroadcastContext, ThreadPool, ThreadPoolBuilder}; use thiserror::Error; /// A rayon ThreadPool wrapper that can catch panics in the pool @@ -32,6 +32,22 @@ impl ThreadPoolNoAbort { } } + pub fn broadcast(&self, op: OP) -> Result, PanicCatched> + where + OP: Fn(BroadcastContext<'_>) -> R + Sync, + R: Send, + { + self.active_operations.fetch_add(1, Ordering::Relaxed); + let output = self.thread_pool.broadcast(op); + self.active_operations.fetch_sub(1, Ordering::Relaxed); + // While reseting the pool panic catcher we return an error if we catched one. 
+ if self.pool_catched_panic.swap(false, Ordering::SeqCst) { + Err(PanicCatched) + } else { + Ok(output) + } + } + pub fn current_num_threads(&self) -> usize { self.thread_pool.current_num_threads() } diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index 8cd664a2f..cb4ac03a6 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -210,7 +210,7 @@ fn run_extraction_task( }) } -fn request_threads() -> &'static ThreadPoolNoAbort { +pub fn request_threads() -> &'static ThreadPoolNoAbort { static REQUEST_THREADS: OnceLock = OnceLock::new(); REQUEST_THREADS.get_or_init(|| { diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index f547c68d4..dd0238fcb 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -12,6 +12,7 @@ use std::sync::Arc; use crossbeam_channel::{Receiver, Sender}; use enrich::enrich_documents_batch; +pub use extract::request_threads; use grenad::{Merger, MergerBuilder}; use hashbrown::HashMap; use heed::types::Str; diff --git a/crates/milli/src/update/mod.rs b/crates/milli/src/update/mod.rs index 04ce68fc7..64eb9f1d3 100644 --- a/crates/milli/src/update/mod.rs +++ b/crates/milli/src/update/mod.rs @@ -4,7 +4,7 @@ pub use self::clear_documents::ClearDocuments; pub use self::concurrent_available_ids::ConcurrentAvailableIds; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; -pub use self::index_documents::*; +pub use self::index_documents::{request_threads, *}; pub use self::indexer_config::{default_thread_pool_and_threads, IndexerConfig}; pub use self::new::ChannelCongestion; pub use self::settings::{validate_embedding_settings, Setting, Settings}; From a743da30618850e6e6e302b1c7e009d932d7a8b6 Mon Sep 17 00:00:00 2001 From: 
Kerollmops Date: Wed, 25 Jun 2025 12:29:14 +0200 Subject: [PATCH 12/36] Gzip-compress the content --- .../src/scheduler/process_export.rs | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 3054c919b..180162eda 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,9 +1,11 @@ use std::collections::BTreeMap; -use std::io; +use std::io::{self, Write as _}; use std::sync::atomic; use std::time::Duration; use backoff::ExponentialBackoff; +use flate2::write::GzEncoder; +use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; @@ -131,6 +133,7 @@ impl IndexScheduler { let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); + let mut compressed_buffer = Vec::new(); for (i, docid) in universe.iter().enumerate() { if i % ctx.num_threads() != ctx.index() { continue; @@ -205,17 +208,31 @@ impl IndexScheduler { .map_err(milli::InternalError::from) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - if buffer.len() + tmp_buffer.len() > limit { + // Make sure we put at least one document in the buffer even + // though we might go above the buffer limit before sending + if !buffer.is_empty() && buffer.len() + tmp_buffer.len() > limit { + // We compress the documents before sending them + let mut encoder = + GzEncoder::new(&mut compressed_buffer, Compression::default()); + encoder + .write_all(&buffer) + .map_err(|e| Error::from_milli(e.into(), Some(uid.clone())))?; + encoder + .finish() + .map_err(|e| Error::from_milli(e.into(), Some(uid.clone())))?; + retry(&must_stop_processing, || { let mut request = agent.post(&documents_url); request = 
request.set("Content-Type", "application/x-ndjson"); + request = request.set("Content-Encoding", "gzip"); if let Some(api_key) = api_key { request = request .set("Authorization", &(format!("Bearer {api_key}"))); } - request.send_bytes(&buffer).map_err(into_backoff_error) + request.send_bytes(&compressed_buffer).map_err(into_backoff_error) })?; buffer.clear(); + compressed_buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); From 63031219c543318258aaf4bb268b9e29bebf4968 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 25 Jun 2025 18:24:50 +0200 Subject: [PATCH 13/36] Add the payload size to the parameters --- Cargo.lock | 1 + crates/dump/src/lib.rs | 5 +- crates/index-scheduler/src/dump.rs | 31 ++++++----- crates/index-scheduler/src/insta_snapshot.rs | 4 +- .../src/scheduler/process_batch.rs | 11 +++- .../src/scheduler/process_export.rs | 6 ++- crates/index-scheduler/src/utils.rs | 2 +- crates/meilisearch-types/Cargo.toml | 1 + crates/meilisearch-types/src/error.rs | 1 + crates/meilisearch-types/src/lib.rs | 2 +- crates/meilisearch-types/src/task_view.rs | 14 ++++- crates/meilisearch-types/src/tasks.rs | 42 +++++++++------ crates/meilisearch/src/routes/export.rs | 51 ++++++++++++++++++- 13 files changed, 130 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a883b749f..be6aa4b21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3855,6 +3855,7 @@ dependencies = [ "anyhow", "bumpalo", "bumparaw-collections", + "byte-unit", "convert_case 0.8.0", "csv", "deserr", diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 5c67d7a94..7fd0ea376 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -4,6 +4,7 @@ use std::collections::BTreeMap; use meilisearch_types::batches::BatchId; +use meilisearch_types::byte_unit::Byte; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::Key; use meilisearch_types::milli::update::IndexDocumentsMethod; @@ -148,6 +149,7 @@ pub enum KindDump { Export { url: String, 
api_key: Option, + payload_size: Option, indexes: BTreeMap, }, UpgradeDatabase { @@ -222,9 +224,10 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, - KindWithContent::Export { url, api_key, indexes } => KindDump::Export { + KindWithContent::Export { url, api_key, payload_size, indexes } => KindDump::Export { url, api_key, + payload_size, indexes: indexes .into_iter() .map(|(pattern, settings)| (pattern.to_string(), settings)) diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index 2a99a74aa..1e681c8e8 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -212,20 +212,23 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, - KindDump::Export { url, indexes, api_key } => KindWithContent::Export { - url, - api_key, - indexes: indexes - .into_iter() - .map(|(pattern, settings)| { - Ok(( - IndexUidPattern::try_from(pattern) - .map_err(|_| Error::CorruptedDump)?, - settings, - )) - }) - .collect::>()?, - }, + KindDump::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { + url, + api_key, + payload_size, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| { + Ok(( + IndexUidPattern::try_from(pattern) + .map_err(|_| Error::CorruptedDump)?, + settings, + )) + }) + .collect::>()?, + } + } KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 138b591ff..f48821520 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,8 +289,8 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } - Details::Export { url, api_key, indexes } 
=> { - format!("{{ url: {url:?}, api_key: {api_key:?}, indexes: {indexes:?} }}") + Details::Export { url, api_key, payload_size, indexes } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, payload_size: {payload_size:?}, indexes: {indexes:?} }}") } Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 99278756d..e56b8e13a 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -362,12 +362,19 @@ impl IndexScheduler { Ok((vec![task], ProcessBatchInfo::default())) } Batch::Export { mut task } => { - let KindWithContent::Export { url, indexes, api_key } = &task.kind else { + let KindWithContent::Export { url, api_key, payload_size, indexes } = &task.kind + else { unreachable!() }; let ret = catch_unwind(AssertUnwindSafe(|| { - self.process_export(url, indexes, api_key.as_deref(), progress) + self.process_export( + url, + api_key.as_deref(), + payload_size.as_ref(), + indexes, + progress, + ) })); match ret { diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 180162eda..e777809fd 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -4,6 +4,7 @@ use std::sync::atomic; use std::time::Duration; use backoff::ExponentialBackoff; +use byte_unit::Byte; use flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; @@ -25,8 +26,9 @@ impl IndexScheduler { pub(super) fn process_export( &self, base_url: &str, - indexes: &BTreeMap, api_key: Option<&str>, + payload_size: Option<&Byte>, + indexes: &BTreeMap, progress: Progress, ) -> Result<()> { #[cfg(test)] @@ -122,7 +124,7 @@ impl IndexScheduler { let (step, progress_step) = 
AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); - let limit = 50 * 1024 * 1024; // 50 MiB + let limit = payload_size.map(|ps| ps.as_u64() as usize).unwrap_or(50 * 1024 * 1024); // defaults to 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); request_threads() diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 79571745b..594023145 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -601,7 +601,7 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } - Details::Export { url: _, api_key: _, indexes: _ } => { + Details::Export { url: _, api_key: _, payload_size: _, indexes: _ } => { assert_eq!(kind.as_kind(), Kind::Export); } Details::UpgradeDatabase { from: _, to: _ } => { diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index f76044078..faf59643f 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -15,6 +15,7 @@ actix-web = { version = "4.11.0", default-features = false } anyhow = "1.0.98" bumpalo = "3.18.1" bumparaw-collections = "0.1.4" +byte-unit = { version = "5.1.6", features = ["serde"] } convert_case = "0.8.0" csv = "1.3.1" deserr = { version = "0.6.3", features = ["actix-web"] } diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 08ee803ef..a8f45b4ef 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -392,6 +392,7 @@ InvalidSettingsIndexChat , InvalidRequest , BAD_REQU // Export InvalidExportUrl , InvalidRequest , BAD_REQUEST ; InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; +InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ; InvalidExportIndexFilter , 
InvalidRequest , BAD_REQUEST ; diff --git a/crates/meilisearch-types/src/lib.rs b/crates/meilisearch-types/src/lib.rs index a1a57b7e6..fe69da526 100644 --- a/crates/meilisearch-types/src/lib.rs +++ b/crates/meilisearch-types/src/lib.rs @@ -18,7 +18,7 @@ pub mod versioning; pub use milli::{heed, Index}; use uuid::Uuid; pub use versioning::VERSION_FILE_NAME; -pub use {milli, serde_cs}; +pub use {byte_unit, milli, serde_cs}; pub type Document = serde_json::Map; pub type InstanceUid = Uuid; diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 0a8d7b8fe..1dbd5637b 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -1,5 +1,6 @@ use std::collections::BTreeMap; +use byte_unit::UnitType; use milli::Object; use serde::{Deserialize, Serialize}; use time::{Duration, OffsetDateTime}; @@ -128,6 +129,8 @@ pub struct DetailsView { #[serde(skip_serializing_if = "Option::is_none")] pub api_key: Option, #[serde(skip_serializing_if = "Option::is_none")] + pub payload_size: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub indexes: Option>, } @@ -263,6 +266,13 @@ impl DetailsView { // So we return the first one we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), }, + payload_size: match (self.payload_size.clone(), other.payload_size.clone()) { + (None, None) => None, + (None, Some(size)) | (Some(size), None) => Some(size), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, indexes: match (self.indexes.clone(), other.indexes.clone()) { (None, None) => None, (None, Some(indexes)) | (Some(indexes), None) => Some(indexes), @@ -359,9 +369,11 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } - Details::Export { url, api_key, indexes } => DetailsView { + Details::Export { url, api_key, payload_size, indexes } => DetailsView { url: Some(url), api_key, + payload_size: payload_size + .map(|ps| ps.get_appropriate_unit(UnitType::Both).to_string()), indexes: Some( indexes .into_iter() diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 86951192c..508035bb7 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet}; use std::fmt::{Display, Write}; use std::str::FromStr; +use byte_unit::Byte; use enum_iterator::Sequence; use milli::update::IndexDocumentsMethod; use milli::Object; @@ -159,6 +160,7 @@ pub enum KindWithContent { Export { url: String, api_key: Option, + payload_size: Option, indexes: BTreeMap, }, UpgradeDatabase { @@ -286,11 +288,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), - }), + KindWithContent::Export { url, api_key, payload_size, indexes } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + payload_size: payload_size.clone(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -357,11 +362,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. 
} => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), - }), + KindWithContent::Export { url, api_key, payload_size, indexes } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + payload_size: payload_size.clone(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -410,11 +418,14 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), - }), + KindWithContent::Export { url, api_key, payload_size, indexes } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + payload_size: payload_size.clone(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -681,6 +692,7 @@ pub enum Details { Export { url: String, api_key: Option, + payload_size: Option, indexes: BTreeMap, }, UpgradeDatabase { diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index de1fe2c38..1c519224c 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -1,7 +1,10 @@ use std::collections::BTreeMap; +use std::convert::Infallible; +use std::str::FromStr as _; use actix_web::web::{self, Data}; use actix_web::{HttpRequest, HttpResponse}; +use byte_unit::Byte; use deserr::actix_web::AwebJson; use deserr::Deserr; use index_scheduler::IndexScheduler; @@ -72,7 +75,7 @@ async fn export( let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); - let Export { url, api_key, indexes } = export; + let Export { url, api_key, payload_size, indexes } = export; let indexes = if indexes.is_empty() { BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())]) @@ -85,7 +88,12 @@ async fn export( .collect() }; - let task = KindWithContent::Export { url, api_key, indexes }; + let task = KindWithContent::Export { + url, + api_key, + payload_size: payload_size.map(|ByteWithDeserr(bytes)| bytes), + indexes, + }; let uid = 
get_task_id(&req, &opt)?; let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = @@ -109,12 +117,51 @@ pub struct Export { #[serde(default)] #[deserr(default, error = DeserrJsonError)] pub api_key: Option, + #[schema(value_type = Option, example = json!("24MiB"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub payload_size: Option, #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] #[deserr(default)] #[serde(default)] pub indexes: BTreeMap, } +/// A wrapper around the `Byte` type that implements `Deserr`. +#[derive(Debug, Serialize)] +#[serde(transparent)] +pub struct ByteWithDeserr(pub Byte); + +impl deserr::Deserr for ByteWithDeserr +where + E: deserr::DeserializeError, +{ + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + use deserr::{ErrorKind, Value, ValueKind}; + match value { + Value::Integer(integer) => Ok(ByteWithDeserr(Byte::from_u64(integer))), + Value::String(string) => Byte::from_str(&string).map(ByteWithDeserr).map_err(|e| { + deserr::take_cf_content(E::error::( + None, + ErrorKind::Unexpected { msg: e.to_string() }, + location, + )) + }), + actual => Err(deserr::take_cf_content(E::error( + None, + ErrorKind::IncorrectValueKind { + actual, + accepted: &[ValueKind::Integer, ValueKind::String], + }, + location, + ))), + } + } +} + #[derive(Debug, Deserr, ToSchema, Serialize)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] #[serde(rename_all = "camelCase")] From e6e9a033aa153250b9fe96addb13701d49feccd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 15:45:24 +0200 Subject: [PATCH 14/36] Introduce new analytics to the export route --- crates/meilisearch/src/routes/export.rs | 7 +- .../src/routes/export_analytics.rs | 67 +++++++++++++++++++ crates/meilisearch/src/routes/mod.rs | 1 + 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 
crates/meilisearch/src/routes/export_analytics.rs diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 1c519224c..21a77ae32 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -22,6 +22,7 @@ use utoipa::{OpenApi, ToSchema}; use crate::analytics::Analytics; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::GuardedData; +use crate::routes::export_analytics::ExportAnalytics; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; @@ -67,7 +68,7 @@ async fn export( export: AwebJson, req: HttpRequest, opt: web::Data, - _analytics: Data, + analytics: Data, ) -> Result { // TODO make it experimental? // index_scheduler.features().check_network("Using the /network route")?; @@ -75,6 +76,8 @@ async fn export( let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); + let analytics_aggregate = ExportAnalytics::from_export(&export); + let Export { url, api_key, payload_size, indexes } = export; let indexes = if indexes.is_empty() { @@ -101,6 +104,8 @@ async fn export( .await?? 
.into(); + analytics.publish(analytics_aggregate, &req); + Ok(HttpResponse::Ok().json(task)) } diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs new file mode 100644 index 000000000..7299dba8d --- /dev/null +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -0,0 +1,67 @@ +use crate::analytics::Aggregate; +use crate::routes::export::Export; + +#[derive(Default)] +pub struct ExportAnalytics { + total_received: usize, + has_api_key: bool, + total_index_patterns: usize, + total_patterns_with_filter: usize, + payload_sizes: Vec, +} + +impl ExportAnalytics { + pub fn from_export(export: &Export) -> Self { + let Export { url: _, api_key, payload_size, indexes } = export; + + let has_api_key = api_key.is_some(); + let total_index_patterns = indexes.len(); + let total_patterns_with_filter = + indexes.values().filter(|settings| settings.filter.is_some()).count(); + let payload_sizes = + if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { + vec![byte_size.as_u64()] + } else { + vec![] + }; + + Self { + total_received: 1, + has_api_key, + total_index_patterns, + total_patterns_with_filter, + payload_sizes, + } + } +} + +impl Aggregate for ExportAnalytics { + fn event_name(&self) -> &'static str { + "Export Triggered" + } + + fn aggregate(mut self: Box, other: Box) -> Box { + self.total_received += other.total_received; + self.has_api_key |= other.has_api_key; + self.total_index_patterns += other.total_index_patterns; + self.total_patterns_with_filter += other.total_patterns_with_filter; + self.payload_sizes.extend(other.payload_sizes); + self + } + + fn into_event(self: Box) -> serde_json::Value { + let avg_payload_size = if self.payload_sizes.is_empty() { + None + } else { + Some(self.payload_sizes.iter().sum::() / self.payload_sizes.len() as u64) + }; + + serde_json::json!({ + "total_received": self.total_received, + "has_api_key": self.has_api_key, + "total_index_patterns": 
self.total_index_patterns, + "total_patterns_with_filter": self.total_patterns_with_filter, + "avg_payload_size": avg_payload_size, + }) + } +} diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 748cd5d83..08583d20f 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -55,6 +55,7 @@ pub mod batches; pub mod chats; mod dump; mod export; +mod export_analytics; pub mod features; pub mod indexes; mod logs; From 0bb7866f1e549c8791ac752f90af0dfcbd5fd6a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 15:48:21 +0200 Subject: [PATCH 15/36] Remove the skip embeddings boolean in the settings --- crates/meilisearch-types/src/error.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index a8f45b4ef..1c2840084 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -394,7 +394,6 @@ InvalidExportUrl , InvalidRequest , BAD_REQU InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; -InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ; InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; From bf13268649343ad2a410ca1411b5dce4f5b0fcf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 16:03:13 +0200 Subject: [PATCH 16/36] Better compute aggragates --- .../src/routes/export_analytics.rs | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs index 7299dba8d..44dba2c9b 100644 --- 
a/crates/meilisearch/src/routes/export_analytics.rs +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -5,8 +5,8 @@ use crate::routes::export::Export; pub struct ExportAnalytics { total_received: usize, has_api_key: bool, - total_index_patterns: usize, - total_patterns_with_filter: usize, + sum_index_patterns: usize, + sum_patterns_with_filter: usize, payload_sizes: Vec, } @@ -15,8 +15,8 @@ impl ExportAnalytics { let Export { url: _, api_key, payload_size, indexes } = export; let has_api_key = api_key.is_some(); - let total_index_patterns = indexes.len(); - let total_patterns_with_filter = + let index_patterns_count = indexes.len(); + let patterns_with_filter_count = indexes.values().filter(|settings| settings.filter.is_some()).count(); let payload_sizes = if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { @@ -28,8 +28,8 @@ impl ExportAnalytics { Self { total_received: 1, has_api_key, - total_index_patterns, - total_patterns_with_filter, + sum_index_patterns: index_patterns_count, + sum_patterns_with_filter: patterns_with_filter_count, payload_sizes, } } @@ -43,8 +43,8 @@ impl Aggregate for ExportAnalytics { fn aggregate(mut self: Box, other: Box) -> Box { self.total_received += other.total_received; self.has_api_key |= other.has_api_key; - self.total_index_patterns += other.total_index_patterns; - self.total_patterns_with_filter += other.total_patterns_with_filter; + self.sum_index_patterns += other.sum_index_patterns; + self.sum_patterns_with_filter += other.sum_patterns_with_filter; self.payload_sizes.extend(other.payload_sizes); self } @@ -56,11 +56,23 @@ impl Aggregate for ExportAnalytics { Some(self.payload_sizes.iter().sum::() / self.payload_sizes.len() as u64) }; + let avg_index_patterns = if self.total_received == 0 { + None + } else { + Some(self.sum_index_patterns as f64 / self.total_received as f64) + }; + + let avg_patterns_with_filter = if self.total_received == 0 { + None + } else { + 
Some(self.sum_patterns_with_filter as f64 / self.total_received as f64) + }; + serde_json::json!({ "total_received": self.total_received, "has_api_key": self.has_api_key, - "total_index_patterns": self.total_index_patterns, - "total_patterns_with_filter": self.total_patterns_with_filter, + "avg_index_patterns": avg_index_patterns, + "avg_patterns_with_filter": avg_patterns_with_filter, "avg_payload_size": avg_payload_size, }) } From e3003c1609fda6e0a2af649b8fc7bd3bff429d74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 16:05:12 +0200 Subject: [PATCH 17/36] Improve OpenAPI schema --- crates/meilisearch/src/routes/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 08583d20f..51298411a 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -2,6 +2,7 @@ use std::collections::BTreeMap; use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; +use export::Export; use index_scheduler::IndexScheduler; use meilisearch_auth::AuthController; use meilisearch_types::batch_view::BatchView; @@ -98,7 +99,7 @@ mod tasks_test; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, 
MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, ExportApi, Export)) )] pub struct MeilisearchApi; From b956918c11bd66a02ca9abda1ab905aa178a0ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 16:31:35 +0200 Subject: [PATCH 18/36] Fix clippy and more utoipa issues --- crates/meilisearch-types/src/tasks.rs | 6 +++--- crates/meilisearch/src/routes/mod.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 508035bb7..3301b4320 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -292,7 +292,7 @@ impl KindWithContent { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: 
indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } @@ -366,7 +366,7 @@ impl KindWithContent { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } @@ -422,7 +422,7 @@ impl From<&KindWithContent> for Option
{ Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 51298411a..260d973a1 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -99,7 +99,7 @@ mod tasks_test; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, ExportApi, Export)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, 
UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, Export)) )] pub struct MeilisearchApi; From 0f1dd3614cc86753ca26dc10ebd2cc659659c55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 17:51:57 +0200 Subject: [PATCH 19/36] Update tasks tests --- crates/meilisearch/src/routes/tasks_test.rs | 2 +- crates/meilisearch/tests/batches/errors.rs | 2 +- crates/meilisearch/tests/tasks/errors.rs | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/src/routes/tasks_test.rs b/crates/meilisearch/src/routes/tasks_test.rs index a17b80c82..b09eb0fb3 100644 --- a/crates/meilisearch/src/routes/tasks_test.rs +++ b/crates/meilisearch/src/routes/tasks_test.rs @@ -228,7 +228,7 @@ mod tests { let err = deserr_query_params::(params).unwrap_err(); snapshot!(meili_snap::json_string!(err), @r#" { - "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" diff --git a/crates/meilisearch/tests/batches/errors.rs b/crates/meilisearch/tests/batches/errors.rs index 7f5fedb6a..bfc0d9251 100644 --- a/crates/meilisearch/tests/batches/errors.rs +++ b/crates/meilisearch/tests/batches/errors.rs @@ -42,7 +42,7 @@ async fn batch_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" diff --git a/crates/meilisearch/tests/tasks/errors.rs b/crates/meilisearch/tests/tasks/errors.rs index 759531d42..9970bafa4 100644 --- a/crates/meilisearch/tests/tasks/errors.rs +++ b/crates/meilisearch/tests/tasks/errors.rs @@ -97,7 +97,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" @@ -108,7 +108,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" @@ -119,7 +119,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" From 7fa1c41190620506bd31bcd54c5e4c713903b948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 18:25:49 +0200 Subject: [PATCH 20/36] Fix some api key errors --- crates/meilisearch/tests/auth/api_keys.rs | 2 +- crates/meilisearch/tests/auth/errors.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/tests/auth/api_keys.rs b/crates/meilisearch/tests/auth/api_keys.rs index 5a18b4dbf..2688dd918 100644 --- a/crates/meilisearch/tests/auth/api_keys.rs +++ b/crates/meilisearch/tests/auth/api_keys.rs @@ -421,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###" { - "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, 
`dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/auth/errors.rs b/crates/meilisearch/tests/auth/errors.rs index ebe2e53fa..687cb67a0 100644 --- a/crates/meilisearch/tests/auth/errors.rs +++ b/crates/meilisearch/tests/auth/errors.rs @@ -93,7 +93,7 @@ async fn create_api_key_bad_actions() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, 
`snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" From 657bbf5d1e4f4dba0c816d94ff3ee9002fe0b880 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 27 Jun 2025 10:14:26 +0200 Subject: [PATCH 21/36] Fix more tests --- crates/meilisearch-types/src/tasks.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 3301b4320..a6ed593db 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -614,6 +614,8 @@ impl FromStr for Kind { Ok(Kind::DumpCreation) } else if kind.eq_ignore_ascii_case("snapshotCreation") { Ok(Kind::SnapshotCreation) + } else if kind.eq_ignore_ascii_case("export") { + Ok(Kind::Export) } else if kind.eq_ignore_ascii_case("upgradeDatabase") { Ok(Kind::UpgradeDatabase) } else { From 
72192994363c8fc4060014eecb1905dd88cb979f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 27 Jun 2025 12:23:55 +0200 Subject: [PATCH 22/36] Better handle task abortion --- .../src/scheduler/process_export.rs | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e777809fd..57f79c83f 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -16,7 +16,7 @@ use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; -use ureq::{json, Agent}; +use ureq::{json, Response}; use super::MustStopProcessing; use crate::processing::AtomicDocumentStep; @@ -45,7 +45,7 @@ impl IndexScheduler { }) .collect(); - let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); + let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); for (i, (uid, settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { @@ -272,11 +272,16 @@ fn retry(must_stop_processing: &MustStopProcessing, send_request: F) -> Resul where F: Fn() -> Result>, { - if must_stop_processing.get() { - return Err(Error::AbortedTask); - } - - match backoff::retry(ExponentialBackoff::default(), send_request) { + match backoff::retry(ExponentialBackoff::default(), || { + if must_stop_processing.get() { + return Err(backoff::Error::Permanent(ureq::Error::Status( + u16::MAX, + // 444: Connection Closed Without Response + Response::new(444, "Abort", "Aborted task").unwrap(), + ))); + } + send_request() + }) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), 
Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)), @@ -306,6 +311,9 @@ fn ureq_error_into_error(error: ureq::Error) -> Error { } match error { + // This is a workaround to handle task abortion - the error propagation path + // makes it difficult to cleanly surface the abortion at this level. + ureq::Error::Status(u16::MAX, _) => Error::AbortedTask, ureq::Error::Status(_, response) => match response.into_json() { Ok(MeiliError { message, code, r#type, link }) => { Error::FromRemoteWhenExporting { message, code, r#type, link } From 85037352b95d947151692307c1f00371fed134a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Jun 2025 18:31:32 +0200 Subject: [PATCH 23/36] Fix most of the easy issues --- crates/index-scheduler/src/processing.rs | 4 ++-- .../src/scheduler/create_batch.rs | 6 ++--- .../src/scheduler/process_export.rs | 5 ++-- crates/index-scheduler/src/utils.rs | 2 +- crates/meilisearch-types/src/task_view.rs | 23 ++++++++++++++++++- crates/meilisearch/src/routes/export.rs | 15 ++++++------ 6 files changed, 39 insertions(+), 16 deletions(-) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 5d4ac11c3..631719f73 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -178,8 +178,8 @@ make_enum_progress! { make_enum_progress! { pub enum Export { EnsuringCorrectnessOfTheTarget, - ExportTheSettings, - ExportTheDocuments, + ExporingTheSettings, + ExporingTheDocuments, } } diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index 7a6fa4a9b..b08d27d48 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -510,9 +510,9 @@ impl IndexScheduler { // 3. we batch the export. let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? 
& enqueued; if !to_export.is_empty() { - let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_export)?; - current_batch.processing(&mut tasks); - let task = tasks.pop().expect("There must be only one export task"); + let task_id = to_export.iter().next().expect("There must be only one export task"); + let mut task = self.queue.tasks.get_task(rtxn, task_id)?.unwrap(); + current_batch.processing([&mut task]); current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export }); return Ok(Some((Batch::Export { task }, current_batch))); } diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 57f79c83f..b81ff0b96 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -86,10 +86,11 @@ impl IndexScheduler { } // Retry logic for sending settings let url = format!("{base_url}/indexes/{uid}/settings"); + let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); retry(&must_stop_processing, || { let mut request = agent.patch(&url); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); + if let Some(bearer) = bearer.as_ref() { + request = request.set("Authorization", bearer); } request.send_json(settings.clone()).map_err(into_backoff_error) })?; diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 594023145..2cfe63bff 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -273,7 +273,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { K::TaskCancelation { .. } | K::TaskDeletion { .. } | K::DumpCreation { .. } - | K::Export { .. } // TODO I have patterns, not index uids + | K::Export { .. } | K::UpgradeDatabase { .. 
} | K::SnapshotCreation => (), }; diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 1dbd5637b..7521137c0 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -371,7 +371,10 @@ impl From
for DetailsView { } Details::Export { url, api_key, payload_size, indexes } => DetailsView { url: Some(url), - api_key, + api_key: api_key.map(|mut api_key| { + hide_secret(&mut api_key); + api_key + }), payload_size: payload_size .map(|ps| ps.get_appropriate_unit(UnitType::Both).to_string()), indexes: Some( @@ -390,3 +393,21 @@ impl From
for DetailsView { } } } + +// We definitely need to factorize the code to hide the secret key +fn hide_secret(secret: &mut String) { + match secret.len() { + x if x < 10 => { + secret.replace_range(.., "XXX..."); + } + x if x < 20 => { + secret.replace_range(2.., "XXXX..."); + } + x if x < 30 => { + secret.replace_range(3.., "XXXXX..."); + } + _x => { + secret.replace_range(5.., "XXXXXX..."); + } + } +} diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 21a77ae32..1df2d271e 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -42,17 +42,18 @@ pub fn configure(cfg: &mut web::ServiceConfig) { } #[utoipa::path( - get, + post, path = "", tag = "Export", security(("Bearer" = ["export", "*"])), responses( (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( - { - "indexes": ["movie", "steam-*"], - "skip_embeddings": true, - "apiKey": "meilisearch-api-key" - })), + { + "taskUid": 1, + "status": "enqueued", + "type": "export", + "enqueuedAt": "2021-08-11T09:25:53.000000Z" + })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { "message": "The Authorization header is missing. 
It must use the bearer authorization method.", @@ -126,7 +127,7 @@ pub struct Export { #[serde(default)] #[deserr(default, error = DeserrJsonError)] pub payload_size: Option, - #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] + #[schema(value_type = Option>, example = json!({ "*": { "filter": null } }))] #[deserr(default)] #[serde(default)] pub indexes: BTreeMap, From ad03c86c4493cb1dec38897983bd0a4d6ec21631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Jun 2025 18:46:47 +0200 Subject: [PATCH 24/36] Display an accurate number of uploaded documents --- .../src/scheduler/process_batch.rs | 10 +++++---- .../src/scheduler/process_export.rs | 21 +++++++++++++------ crates/meilisearch-types/src/tasks.rs | 8 +++---- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index e56b8e13a..090ff844d 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -377,9 +377,8 @@ impl IndexScheduler { ) })); - match ret { - // TODO return the matched and exported documents - Ok(Ok(())) => (), + let stats = match ret { + Ok(Ok(stats)) => stats, Ok(Err(Error::AbortedTask)) => return Err(Error::AbortedTask), Ok(Err(e)) => return Err(Error::Export(Box::new(e))), Err(e) => { @@ -394,9 +393,12 @@ impl IndexScheduler { msg.to_string(), )))); } - } + }; task.status = Status::Succeeded; + if let Some(Details::Export { indexes, .. 
}) = task.details.as_mut() { + *indexes = stats; + } Ok((vec![task], ProcessBatchInfo::default())) } Batch::UpgradeDatabase { mut tasks } => { diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index b81ff0b96..bf2917b73 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -14,7 +14,7 @@ use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; -use meilisearch_types::tasks::ExportIndexSettings; +use meilisearch_types::tasks::{DetailsExportIndexSettings, ExportIndexSettings}; use serde::Deserialize; use ureq::{json, Response}; @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap, progress: Progress, - ) -> Result<()> { + ) -> Result> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -41,13 +41,14 @@ impl IndexScheduler { indexes .iter() .find(|(pattern, _)| pattern.matches_str(&uid)) - .map(|(_pattern, settings)| (uid, settings)) + .map(|(pattern, settings)| (pattern, uid, settings)) }) .collect(); + let mut output = BTreeMap::new(); let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); - for (i, (uid, settings)) in indexes.iter().enumerate() { + for (i, (pattern, uid, export_settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -58,7 +59,7 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { filter } = settings; + let ExportIndexSettings { filter } = export_settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; @@ 
-125,6 +126,14 @@ impl IndexScheduler { let (step, progress_step) = AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); + output.insert( + (*pattern).clone(), + DetailsExportIndexSettings { + settings: (*export_settings).clone(), + matched_documents: Some(total_documents as u64), + }, + ); + let limit = payload_size.map(|ps| ps.as_u64() as usize).unwrap_or(50 * 1024 * 1024); // defaults to 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); @@ -265,7 +274,7 @@ impl IndexScheduler { step.store(total_documents, atomic::Ordering::Relaxed); } - Ok(()) + Ok(output) } } diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index a6ed593db..cdbf6d3aa 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -707,16 +707,14 @@ pub enum Details { #[schema(rename_all = "camelCase")] pub struct DetailsExportIndexSettings { #[serde(flatten)] - settings: ExportIndexSettings, + pub settings: ExportIndexSettings, #[serde(skip_serializing_if = "Option::is_none")] - matched_documents: Option, - #[serde(skip_serializing_if = "Option::is_none")] - exported_documents: Option, + pub matched_documents: Option, } impl From for DetailsExportIndexSettings { fn from(settings: ExportIndexSettings) -> Self { - DetailsExportIndexSettings { settings, matched_documents: None, exported_documents: None } + DetailsExportIndexSettings { settings, matched_documents: None } } } From f4bb6cbca894e690e9789a7945cbf1f4f2d5d800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Jun 2025 18:59:16 +0200 Subject: [PATCH 25/36] Better behavior when null indexes --- crates/meilisearch-types/src/tasks.rs | 2 +- crates/meilisearch/src/routes/export.rs | 14 ++++++++------ crates/meilisearch/src/routes/export_analytics.rs | 7 ++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/crates/meilisearch-types/src/tasks.rs 
b/crates/meilisearch-types/src/tasks.rs index cdbf6d3aa..0618fa333 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -210,7 +210,7 @@ impl KindWithContent { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } - | Export { .. } // TODO Should I resolve the index names? + | Export { .. } | UpgradeDatabase { .. } => vec![], DocumentAdditionOrUpdate { index_uid, .. } | DocumentEdition { index_uid, .. } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 1df2d271e..31f8812c7 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -81,15 +81,17 @@ async fn export( let Export { url, api_key, payload_size, indexes } = export; - let indexes = if indexes.is_empty() { - BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())]) - } else { - indexes + let indexes = match indexes { + Some(indexes) => indexes .into_iter() .map(|(pattern, ExportIndexSettings { filter })| { (pattern, DbExportIndexSettings { filter }) }) - .collect() + .collect(), + None => BTreeMap::from([( + IndexUidPattern::new_unchecked("*"), + DbExportIndexSettings::default(), + )]), }; let task = KindWithContent::Export { @@ -130,7 +132,7 @@ pub struct Export { #[schema(value_type = Option>, example = json!({ "*": { "filter": null } }))] #[deserr(default)] #[serde(default)] - pub indexes: BTreeMap, + pub indexes: Option>, } /// A wrapper around the `Byte` type that implements `Deserr`. 
diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs index 44dba2c9b..7ac713e9b 100644 --- a/crates/meilisearch/src/routes/export_analytics.rs +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -15,9 +15,10 @@ impl ExportAnalytics { let Export { url: _, api_key, payload_size, indexes } = export; let has_api_key = api_key.is_some(); - let index_patterns_count = indexes.len(); - let patterns_with_filter_count = - indexes.values().filter(|settings| settings.filter.is_some()).count(); + let index_patterns_count = indexes.as_ref().map_or(0, |indexes| indexes.len()); + let patterns_with_filter_count = indexes.as_ref().map_or(0, |indexes| { + indexes.values().filter(|settings| settings.filter.is_some()).count() + }); let payload_sizes = if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { vec![byte_size.as_u64()] From efd5fd96ccc63a886005b0d42e79cd9a5aaa13f9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:02:42 +0200 Subject: [PATCH 26/36] Add the overrideSettings parameter --- .../src/scheduler/process_export.rs | 83 +++++++++++++------ crates/meilisearch-types/src/error.rs | 1 + crates/meilisearch-types/src/tasks.rs | 1 + crates/meilisearch/src/routes/export.rs | 8 +- 4 files changed, 65 insertions(+), 28 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index bf2917b73..19b2bf743 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -59,42 +59,73 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { filter } = export_settings; + let ExportIndexSettings { filter, override_settings } = export_settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; - // Send the primary key + let url = format!("{base_url}/indexes/{uid}"); + + // First, check if the 
index already exists + let response = retry(&must_stop_processing, || { + let mut request = agent.get(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + + request.send_string("").map_err(into_backoff_error) + })?; + let already_existed = response.status() == 200; + let primary_key = index .primary_key(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - let url = format!("{base_url}/indexes"); - retry(&must_stop_processing, || { - let mut request = agent.post(&url); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); - } - let index_param = json!({ "uid": uid, "primaryKey": primary_key }); - request.send_json(&index_param).map_err(into_backoff_error) - })?; + // Create the index + if !already_existed { + let url = format!("{base_url}/indexes"); + retry(&must_stop_processing, || { + let mut request = agent.post(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let index_param = json!({ "uid": uid, "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; + } + + // Patch the index primary key + if already_existed && *override_settings { + let url = format!("{base_url}/indexes/{uid}"); + retry(&must_stop_processing, || { + let mut request = agent.patch(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let index_param = json!({ "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; + } // Send the index settings - let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // Remove the experimental chat setting if not enabled - if self.features().check_chat_completions("exporting chat settings").is_err() { 
- settings.chat = Setting::NotSet; - } - // Retry logic for sending settings - let url = format!("{base_url}/indexes/{uid}/settings"); - let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); - retry(&must_stop_processing, || { - let mut request = agent.patch(&url); - if let Some(bearer) = bearer.as_ref() { - request = request.set("Authorization", bearer); + if !already_existed || *override_settings { + let mut settings = + settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // Remove the experimental chat setting if not enabled + if self.features().check_chat_completions("exporting chat settings").is_err() { + settings.chat = Setting::NotSet; } - request.send_json(settings.clone()).map_err(into_backoff_error) - })?; + // Retry logic for sending settings + let url = format!("{base_url}/indexes/{uid}/settings"); + let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); + retry(&must_stop_processing, || { + let mut request = agent.patch(&url); + if let Some(bearer) = bearer.as_ref() { + request = request.set("Authorization", bearer); + } + request.send_json(settings.clone()).map_err(into_backoff_error) + })?; + } let filter = filter .as_ref() diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 1c2840084..30f6868f6 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -395,6 +395,7 @@ InvalidExportApiKey , InvalidRequest , BAD_REQU InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexOverrideSettings , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ; diff --git 
a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index cdbf6d3aa..0618fa333 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -210,7 +210,7 @@ impl KindWithContent { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } - | Export { .. } // TODO Should I resolve the index names? + | Export { .. } | UpgradeDatabase { .. } => vec![], DocumentAdditionOrUpdate { index_uid, .. } | DocumentEdition { index_uid, .. } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 1df2d271e..31f8812c7 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -84,8 +84,8 @@ async fn export( let indexes = match indexes { Some(indexes) => indexes .into_iter() - .map(|(pattern, ExportIndexSettings { filter })| { - (pattern, DbExportIndexSettings { filter }) + .map(|(pattern, ExportIndexSettings { filter, override_settings })| { + (pattern, DbExportIndexSettings { filter, override_settings }) }) .collect(), None => BTreeMap::from([( @@ -179,4 +179,8 @@ pub struct ExportIndexSettings { #[serde(default)] #[deserr(default, error = DeserrJsonError)] pub filter: Option, + #[schema(value_type = Option, example = json!(true))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub override_settings: bool, } From 9cfbef478eb80258b1698c75abe80b5a0f92b85b Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:04:59 +0200 Subject: [PATCH 27/36] Add override settings to analytics --- crates/meilisearch/src/routes/export_analytics.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs index 7ac713e9b..b66a5133b 100644 --- a/crates/meilisearch/src/routes/export_analytics.rs +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -7,6 +7,7 @@ pub struct ExportAnalytics { has_api_key: bool, sum_index_patterns: usize, sum_patterns_with_filter: usize, + sum_patterns_with_override_settings: usize,
payload_sizes: Vec, } @@ -19,6 +20,9 @@ impl ExportAnalytics { let patterns_with_filter_count = indexes.as_ref().map_or(0, |indexes| { indexes.values().filter(|settings| settings.filter.is_some()).count() }); + let patterns_with_override_settings_count = indexes.as_ref().map_or(0, |indexes| { + indexes.values().filter(|settings| settings.override_settings).count() + }); let payload_sizes = if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { vec![byte_size.as_u64()] @@ -31,6 +35,7 @@ impl ExportAnalytics { has_api_key, sum_index_patterns: index_patterns_count, sum_patterns_with_filter: patterns_with_filter_count, + sum_patterns_with_override_settings: patterns_with_override_settings_count, payload_sizes, } } @@ -46,6 +51,7 @@ impl Aggregate for ExportAnalytics { self.has_api_key |= other.has_api_key; self.sum_index_patterns += other.sum_index_patterns; self.sum_patterns_with_filter += other.sum_patterns_with_filter; + self.sum_patterns_with_override_settings += other.sum_patterns_with_override_settings; self.payload_sizes.extend(other.payload_sizes); self } @@ -69,11 +75,18 @@ impl Aggregate for ExportAnalytics { Some(self.sum_patterns_with_filter as f64 / self.total_received as f64) }; + let avg_patterns_with_override_settings = if self.total_received == 0 { + None + } else { + Some(self.sum_patterns_with_override_settings as f64 / self.total_received as f64) + }; + serde_json::json!({ "total_received": self.total_received, "has_api_key": self.has_api_key, "avg_index_patterns": avg_index_patterns, "avg_patterns_with_filter": avg_patterns_with_filter, + "avg_patterns_with_override_settings": avg_patterns_with_override_settings, "avg_payload_size": avg_payload_size, }) } From 259fc067d33ff78593ae3b842ea2aabd169f7ac5 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:14:59 +0200 Subject: [PATCH 28/36] Count exported documents by index name, not pattern --- .../src/scheduler/process_export.rs | 9 ++++----- 
crates/meilisearch-types/src/tasks.rs | 14 +++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 19b2bf743..d1f5616b7 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap, progress: Progress, - ) -> Result> { + ) -> Result> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -48,7 +48,7 @@ impl IndexScheduler { let mut output = BTreeMap::new(); let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); - for (i, (pattern, uid, export_settings)) in indexes.iter().enumerate() { + for (i, (_pattern, uid, export_settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -63,9 +63,8 @@ impl IndexScheduler { let index = self.index(uid)?; let index_rtxn = index.read_txn()?; - let url = format!("{base_url}/indexes/{uid}"); - // First, check if the index already exists + let url = format!("{base_url}/indexes/{uid}"); let response = retry(&must_stop_processing, || { let mut request = agent.get(&url); if let Some(api_key) = api_key { @@ -158,7 +157,7 @@ impl IndexScheduler { progress.update_progress(progress_step); output.insert( - (*pattern).clone(), + uid.clone(), DetailsExportIndexSettings { settings: (*export_settings).clone(), matched_documents: Some(total_documents as u64), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 99b04f1e3..423cf539e 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,12 +289,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. 
} => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -363,12 +363,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -419,12 +419,12 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -696,7 +696,7 @@ pub enum Details { url: String, api_key: Option, payload_size: Option, - indexes: BTreeMap, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), From d439a3cb9d05f6b69a41a7a1fd4370c0cd1ce128 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:39:24 +0200 Subject: [PATCH 29/36] Fix progress names --- crates/index-scheduler/src/processing.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 631719f73..2aa7cf859 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -178,8 +178,8 @@ make_enum_progress! { make_enum_progress! 
{ pub enum Export { EnsuringCorrectnessOfTheTarget, - ExporingTheSettings, - ExporingTheDocuments, + ExportingTheSettings, + ExportingTheDocuments, } } From 074d509d9280cdc277b80950dec111737126c375 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:39:52 +0200 Subject: [PATCH 30/36] Fix expect message --- crates/index-scheduler/src/scheduler/create_batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index b08d27d48..693275c32 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -510,7 +510,7 @@ impl IndexScheduler { // 3. we batch the export. let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? & enqueued; if !to_export.is_empty() { - let task_id = to_export.iter().next().expect("There must be only one export task"); + let task_id = to_export.iter().next().expect("There must be at least one export task"); let mut task = self.queue.tasks.get_task(rtxn, task_id)?.unwrap(); current_batch.processing([&mut task]); current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export }); From 9dac91efe056d17eeabe18aaafdd1da401b44416 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:40:39 +0200 Subject: [PATCH 31/36] Fix utoipa response --- crates/meilisearch/src/routes/export.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 172a162c6..97356f7eb 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -47,7 +47,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { tag = "Export", security(("Bearer" = ["export", "*"])), responses( - (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( + (status = 
202, description = "Export successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( { "taskUid": 1, "status": "enqueued", From c078efd730ffec4a4f2d9670437287d080269ca9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:40:59 +0200 Subject: [PATCH 32/36] Remove experimental todo --- crates/meilisearch/src/routes/export.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 97356f7eb..a4b6720d1 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -71,9 +71,6 @@ async fn export( opt: web::Data, analytics: Data, ) -> Result { - // TODO make it experimental? - // index_scheduler.features().check_network("Using the /network route")?; - let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); From 25c19a306b1fa4967b013066c693012293347272 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:42:44 +0200 Subject: [PATCH 33/36] Rename variable Co-authored-by: Kero --- crates/index-scheduler/src/scheduler/process_export.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index d1f5616b7..b5134deb9 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -71,16 +71,16 @@ impl IndexScheduler { request = request.set("Authorization", &format!("Bearer {api_key}")); } - request.send_string("").map_err(into_backoff_error) + request.send_bytes(Default::default()).map_err(into_backoff_error) })?; - let already_existed = response.status() == 200; + let index_exists = response.status() == 200; let primary_key = index .primary_key(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; // Create the index - if 
!already_existed { + if !index_exists { let url = format!("{base_url}/indexes"); retry(&must_stop_processing, || { let mut request = agent.post(&url); @@ -93,7 +93,7 @@ impl IndexScheduler { } // Patch the index primary key - if already_existed && *override_settings { + if index_exists && *override_settings { let url = format!("{base_url}/indexes/{uid}"); retry(&must_stop_processing, || { let mut request = agent.patch(&url); @@ -106,7 +106,7 @@ impl IndexScheduler { } // Send the index settings - if !already_existed || *override_settings { + if !index_exists || *override_settings { let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; From 37a692f942253c128980e31e6d3be75b94a12a0e Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:47:43 +0200 Subject: [PATCH 34/36] Keep `IndexUidPattern` --- .../src/scheduler/process_export.rs | 4 ++-- crates/meilisearch-types/src/tasks.rs | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index b5134deb9..eaad7aa34 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap, progress: Progress, - ) -> Result> { + ) -> Result> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -157,7 +157,7 @@ impl IndexScheduler { progress.update_progress(progress_step); output.insert( - uid.clone(), + IndexUidPattern::new_unchecked(uid.clone()), DetailsExportIndexSettings { settings: (*export_settings).clone(), matched_documents: Some(total_documents as u64), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 423cf539e..99b04f1e3 100644 --- 
a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,12 +289,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -363,12 +363,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -419,12 +419,12 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -696,7 +696,7 @@ pub enum Details { url: String, api_key: Option, payload_size: Option, - indexes: BTreeMap, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), From b7bebe9bbb33b4ba87408362068f732281f609ea Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 15:03:04 +0200 Subject: [PATCH 35/36] Fix export when index already exists --- crates/index-scheduler/src/scheduler/process_export.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index eaad7aa34..676481319 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -72,8 +72,12 @@ impl IndexScheduler { } request.send_bytes(Default::default()).map_err(into_backoff_error) - })?; - let index_exists = response.status() == 200; + }); + let index_exists = match response { + Ok(response) => response.status() == 200, + Err(Error::FromRemoteWhenExporting { code, .. 
}) if code == "index_not_found" => false, + Err(e) => return Err(e), + }; let primary_key = index .primary_key(&index_rtxn) From 9211e94c4f019a890175a109b1ce78a43c10bb5f Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 15:03:20 +0200 Subject: [PATCH 36/36] Format --- crates/index-scheduler/src/scheduler/process_export.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 676481319..30721065e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -75,7 +75,9 @@ impl IndexScheduler { }); let index_exists = match response { Ok(response) => response.status() == 200, - Err(Error::FromRemoteWhenExporting { code, .. }) if code == "index_not_found" => false, + Err(Error::FromRemoteWhenExporting { code, .. }) if code == "index_not_found" => { + false + } Err(e) => return Err(e), };