Merge #3550

3550: Delete documents by filter r=irevoire a=dureuill # Prototype `prototype-delete-by-filter-0` Usage: A new route is available under `POST /indexes/{index_uid}/documents/delete` that allows you to delete your documents by filter. The expected payload looks like that: ```json { "filter": "doggo = bernese", } ``` It'll then enqueue a task in your task queue that'll delete all the documents matching this filter once it's processed. Here is an example of the associated details; ```json "details": { "deletedDocuments": 53, "originalFilter": "\"doggo = bernese\"" } ``` ---------- # Pull Request ## Related issue Related to https://github.com/meilisearch/meilisearch/issues/3477 ## What does this PR do? ### User standpoint - Modifies the `/indexes/{:indexUid}/documents/delete-batch` route to accept either the existing array of documents ids, or a JSON object with a `filter` field representing a filter to apply. If that latter variant is used, any document matching the filter will be deleted. ### Implementation standpoint - (processing time version) Adds a new BatchKind that is not autobatchable and that performs the delete by filter - Reuse the `documentDeletion` task with a new `originalFilter` detail that replaces the `providedIds` detail. ## Example <details> <summary>Sample request, response and task result</summary> Request: ``` curl \ -X POST 'http://localhost:7700/indexes/index-10/documents/delete-batch' \ -H 'Content-Type: application/json' \ --data-binary '{ "filter" : "mass = 600"}' ``` Response: ``` { "taskUid": 3902, "indexUid": "index-10", "status": "enqueued", "type": "documentDeletion", "enqueuedAt": "2023-02-28T20:50:31.667502Z" } ``` Task log: ```json { "uid": 3906, "indexUid": "index-12", "status": "succeeded", "type": "documentDeletion", "canceledBy": null, "details": { "deletedDocuments": 3, "originalFilter": "\"mass = 600\"" }, "error": null, "duration": "PT0.001819S", "enqueuedAt": "2023-03-07T08:57:20.11387Z", "startedAt": "2023-03-07T08:57:20.115895Z", "finishedAt": "2023-03-07T08:57:20.117714Z" } ``` </details> ## Draft status - [ ] Error handling - [ ] Analytics - [ ] Do we want to reuse the `delete-batch` route in this way, or create a new route instead? - [ ] Should the filter be applied at request time or when the deletion task is processed? - The first commit in this PR applies the filter at request time, meaning that even if a document is modified in a way that no longer matches the filter in a later update, it will be deleted as long as the deletion task is processed after that update. - The other commits in this PR apply the filter only when the asynchronous deletion task is processed, meaning that documents that match the filter at processing time are deleted even if they didn't match the filter at request time. - [ ] If keeping the filter at request time, find a more elegant way to recover the user document ids from the internal document ids. The current way implemented in the first commit of this PR involves getting all the documents matching the filter, looking for the value of their primary key, and turning it into a string by copy-pasting routines found in milli... - [ ] Security consideration, if any - [ ] Fix the tests (but waiting until product questions are resolved) - [ ] Add delete by filter specific tests Co-authored-by: Louis Dureuil <louis@meilisearch.com> Co-authored-by: Tamo <tamo@meilisearch.com>
2025-07-03 03:47:02 +02:00 · 2023-05-04 10:44:41 +00:00 · 2023-05-04 10:44:41 +00:00 · a95128df6b
commit a95128df6b
parent e0537c3870 d5059520aa
20 changed files with 688 additions and 15 deletions
--- a/meilisearch/src/analytics/mod.rs
+++ b/meilisearch/src/analytics/mod.rs
@ -64,6 +64,7 @@ pub enum DocumentDeletionKind {
    PerDocumentId,
    ClearAll,
    PerBatch,
+    PerFilter,
 }

 pub trait Analytics: Sync + Send {
--- a/meilisearch/src/analytics/segment_analytics.rs
+++ b/meilisearch/src/analytics/segment_analytics.rs
@ -949,6 +949,7 @@ pub struct DocumentsDeletionAggregator {
    per_document_id: bool,
    clear_all: bool,
    per_batch: bool,
+    per_filter: bool,
 }

 impl DocumentsDeletionAggregator {
@ -962,6 +963,7 @@ impl DocumentsDeletionAggregator {
            DocumentDeletionKind::PerDocumentId => ret.per_document_id = true,
            DocumentDeletionKind::ClearAll => ret.clear_all = true,
            DocumentDeletionKind::PerBatch => ret.per_batch = true,
+            DocumentDeletionKind::PerFilter => ret.per_filter = true,
        }

        ret
@ -981,6 +983,7 @@ impl DocumentsDeletionAggregator {
        self.per_document_id |= other.per_document_id;
        self.clear_all |= other.clear_all;
        self.per_batch |= other.per_batch;
+        self.per_filter |= other.per_filter;
    }

    pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
--- a/meilisearch/src/error.rs
+++ b/meilisearch/src/error.rs
@ -20,6 +20,8 @@ pub enum MeilisearchHttpError {
    InvalidContentType(String, Vec<String>),
    #[error("Document `{0}` not found.")]
    DocumentNotFound(String),
+    #[error("Sending an empty filter is forbidden.")]
+    EmptyFilter,
    #[error("Invalid syntax for the filter parameter: `expected {}, found: {1}`.", .0.join(", "))]
    InvalidExpression(&'static [&'static str], Value),
    #[error("A {0} payload is missing.")]
@ -58,6 +60,7 @@ impl ErrorCode for MeilisearchHttpError {
            MeilisearchHttpError::MissingPayload(_) => Code::MissingPayload,
            MeilisearchHttpError::InvalidContentType(_, _) => Code::InvalidContentType,
            MeilisearchHttpError::DocumentNotFound(_) => Code::DocumentNotFound,
+            MeilisearchHttpError::EmptyFilter => Code::InvalidDocumentDeleteFilter,
            MeilisearchHttpError::InvalidExpression(_, _) => Code::InvalidSearchFilter,
            MeilisearchHttpError::PayloadTooLarge => Code::PayloadTooLarge,
            MeilisearchHttpError::SwapIndexPayloadWrongLength(_) => Code::InvalidSwapIndexes,
--- a/meilisearch/src/routes/indexes/documents.rs
+++ b/meilisearch/src/routes/indexes/documents.rs
@ -4,13 +4,13 @@ use actix_web::http::header::CONTENT_TYPE;
 use actix_web::web::Data;
 use actix_web::{web, HttpMessage, HttpRequest, HttpResponse};
 use bstr::ByteSlice;
-use deserr::actix_web::AwebQueryParameter;
+use deserr::actix_web::{AwebJson, AwebQueryParameter};
 use deserr::Deserr;
 use futures::StreamExt;
 use index_scheduler::IndexScheduler;
 use log::debug;
 use meilisearch_types::deserr::query_params::Param;
-use meilisearch_types::deserr::DeserrQueryParamError;
+use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
 use meilisearch_types::document_formats::{read_csv, read_json, read_ndjson, PayloadType};
 use meilisearch_types::error::deserr_codes::*;
 use meilisearch_types::error::{Code, ResponseError};
@ -71,8 +71,11 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
            .route(web::put().to(SeqHandler(update_documents)))
            .route(web::delete().to(SeqHandler(clear_all_documents))),
    )
-    // this route needs to be before the /documents/{document_id} to match properly
-    .service(web::resource("/delete-batch").route(web::post().to(SeqHandler(delete_documents))))
+    // these routes need to be before the /documents/{document_id} to match properly
+    .service(
+        web::resource("/delete-batch").route(web::post().to(SeqHandler(delete_documents_batch))),
+    )
+    .service(web::resource("/delete").route(web::post().to(SeqHandler(delete_documents_by_filter))))
    .service(
        web::resource("/{document_id}")
            .route(web::get().to(SeqHandler(get_document)))
@ -373,7 +376,7 @@ async fn document_addition(
    Ok(task.into())
 }

-pub async fn delete_documents(
+pub async fn delete_documents_batch(
    index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
    index_uid: web::Path<String>,
    body: web::Json<Vec<Value>>,
@ -399,6 +402,42 @@ pub async fn delete_documents(
    Ok(HttpResponse::Accepted().json(task))
 }

+#[derive(Debug, Deserr)]
+#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
+pub struct DocumentDeletionByFilter {
+    #[deserr(error = DeserrJsonError<InvalidDocumentDeleteFilter>)]
+    filter: Value,
+}
+
+pub async fn delete_documents_by_filter(
+    index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
+    index_uid: web::Path<String>,
+    body: AwebJson<DocumentDeletionByFilter, DeserrJsonError>,
+    req: HttpRequest,
+    analytics: web::Data<dyn Analytics>,
+) -> Result<HttpResponse, ResponseError> {
+    debug!("called with params: {:?}", body);
+    let index_uid = IndexUid::try_from(index_uid.into_inner())?;
+    let index_uid = index_uid.into_inner();
+    let filter = body.into_inner().filter;
+
+    analytics.delete_documents(DocumentDeletionKind::PerFilter, &req);
+
+    // we ensure the filter is well formed before enqueuing it
+    || -> Result<_, ResponseError> {
+        Ok(crate::search::parse_filter(&filter)?.ok_or(MeilisearchHttpError::EmptyFilter)?)
+    }()
+    // and whatever was the error, the error code should always be an InvalidDocumentDeleteFilter
+    .map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentDeleteFilter))?;
+    let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter };
+
+    let task: SummarizedTaskView =
+        tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
+
+    debug!("returns: {:?}", task);
+    Ok(HttpResponse::Accepted().json(task))
+}
+
 pub async fn clear_all_documents(
    index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
    index_uid: web::Path<String>,
--- a/meilisearch/src/routes/tasks.rs
+++ b/meilisearch/src/routes/tasks.rs
@ -133,6 +133,14 @@ impl From<Details> for DetailsView {
                deleted_documents: Some(deleted_documents),
                ..DetailsView::default()
            },
+            Details::DocumentDeletionByFilter { original_filter, deleted_documents } => {
+                DetailsView {
+                    provided_ids: Some(0),
+                    original_filter: Some(original_filter),
+                    deleted_documents: Some(deleted_documents),
+                    ..DetailsView::default()
+                }
+            }
            Details::ClearAll { deleted_documents } => {
                DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() }
            }
@ -721,7 +729,7 @@ mod tests {
            let err = deserr_query_params::<TaskDeletionOrCancelationQuery>(params).unwrap_err();
            snapshot!(meili_snap::json_string!(err), @r###"
            {
-              "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
+              "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `documentDeletionByFilter`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
              "code": "invalid_task_types",
              "type": "invalid_request",
              "link": "https://docs.meilisearch.com/errors#invalid_task_types"
--- a/meilisearch/src/search.rs
+++ b/meilisearch/src/search.rs
@ -745,7 +745,7 @@ fn format_value<A: AsRef<[u8]>>(
    }
 }

-fn parse_filter(facets: &Value) -> Result<Option<Filter>, MeilisearchHttpError> {
+pub fn parse_filter(facets: &Value) -> Result<Option<Filter>, MeilisearchHttpError> {
    match facets {
        Value::String(expr) => {
            let condition = Filter::from_str(expr)?;