diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs
index e4ca39c8b..bcd8320c9 100644
--- a/meilisearch-types/src/error.rs
+++ b/meilisearch-types/src/error.rs
@@ -214,6 +214,7 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ;
 InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
 InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
+InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ;
diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs
index a6ee8d16e..eb0f5a59e 100644
--- a/meilisearch/src/routes/indexes/documents.rs
+++ b/meilisearch/src/routes/indexes/documents.rs
@@ -17,6 +17,7 @@ use meilisearch_types::error::{Code, ResponseError};
 use meilisearch_types::heed::RoTxn;
 use meilisearch_types::index_uid::IndexUid;
 use meilisearch_types::milli::update::IndexDocumentsMethod;
+use meilisearch_types::milli::DocumentId;
 use meilisearch_types::star_or::OptionStarOrList;
 use meilisearch_types::tasks::KindWithContent;
 use meilisearch_types::{milli, Document, Index};
@@ -36,6 +37,7 @@ use crate::extractors::authentication::GuardedData;
 use crate::extractors::payload::Payload;
 use crate::extractors::sequential_extractor::SeqHandler;
 use crate::routes::{PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
+use crate::search::parse_filter;
 
 static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
     vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()]
@@ -66,7 +68,7 @@ pub struct DocumentParam {
 pub fn configure(cfg: &mut web::ServiceConfig) {
     cfg.service(
         web::resource("")
-            .route(web::get().to(SeqHandler(get_all_documents)))
+            .route(web::get().to(SeqHandler(get_documents)))
             .route(web::post().to(SeqHandler(replace_documents)))
             .route(web::put().to(SeqHandler(update_documents)))
             .route(web::delete().to(SeqHandler(clear_all_documents))),
@@ -76,6 +78,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
         web::resource("/delete-batch").route(web::post().to(SeqHandler(delete_documents_batch))),
     )
     .service(web::resource("/delete").route(web::post().to(SeqHandler(delete_documents_by_filter))))
+    .service(web::resource("/fetch").route(web::post().to(SeqHandler(documents_by_query_post))))
     .service(
         web::resource("/{document_id}")
             .route(web::get().to(SeqHandler(get_document)))
@@ -130,29 +133,79 @@ pub async fn delete_document(
 
 #[derive(Debug, Deserr)]
 #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)]
-pub struct BrowseQuery {
+pub struct BrowseQueryGet {
     #[deserr(default, error = DeserrQueryParamError<InvalidDocumentOffset>)]
     offset: Param<usize>,
     #[deserr(default = Param(PAGINATION_DEFAULT_LIMIT), error = DeserrQueryParamError<InvalidDocumentLimit>)]
     limit: Param<usize>,
     #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
     fields: OptionStarOrList<String>,
+    #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)]
+    filter: Option<String>,
 }
 
-pub async fn get_all_documents(
+#[derive(Debug, Deserr)]
+#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
+pub struct BrowseQuery {
+    #[deserr(default, error = DeserrJsonError<InvalidDocumentOffset>)]
+    offset: usize,
+    #[deserr(default = PAGINATION_DEFAULT_LIMIT, error = DeserrJsonError<InvalidDocumentLimit>)]
+    limit: usize,
+    #[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)]
+    fields: Option<Vec<String>>,
+    #[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
+    filter: Option<Value>,
+}
+
+pub async fn documents_by_query_post(
     index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
     index_uid: web::Path<String>,
-    params: AwebQueryParameter<BrowseQuery, DeserrQueryParamError>,
+    body: AwebJson<BrowseQuery, DeserrJsonError>,
+) -> Result<HttpResponse, ResponseError> {
+    debug!("called with body: {:?}", body);
+
+    documents_by_query(&index_scheduler, index_uid, body.into_inner())
+}
+
+pub async fn get_documents(
+    index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
+    index_uid: web::Path<String>,
+    params: AwebQueryParameter<BrowseQueryGet, DeserrQueryParamError>,
+) -> Result<HttpResponse, ResponseError> {
+    debug!("called with params: {:?}", params);
+
+    let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner();
+
+    let filter = match filter {
+        Some(f) => match serde_json::from_str(&f) {
+            Ok(v) => Some(v),
+            _ => Some(Value::String(f)),
+        },
+        None => None,
+    };
+
+    let query = BrowseQuery {
+        offset: offset.0,
+        limit: limit.0,
+        fields: fields.merge_star_and_none(),
+        filter,
+    };
+
+    documents_by_query(&index_scheduler, index_uid, query)
+}
+
+fn documents_by_query(
+    index_scheduler: &IndexScheduler,
+    index_uid: web::Path<String>,
+    query: BrowseQuery,
 ) -> Result<HttpResponse, ResponseError> {
     let index_uid = IndexUid::try_from(index_uid.into_inner())?;
-    debug!("called with params: {:?}", params);
-    let BrowseQuery { limit, offset, fields } = params.into_inner();
 
-    let attributes_to_retrieve = fields.merge_star_and_none();
+    let BrowseQuery { offset, limit, fields, filter } = query;
 
     let index = index_scheduler.index(&index_uid)?;
-    let (total, documents) = retrieve_documents(&index, offset.0, limit.0, attributes_to_retrieve)?;
+    let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?;
 
-    let ret = PaginationView::new(offset.0, limit.0, total as usize, documents);
+    let ret = PaginationView::new(offset, limit, total as usize, documents);
 
     debug!("returns: {:?}", ret);
     Ok(HttpResponse::Ok().json(ret))
@@ -455,14 +508,15 @@ pub async fn clear_all_documents(
     Ok(HttpResponse::Accepted().json(task))
 }
 
-fn all_documents<'a>(
-    index: &Index,
-    rtxn: &'a RoTxn,
+fn some_documents<'a, 't: 'a>(
+    index: &'a Index,
+    rtxn: &'t RoTxn,
+    doc_ids: impl IntoIterator<Item = DocumentId> + 'a,
 ) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> {
     let fields_ids_map = index.fields_ids_map(rtxn)?;
     let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
 
-    Ok(index.all_documents(rtxn)?.map(move |ret| {
+    Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| {
         ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> {
             Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?)
         })
@@ -473,24 +527,45 @@ fn retrieve_documents<S: AsRef<str>>(
     index: &Index,
     offset: usize,
     limit: usize,
+    filter: Option<Value>,
     attributes_to_retrieve: Option<Vec<S>>,
 ) -> Result<(u64, Vec<Document>), ResponseError> {
     let rtxn = index.read_txn()?;
+    let filter = &filter;
+    let filter = if let Some(filter) = filter {
+        parse_filter(filter)
+            .map_err(|err| ResponseError::from_msg(err.to_string(), Code::InvalidDocumentFilter))?
+    } else {
+        None
+    };
 
-    let mut documents = Vec::new();
-    for document in all_documents(index, &rtxn)?.skip(offset).take(limit) {
-        let document = match &attributes_to_retrieve {
-            Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
-                &document?,
-                attributes_to_retrieve.iter().map(|s| s.as_ref()),
-            ),
-            None => document?,
-        };
-        documents.push(document);
-    }
+    let candidates = if let Some(filter) = filter {
+        filter.evaluate(&rtxn, index)?
+    } else {
+        index.documents_ids(&rtxn)?
+    };
 
-    let number_of_documents = index.number_of_documents(&rtxn)?;
-    Ok((number_of_documents, documents))
+    let (it, number_of_documents) = {
+        let number_of_documents = candidates.len();
+        (
+            some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?,
+            number_of_documents,
+        )
+    };
+
+    let documents: Result<Vec<_>, ResponseError> = it
+        .map(|document| {
+            Ok(match &attributes_to_retrieve {
+                Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
+                    &document?,
+                    attributes_to_retrieve.iter().map(|s| s.as_ref()),
+                ),
+                None => document?,
+            })
+        })
+        .collect();
+
+    Ok((number_of_documents, documents?))
 }
 
 fn retrieve_document<S: AsRef<str>>(
diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs
index 81c5972b8..581f4b653 100644
--- a/meilisearch/src/search.rs
+++ b/meilisearch/src/search.rs
@@ -745,7 +745,7 @@ fn format_value<A: AsRef<[u8]>>(
     }
 }
 
-pub fn parse_filter(facets: &Value) -> Result<Option<Filter>, MeilisearchHttpError> {
+pub(crate) fn parse_filter(facets: &Value) -> Result<Option<Filter>, MeilisearchHttpError> {
     match facets {
         Value::String(expr) => {
             let condition = Filter::from_str(expr)?;
diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs
index 38b43e910..beab9970b 100644
--- a/meilisearch/tests/common/index.rs
+++ b/meilisearch/tests/common/index.rs
@@ -198,6 +198,11 @@ impl Index<'_> {
         self.service.get(url).await
     }
 
+    pub async fn get_document_by_filter(&self, payload: Value) -> (Value, StatusCode) {
+        let url = format!("/indexes/{}/documents/fetch", urlencode(self.uid.as_ref()));
+        self.service.post(url, payload).await
+    }
+
     pub async fn get_all_documents_raw(&self, options: &str) -> (Value, StatusCode) {
         let url = format!("/indexes/{}/documents{}", urlencode(self.uid.as_ref()), options);
         self.service.get(url).await
diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs
index b857d4e1d..b72dc40f3 100644
--- a/meilisearch/tests/documents/errors.rs
+++ b/meilisearch/tests/documents/errors.rs
@@ -82,6 +82,111 @@ async fn get_all_documents_bad_limit() {
     "###);
 }
 
+#[actix_rt::test]
+async fn get_all_documents_bad_filter() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    // Since the filter can't be parsed automatically by deserr, we get the wrong error message
+    // when the index does not exist: one could expect an error about the invalid filter before
+    // the existence of the index is checked, but that is not the case.
+    let (response, code) = index.get_all_documents_raw("?filter").await;
+    snapshot!(code, @"404 Not Found");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Index `test` not found.",
+      "code": "index_not_found",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#index_not_found"
+    }
+    "###);
+
+    let (response, code) = index.get_all_documents_raw("?filter=doggo").await;
+    snapshot!(code, @"404 Not Found");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Index `test` not found.",
+      "code": "index_not_found",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#index_not_found"
+    }
+    "###);
+
+    let (response, code) = index.get_all_documents_raw("?filter=doggo=bernese").await;
+    snapshot!(code, @"404 Not Found");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Index `test` not found.",
+      "code": "index_not_found",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#index_not_found"
+    }
+    "###);
+
+    let (response, code) = index.create(None).await;
+    snapshot!(code, @"202 Accepted");
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "taskUid": 0,
+      "indexUid": "test",
+      "status": "enqueued",
+      "type": "indexCreation",
+      "enqueuedAt": "[date]"
+    }
+    "###);
+    let response = server.wait_task(0).await;
+    snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###"
+    {
+      "uid": 0,
+      "indexUid": "test",
+      "status": "succeeded",
+      "type": "indexCreation",
+      "canceledBy": null,
+      "details": {
+        "primaryKey": null
+      },
+      "error": null,
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let (response, code) = index.get_all_documents_raw("?filter").await;
+    snapshot!(code, @"200 OK");
+    snapshot!(json_string!(response), @r###"
+    {
+      "results": [],
+      "offset": 0,
+      "limit": 20,
+      "total": 0
+    }
+    "###);
+
+    let (response, code) = index.get_all_documents_raw("?filter=doggo").await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `doggo`.\n1:6 doggo",
+      "code": "invalid_document_filter",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
+    }
+    "###);
+
+    let (response, code) = index.get_all_documents_raw("?filter=doggo=bernese").await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo=bernese",
+      "code": "invalid_search_filter",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#invalid_search_filter"
+    }
+    "###);
+}
+
 #[actix_rt::test]
 async fn delete_documents_batch() {
     let server = Server::new().await;
@@ -570,3 +675,77 @@ async fn delete_document_by_filter() {
     }
     "###);
 }
+
+#[actix_rt::test]
+async fn fetch_document_by_filter() {
+    let server = Server::new().await;
+    let index = server.index("doggo");
+    index.update_settings_filterable_attributes(json!(["color"])).await;
+    index
+        .add_documents(
+            json!([
+                { "id": 0, "color": "red" },
+                { "id": 1, "color": "blue" },
+                { "id": 2, "color": "blue" },
+                { "id": 3 },
+            ]),
+            Some("id"),
+        )
+        .await;
+    index.wait_task(1).await;
+
+    let (response, code) = index.get_document_by_filter(json!(null)).await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Invalid value type: expected an object, but found null",
+      "code": "bad_request",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#bad_request"
+    }
+    "###);
+
+    let (response, code) = index.get_document_by_filter(json!({ "offset": "doggo" })).await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Invalid value type at `.offset`: expected a positive integer, but found a string: `\"doggo\"`",
+      "code": "invalid_document_offset",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#invalid_document_offset"
+    }
+    "###);
+
+    let (response, code) = index.get_document_by_filter(json!({ "limit": "doggo" })).await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Invalid value type at `.limit`: expected a positive integer, but found a string: `\"doggo\"`",
+      "code": "invalid_document_limit",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#invalid_document_limit"
+    }
+    "###);
+
+    let (response, code) = index.get_document_by_filter(json!({ "fields": "doggo" })).await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Invalid value type at `.fields`: expected an array, but found a string: `\"doggo\"`",
+      "code": "invalid_document_fields",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#invalid_document_fields"
+    }
+    "###);
+
+    let (response, code) = index.get_document_by_filter(json!({ "filter": true })).await;
+    snapshot!(code, @"400 Bad Request");
+    snapshot!(json_string!(response), @r###"
+    {
+      "message": "Invalid syntax for the filter parameter: `expected String, Array, found: true`.",
+      "code": "invalid_document_filter",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#invalid_document_filter"
+    }
+    "###);
+}
diff --git a/meilisearch/tests/documents/get_documents.rs b/meilisearch/tests/documents/get_documents.rs
index 9bc54973e..7e6c5a2a9 100644
--- a/meilisearch/tests/documents/get_documents.rs
+++ b/meilisearch/tests/documents/get_documents.rs
@@ -1,5 +1,6 @@
 use actix_web::test;
 use http::header::ACCEPT_ENCODING;
+use meili_snap::*;
 use serde_json::{json, Value};
 use urlencoding::encode as urlencode;
 
@@ -378,3 +379,164 @@ async fn get_documents_displayed_attributes_is_ignored() {
     assert_eq!(response.as_object().unwrap().keys().count(), 16);
     assert!(response.as_object().unwrap().get("gender").is_some());
 }
+
+#[actix_rt::test]
+async fn get_document_by_filter() {
+    let server = Server::new().await;
+    let index = server.index("doggo");
+    index.update_settings_filterable_attributes(json!(["color"])).await;
+    index
+        .add_documents(
+            json!([
+                { "id": 0, "color": "red" },
+                { "id": 1, "color": "blue" },
+                { "id": 2, "color": "blue" },
+                { "id": 3 },
+            ]),
+            Some("id"),
+        )
+        .await;
+    index.wait_task(1).await;
+
+    let (response, code) = index.get_document_by_filter(json!({})).await;
+    let (response2, code2) = index.get_all_documents_raw("").await;
+    snapshot!(code, @"200 OK");
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "results": [
+        {
+          "id": 0,
+          "color": "red"
+        },
+        {
+          "id": 1,
+          "color": "blue"
+        },
+        {
+          "id": 2,
+          "color": "blue"
+        },
+        {
+          "id": 3
+        }
+      ],
+      "offset": 0,
+      "limit": 20,
+      "total": 4
+    }
+    "###);
+    assert_eq!(code, code2);
+    assert_eq!(response, response2);
+
+    let (response, code) = index.get_document_by_filter(json!({ "filter": "color = blue" })).await;
+    let (response2, code2) = index.get_all_documents_raw("?filter=color=blue").await;
+    snapshot!(code, @"200 OK");
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "results": [
+        {
+          "id": 1,
+          "color": "blue"
+        },
+        {
+          "id": 2,
+          "color": "blue"
+        }
+      ],
+      "offset": 0,
+      "limit": 20,
+      "total": 2
+    }
+    "###);
+    assert_eq!(code, code2);
+    assert_eq!(response, response2);
+
+    let (response, code) = index
+        .get_document_by_filter(json!({ "offset": 1, "limit": 1, "filter": "color != blue" }))
+        .await;
+    let (response2, code2) =
+        index.get_all_documents_raw("?filter=color!=blue&offset=1&limit=1").await;
+    snapshot!(code, @"200 OK");
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "results": [
+        {
+          "id": 3
+        }
+      ],
+      "offset": 1,
+      "limit": 1,
+      "total": 2
+    }
+    "###);
+    assert_eq!(code, code2);
+    assert_eq!(response, response2);
+
+    let (response, code) = index
+        .get_document_by_filter(
+            json!({ "limit": 1, "filter": "color != blue", "fields": ["color"] }),
+        )
+        .await;
+    let (response2, code2) =
+        index.get_all_documents_raw("?limit=1&filter=color!=blue&fields=color").await;
+    snapshot!(code, @"200 OK");
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "results": [
+        {
+          "color": "red"
+        }
+      ],
+      "offset": 0,
+      "limit": 1,
+      "total": 2
+    }
+    "###);
+    assert_eq!(code, code2);
+    assert_eq!(response, response2);
+
+    // Now testing more complex filters that the get route can't represent
+
+    let (response, code) =
+        index.get_document_by_filter(json!({ "filter": [["color = blue", "color = red"]] })).await;
+    snapshot!(code, @"200 OK");
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "results": [
+        {
+          "id": 0,
+          "color": "red"
+        },
+        {
+          "id": 1,
+          "color": "blue"
+        },
+        {
+          "id": 2,
+          "color": "blue"
+        }
+      ],
+      "offset": 0,
+      "limit": 20,
+      "total": 3
+    }
+    "###);
+
+    let (response, code) = index
+        .get_document_by_filter(json!({ "filter": [["color != blue"], "color EXISTS"] }))
+        .await;
+    snapshot!(code, @"200 OK");
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "results": [
+        {
+          "id": 0,
+          "color": "red"
+        }
+      ],
+      "offset": 0,
+      "limit": 20,
+      "total": 1
+    }
+    "###);
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index f02719f92..ad53e79ea 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1032,16 +1032,15 @@ impl Index {
 
     /* documents */
 
-    /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
-    pub fn documents<'t>(
-        &self,
+    /// Returns an iterator over the requested documents. The next item will be an error if a document is missing.
+    pub fn iter_documents<'a, 't: 'a>(
+        &'a self,
         rtxn: &'t RoTxn,
-        ids: impl IntoIterator<Item = DocumentId>,
-    ) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
+        ids: impl IntoIterator<Item = DocumentId> + 'a,
+    ) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
         let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
-        let mut documents = Vec::new();
 
-        for id in ids {
+        Ok(ids.into_iter().map(move |id| {
             if soft_deleted_documents.contains(id) {
                 return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?;
             }
@@ -1049,27 +1048,25 @@ impl Index {
                 .documents
                 .get(rtxn, &BEU32::new(id))?
                 .ok_or(UserError::UnknownInternalDocumentId { document_id: id })?;
-            documents.push((id, kv));
-        }
+            Ok((id, kv))
+        }))
+    }
 
-        Ok(documents)
+    /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
+    pub fn documents<'t>(
+        &self,
+        rtxn: &'t RoTxn,
+        ids: impl IntoIterator<Item = DocumentId>,
+    ) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
+        self.iter_documents(rtxn, ids)?.collect()
     }
 
     /// Returns an iterator over all the documents in the index.
-    pub fn all_documents<'t>(
-        &self,
+    pub fn all_documents<'a, 't: 'a>(
+        &'a self,
         rtxn: &'t RoTxn,
-    ) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
-        let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
-
-        Ok(self
-            .documents
-            .iter(rtxn)?
-            // we cast the BEU32 to a DocumentId
-            .map(|document| document.map(|(id, obkv)| (id.get(), obkv)))
-            .filter(move |document| {
-                document.as_ref().map_or(true, |(id, _)| !soft_deleted_docids.contains(*id))
-            }))
+    ) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
+        self.iter_documents(rtxn, self.documents_ids(rtxn)?)
     }
 
     pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
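
For reference, a minimal sketch of how the new POST /indexes/{uid}/documents/fetch route is exercised end to end, reusing the `get_document_by_filter` test helper introduced above. It assumes an index whose `color` attribute is filterable and the four-document dataset from the tests; the concrete filter and pagination values are illustrative:

    // `filter` accepts either a string expression or nested arrays:
    // inner arrays are ORed, outer elements are ANDed, so this reads
    // (color = blue OR color = red) AND color EXISTS.
    let (response, code) = index
        .get_document_by_filter(json!({
            "filter": [["color = blue", "color = red"], "color EXISTS"],
            "offset": 0,
            "limit": 2,
            "fields": ["id", "color"]
        }))
        .await;
    assert_eq!(code, 200);
    // `response` is a PaginationView over the matching documents, e.g. with
    // the test dataset: { "results": [...], "offset": 0, "limit": 2, "total": 3 }.

The GET route accepts the same offset/limit/fields parameters plus a string-only `filter` in the query string (e.g. `?filter=color=blue&limit=2`); only the POST body form can express the nested-array filters shown here.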