From 1d3c4642a645ae52befdb3098f915224b183bea7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 3 Mar 2025 16:51:34 +0100 Subject: [PATCH 1/7] Don't use Deserr for ExternalDocumentId, instead convert to error afterward --- .../meilisearch/src/routes/indexes/similar.rs | 20 ++++++++----------- crates/meilisearch/src/search/mod.rs | 16 +++++++++------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/similar.rs b/crates/meilisearch/src/routes/indexes/similar.rs index fcd64291e..400be331b 100644 --- a/crates/meilisearch/src/routes/indexes/similar.rs +++ b/crates/meilisearch/src/routes/indexes/similar.rs @@ -5,7 +5,7 @@ use index_scheduler::IndexScheduler; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; -use meilisearch_types::error::{ErrorCode as _, ResponseError}; +use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::keys::actions; use meilisearch_types::serde_cs::vec::CS; @@ -111,7 +111,7 @@ pub async fn similar_get( ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let query = params.0.try_into()?; + let query = params.0.into(); let mut aggregate = SimilarAggregator::::from_query(&query); @@ -295,10 +295,8 @@ impl std::convert::TryFrom for RankingScoreThresholdGet { } } -impl TryFrom for SimilarQuery { - type Error = ResponseError; - - fn try_from( +impl From for SimilarQuery { + fn from( SimilarQueryGet { id, offset, @@ -311,7 +309,7 @@ impl TryFrom for SimilarQuery { embedder, ranking_score_threshold, }: SimilarQueryGet, - ) -> Result { + ) -> Self { let filter = match filter { Some(f) => match serde_json::from_str(&f) { Ok(v) => Some(v), @@ -320,10 +318,8 @@ impl TryFrom for SimilarQuery { None => None, }; - Ok(SimilarQuery { - id: id.0.try_into().map_err(|code: InvalidSimilarId| { - ResponseError::from_msg(code.to_string(), code.error_code()) - })?, + SimilarQuery { + id: serde_json::Value::String(id.0), offset: offset.0, limit: limit.0, filter, @@ -333,6 +329,6 @@ impl TryFrom for SimilarQuery { show_ranking_score: show_ranking_score.0, show_ranking_score_details: show_ranking_score_details.0, ranking_score_threshold: ranking_score_threshold.map(|x| x.0), - }) + } } } diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 565dbccf1..d01466c90 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -635,7 +635,7 @@ impl SearchQueryWithIndex { pub struct SimilarQuery { #[deserr(error = DeserrJsonError)] #[schema(value_type = String)] - pub id: ExternalDocumentId, + pub id: serde_json::Value, #[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] pub offset: usize, #[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError)] @@ -657,8 +657,7 @@ pub struct SimilarQuery { pub ranking_score_threshold: Option, } -#[derive(Debug, Clone, PartialEq, Deserr)] -#[deserr(try_from(Value) = TryFrom::try_from -> InvalidSimilarId)] +#[derive(Debug, Clone, PartialEq)] pub struct ExternalDocumentId(String); impl AsRef for ExternalDocumentId { @@ -674,7 +673,7 @@ impl ExternalDocumentId { } impl TryFrom for ExternalDocumentId { - type Error = InvalidSimilarId; + type Error = milli::UserError; fn try_from(value: String) -> Result { serde_json::Value::String(value).try_into() @@ -682,10 +681,10 @@ impl TryFrom for ExternalDocumentId { } impl TryFrom for ExternalDocumentId { - type Error = InvalidSimilarId; + type Error = milli::UserError; fn try_from(value: Value) -> Result { - Ok(Self(milli::documents::validate_document_id_value(value).map_err(|_| InvalidSimilarId)?)) + Ok(Self(milli::documents::validate_document_id_value(value)?)) } } @@ -1597,6 +1596,11 @@ pub fn perform_similar( ranking_score_threshold, } = query; + let id: ExternalDocumentId = id.try_into().map_err(|error| { + let msg = format!("Invalid value at `.id`: {error}"); + ResponseError::from_msg(msg, Code::InvalidSimilarId) + })?; + // using let-else rather than `?` so that the borrow checker identifies we're always returning here, // preventing a use-after-move let Some(internal_id) = index.external_documents_ids().get(&rtxn, &id)? else { From f292fc9ac0567c40ccf0b85d280c66855c3f5ff1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 3 Mar 2025 11:48:24 +0100 Subject: [PATCH 2/7] Add `ids` parameter to GET documents and POST documents/fetch --- crates/meilisearch-types/src/error.rs | 2 + .../src/routes/indexes/documents.rs | 59 ++++++++++++++++--- 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 5a0451b6c..77c324d97 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -241,6 +241,7 @@ InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ; InvalidVectorsType , InvalidRequest , BAD_REQUEST ; InvalidDocumentId , InvalidRequest , BAD_REQUEST ; +InvalidDocumentIds , InvalidRequest , BAD_REQUEST ; InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ; InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ; InvalidSearchEmbedder , InvalidRequest , BAD_REQUEST ; @@ -383,6 +384,7 @@ UnsupportedMediaType , InvalidRequest , UNSUPPORTED_MEDIA // Experimental features VectorEmbeddingError , InvalidRequest , BAD_REQUEST ; NotFoundSimilarId , InvalidRequest , BAD_REQUEST ; +NotFoundDocumentId , InvalidRequest , BAD_REQUEST ; InvalidDocumentEditionContext , InvalidRequest , BAD_REQUEST ; InvalidDocumentEditionFunctionFilter , InvalidRequest , BAD_REQUEST ; EditDocumentsByFunctionError , InvalidRequest , BAD_REQUEST diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 54f01b4d6..8f063b31c 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -20,11 +20,13 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::DocumentId; +use meilisearch_types::serde_cs::vec::CS; use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::tasks::KindWithContent; use meilisearch_types::{milli, Document, Index}; use mime::Mime; use once_cell::sync::Lazy; +use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; use tempfile::tempfile; @@ -43,7 +45,7 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; -use crate::search::{parse_filter, RetrieveVectors}; +use crate::search::{parse_filter, ExternalDocumentId, RetrieveVectors}; use crate::{aggregate_methods, Opt}; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { @@ -387,6 +389,9 @@ pub struct BrowseQueryGet { #[param(default, value_type = Option)] #[deserr(default, error = DeserrQueryParamError)] retrieve_vectors: Param, + #[param(default, value_type = Option>)] + #[deserr(default, error = DeserrQueryParamError)] + ids: Option>, #[param(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrQueryParamError)] filter: Option, @@ -408,6 +413,9 @@ pub struct BrowseQuery { #[schema(default, example = true)] #[deserr(default, error = DeserrJsonError)] retrieve_vectors: bool, + #[schema(value_type = Option>, example = json!(["cody", "finn", "brandy", "gambit"]))] + #[deserr(default, error = DeserrJsonError)] + ids: Option>, #[schema(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrJsonError)] filter: Option, @@ -551,7 +559,8 @@ pub async fn get_documents( ) -> Result { debug!(parameters = ?params, "Get documents GET"); - let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); + let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter, ids } = + params.into_inner(); let filter = match filter { Some(f) => match serde_json::from_str(&f) { @@ -561,12 +570,15 @@ pub async fn get_documents( None => None, }; + let ids = ids.map(|ids| ids.into_iter().map(Into::into).collect()); + let query = BrowseQuery { offset: offset.0, limit: limit.0, fields: fields.merge_star_and_none(), retrieve_vectors: retrieve_vectors.0, filter, + ids, }; analytics.publish( @@ -590,15 +602,30 @@ fn documents_by_query( query: BrowseQuery, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; + let BrowseQuery { offset, limit, fields, retrieve_vectors, filter, ids } = query; let retrieve_vectors = RetrieveVectors::new(retrieve_vectors); + let ids = if let Some(ids) = ids { + let mut parsed_ids = Vec::with_capacity(ids.len()); + for (index, id) in ids.into_iter().enumerate() { + let id = id.try_into().map_err(|error| { + let msg = format!("In `.ids[{index}]`:{error}"); + ResponseError::from_msg(msg, Code::InvalidDocumentIds) + })?; + parsed_ids.push(id) + } + Some(parsed_ids) + } else { + None + }; + let index = index_scheduler.index(&index_uid)?; let (total, documents) = retrieve_documents( &index, offset, limit, + ids, filter, fields, retrieve_vectors, @@ -1451,10 +1478,12 @@ fn some_documents<'a, 't: 'a>( })) } +#[allow(clippy::too_many_arguments)] fn retrieve_documents>( index: &Index, offset: usize, limit: usize, + ids: Option>, filter: Option, attributes_to_retrieve: Option>, retrieve_vectors: RetrieveVectors, @@ -1468,16 +1497,30 @@ fn retrieve_documents>( None }; - let candidates = if let Some(filter) = filter { - filter.evaluate(&rtxn, index).map_err(|err| match err { + let mut candidates = if let Some(ids) = ids { + let external_document_ids = index.external_documents_ids(); + let mut candidates = RoaringBitmap::new(); + for (index, id) in ids.iter().enumerate() { + let Some(docid) = external_document_ids.get(&rtxn, id)? else { + let error = MeilisearchHttpError::DocumentNotFound(id.clone().into_inner()); + let msg = format!("In `.ids[{index}]`: {error}"); + return Err(ResponseError::from_msg(msg, Code::NotFoundDocumentId)); + }; + candidates.insert(docid); + } + candidates + } else { + index.documents_ids(&rtxn)? + }; + + if let Some(filter) = filter { + candidates &= filter.evaluate(&rtxn, index).map_err(|err| match err { milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { ResponseError::from_msg(err.to_string(), Code::InvalidDocumentFilter) } e => e.into(), })? - } else { - index.documents_ids(&rtxn)? - }; + } let (it, number_of_documents) = { let number_of_documents = candidates.len(); From 21c3b3957edb3b6bdab86fc7a17e12b9bea38758 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 3 Mar 2025 16:53:05 +0100 Subject: [PATCH 3/7] tests: Change get_document_by_filter to fetch_documents --- crates/meilisearch/tests/common/index.rs | 2 +- crates/meilisearch/tests/documents/errors.rs | 18 ++++++++---------- .../tests/documents/get_documents.rs | 15 ++++++--------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/crates/meilisearch/tests/common/index.rs b/crates/meilisearch/tests/common/index.rs index b8c1d2824..529fb0793 100644 --- a/crates/meilisearch/tests/common/index.rs +++ b/crates/meilisearch/tests/common/index.rs @@ -411,7 +411,7 @@ impl Index<'_, State> { self.service.get(url).await } - pub async fn get_document_by_filter(&self, payload: Value) -> (Value, StatusCode) { + pub async fn fetch_documents(&self, payload: Value) -> (Value, StatusCode) { let url = format!("/indexes/{}/documents/fetch", urlencode(self.uid.as_ref())); self.service.post(url, payload).await } diff --git a/crates/meilisearch/tests/documents/errors.rs b/crates/meilisearch/tests/documents/errors.rs index 7b2ca8b5e..51b6cdf4d 100644 --- a/crates/meilisearch/tests/documents/errors.rs +++ b/crates/meilisearch/tests/documents/errors.rs @@ -667,7 +667,7 @@ async fn fetch_document_by_filter() { .await; index.wait_task(task.uid()).await.succeeded(); - let (response, code) = index.get_document_by_filter(json!(null)).await; + let (response, code) = index.fetch_documents(json!(null)).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -678,7 +678,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "offset": "doggo" })).await; + let (response, code) = index.fetch_documents(json!({ "offset": "doggo" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -689,7 +689,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "limit": "doggo" })).await; + let (response, code) = index.fetch_documents(json!({ "limit": "doggo" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -700,7 +700,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "fields": "doggo" })).await; + let (response, code) = index.fetch_documents(json!({ "fields": "doggo" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -711,7 +711,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "filter": true })).await; + let (response, code) = index.fetch_documents(json!({ "filter": true })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -722,7 +722,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "filter": "cool doggo" })).await; + let (response, code) = index.fetch_documents(json!({ "filter": "cool doggo" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -733,8 +733,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = - index.get_document_by_filter(json!({ "filter": "doggo = bernese" })).await; + let (response, code) = index.fetch_documents(json!({ "filter": "doggo = bernese" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -762,8 +761,7 @@ async fn retrieve_vectors() { "###); // FETCH ALL DOCUMENTS BY POST - let (response, _code) = - index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await; + let (response, _code) = index.fetch_documents(json!({ "retrieveVectors": "tamo" })).await; snapshot!(response, @r###" { "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`", diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index 966fccefd..1394e5e8d 100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -371,7 +371,7 @@ async fn get_document_by_filter() { .await; index.wait_task(task.uid()).await.succeeded(); - let (response, code) = index.get_document_by_filter(json!({})).await; + let (response, code) = index.fetch_documents(json!({})).await; let (response2, code2) = index.get_all_documents_raw("").await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" @@ -401,7 +401,7 @@ async fn get_document_by_filter() { assert_eq!(code, code2); assert_eq!(response, response2); - let (response, code) = index.get_document_by_filter(json!({ "filter": "color = blue" })).await; + let (response, code) = index.fetch_documents(json!({ "filter": "color = blue" })).await; let (response2, code2) = index.get_all_documents_raw("?filter=color=blue").await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" @@ -424,9 +424,8 @@ async fn get_document_by_filter() { assert_eq!(code, code2); assert_eq!(response, response2); - let (response, code) = index - .get_document_by_filter(json!({ "offset": 1, "limit": 1, "filter": "color != blue" })) - .await; + let (response, code) = + index.fetch_documents(json!({ "offset": 1, "limit": 1, "filter": "color != blue" })).await; let (response2, code2) = index.get_all_documents_raw("?filter=color!=blue&offset=1&limit=1").await; snapshot!(code, @"200 OK"); @@ -446,9 +445,7 @@ async fn get_document_by_filter() { assert_eq!(response, response2); let (response, code) = index - .get_document_by_filter( - json!({ "limit": 1, "filter": "color != blue", "fields": ["color"] }), - ) + .fetch_documents(json!({ "limit": 1, "filter": "color != blue", "fields": ["color"] })) .await; let (response2, code2) = index.get_all_documents_raw("?limit=1&filter=color!=blue&fields=color").await; @@ -471,7 +468,7 @@ async fn get_document_by_filter() { // Now testing more complex filter that the get route can't represent let (response, code) = - index.get_document_by_filter(json!({ "filter": [["color = blue", "color = red"]] })).await; + index.fetch_documents(json!({ "filter": [["color = blue", "color = red"]] })).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" { From 19c9caed39624a627bb4bdc7a08e0fd1a81cf24d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 3 Mar 2025 16:53:36 +0100 Subject: [PATCH 4/7] Fix tests --- .../tests/documents/get_documents.rs | 315 +++++++++++++++++- crates/meilisearch/tests/similar/errors.rs | 9 +- 2 files changed, 317 insertions(+), 7 deletions(-) diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index 1394e5e8d..2b30d15b1 100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -492,9 +492,8 @@ async fn get_document_by_filter() { } "###); - let (response, code) = index - .get_document_by_filter(json!({ "filter": [["color != blue"], "color EXISTS"] })) - .await; + let (response, code) = + index.fetch_documents(json!({ "filter": [["color != blue"], "color EXISTS"] })).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" { @@ -511,6 +510,316 @@ async fn get_document_by_filter() { "###); } +#[actix_rt::test] +async fn get_document_by_ids() { + let server = Server::new_shared(); + let index = server.unique_index(); + let (task, _code) = index + .add_documents( + json!([ + { "id": 0, "color": "red" }, + { "id": 1, "color": "blue" }, + { "id": 2, "color": "blue" }, + { "id": 3 }, + ]), + Some("id"), + ) + .await; + index.wait_task(task.uid()).await.succeeded(); + + let (response, code) = index + .fetch_documents(json!({ + "ids": ["0", 1, 2, 3] + })) + .await; + let (response2, code2) = index.get_all_documents_raw("?ids=0,1,2,3").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 0, + "color": "red" + }, + { + "id": 1, + "color": "blue" + }, + { + "id": 2, + "color": "blue" + }, + { + "id": 3 + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = index.fetch_documents(json!({ "ids": [2, "1"] })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=2,1").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 1, + "color": "blue" + }, + { + "id": 2, + "color": "blue" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = + index.fetch_documents(json!({ "offset": 1, "limit": 1, "ids": ["0", 0, 3] })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=3,0&offset=1&limit=1").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 3 + } + ], + "offset": 1, + "limit": 1, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = + index.fetch_documents(json!({ "limit": 1, "ids": [0, 3], "fields": ["color"] })).await; + let (response2, code2) = index.get_all_documents_raw("?limit=1&ids=0,3&fields=color").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "color": "red" + } + ], + "offset": 0, + "limit": 1, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + // Now testing more complex requests that the get route can't represent + + let (response, code) = index.fetch_documents(json!({ "ids": [] })).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); +} + +#[actix_rt::test] +async fn get_document_invalid_ids() { + let server = Server::new_shared(); + let index = server.unique_index(); + let (task, _code) = index + .add_documents( + json!([ + { "id": 0, "color": "red" }, + { "id": 1, "color": "blue" }, + { "id": 2, "color": "blue" }, + { "id": 3 }, + ]), + Some("id"), + ) + .await; + index.wait_task(task.uid()).await.succeeded(); + + let (response, code) = index.fetch_documents(json!({"ids": ["0", "illegal/docid"] })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=0,illegal/docid").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "message": "In `.ids[1]`:Document identifier `\"illegal/docid\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "code": "invalid_document_ids", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_ids" + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); +} + +#[actix_rt::test] +async fn get_document_not_found_ids() { + let server = Server::new_shared(); + let index = server.unique_index(); + let (task, _code) = index + .add_documents( + json!([ + { "id": 0, "color": "red" }, + { "id": 1, "color": "blue" }, + { "id": 2, "color": "blue" }, + { "id": 3 }, + ]), + Some("id"), + ) + .await; + index.wait_task(task.uid()).await.succeeded(); + + let (response, code) = index.fetch_documents(json!({"ids": ["0", 3, 42] })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=0,3,42").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "message": "In `.ids[2]`: Document `42` not found.", + "code": "not_found_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#not_found_document_id" + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); +} + +#[actix_rt::test] +async fn get_document_by_ids_and_filter() { + let server = Server::new_shared(); + let index = server.unique_index(); + index.update_settings_filterable_attributes(json!(["color"])).await; + let (task, _code) = index + .add_documents( + json!([ + { "id": 0, "color": "red" }, + { "id": 1, "color": "blue" }, + { "id": 2, "color": "blue" }, + { "id": 3 }, + ]), + Some("id"), + ) + .await; + index.wait_task(task.uid()).await.succeeded(); + + let (response, code) = + index.fetch_documents(json!({"ids": [2], "filter": "color = blue" })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=2&filter=color=blue").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 2, + "color": "blue" + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = index + .fetch_documents( + json!({ "offset": 1, "limit": 1, "ids": [0, 1, 2, 3], "filter": "color != blue" }), + ) + .await; + let (response2, code2) = + index.get_all_documents_raw("?ids=0,1,2,3&filter=color!=blue&offset=1&limit=1").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 3 + } + ], + "offset": 1, + "limit": 1, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = index + .fetch_documents(json!({ "limit": 1, "ids": [0, 1, 2,3], "filter": "color != blue", "fields": ["color"] })) + .await; + let (response2, code2) = + index.get_all_documents_raw("?ids=0,1,2,3&limit=1&filter=color!=blue&fields=color").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "color": "red" + } + ], + "offset": 0, + "limit": 1, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + // Now testing more complex filter that the get route can't represent + + let (response, code) = index + .fetch_documents(json!({ "ids": [0, "2"], "filter": [["color = blue", "color = red"]] })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 0, + "color": "red" + }, + { + "id": 2, + "color": "blue" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let (response, code) = index + .fetch_documents(json!({ "filter": [["color != blue"], "color EXISTS"], "ids": [1, 2, 3] })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); +} + #[actix_rt::test] async fn get_document_with_vectors() { let server = Server::new().await; diff --git a/crates/meilisearch/tests/similar/errors.rs b/crates/meilisearch/tests/similar/errors.rs index 75bd6e46b..5d9b00def 100644 --- a/crates/meilisearch/tests/similar/errors.rs +++ b/crates/meilisearch/tests/similar/errors.rs @@ -55,11 +55,11 @@ async fn similar_bad_id() { snapshot!(code, @"202 Accepted"); server.wait_task(response.uid()).await; - let (response, code) = index.similar_post(json!({"id": ["doggo"]})).await; + let (response, code) = index.similar_post(json!({"id": ["doggo"], "embedder": "manual"})).await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "message": "Invalid value at `.id`: Document identifier `[\"doggo\"]` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_similar_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_similar_id" @@ -145,11 +145,12 @@ async fn similar_invalid_id() { snapshot!(code, @"202 Accepted"); server.wait_task(response.uid()).await; - let (response, code) = index.similar_post(json!({"id": "http://invalid-docid/"})).await; + let (response, code) = + index.similar_post(json!({"id": "http://invalid-docid/", "embedder": "manual"})).await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "message": "Invalid value at `.id`: Document identifier `\"http://invalid-docid/\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_similar_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_similar_id" From 9d9e0d4c54f277f91562e8b8bce8b851aefe4b55 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 10 Mar 2025 11:33:15 +0100 Subject: [PATCH 5/7] Add analytics --- .../meilisearch/src/routes/indexes/documents.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 8f063b31c..1aba81238 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -139,6 +139,9 @@ pub struct DocumentsFetchAggregator { #[serde(rename = "vector.retrieve_vectors")] retrieve_vectors: bool, + // maximum size of `ids` array. 0 if always empty or `null` + max_document_ids: usize, + // pagination #[serde(rename = "pagination.max_limit")] max_limit: usize, @@ -151,7 +154,7 @@ pub struct DocumentsFetchAggregator { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DocumentFetchKind { PerDocumentId { retrieve_vectors: bool }, - Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool, ids: usize }, } impl DocumentsFetchAggregator { @@ -163,12 +166,18 @@ impl DocumentsFetchAggregator { } }; + let ids = match query { + DocumentFetchKind::Normal { ids, .. } => *ids, + DocumentFetchKind::PerDocumentId { .. } => 0, + }; + Self { per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), max_limit: limit, max_offset: offset, retrieve_vectors, + max_document_ids: ids, marker: PhantomData, } @@ -187,6 +196,7 @@ impl Aggregate for DocumentsFetchAggregator { retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, max_limit: self.max_limit.max(new.max_limit), max_offset: self.max_offset.max(new.max_offset), + max_document_ids: self.max_document_ids.max(new.max_document_ids), marker: PhantomData, }) } @@ -268,6 +278,7 @@ pub async fn get_document( per_filter: false, max_limit: 0, max_offset: 0, + max_document_ids: 0, marker: PhantomData, }, &req, @@ -487,6 +498,7 @@ pub async fn documents_by_query_post( retrieve_vectors: body.retrieve_vectors, max_limit: body.limit, max_offset: body.offset, + max_document_ids: body.ids.as_ref().map(Vec::len).unwrap_or_default(), per_document_id: false, marker: PhantomData, }, @@ -587,6 +599,7 @@ pub async fn get_documents( retrieve_vectors: query.retrieve_vectors, max_limit: query.limit, max_offset: query.offset, + max_document_ids: query.ids.as_ref().map(Vec::len).unwrap_or_default(), per_document_id: false, marker: PhantomData, }, From 7df5e3f059748b6d69774853fc2bddeb356f6166 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Mar 2025 11:48:40 +0100 Subject: [PATCH 6/7] Fix error message Co-authored-by: Tamo --- crates/meilisearch/src/routes/indexes/documents.rs | 2 +- crates/meilisearch/tests/documents/get_documents.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 1aba81238..fb173ef3e 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -623,7 +623,7 @@ fn documents_by_query( let mut parsed_ids = Vec::with_capacity(ids.len()); for (index, id) in ids.into_iter().enumerate() { let id = id.try_into().map_err(|error| { - let msg = format!("In `.ids[{index}]`:{error}"); + let msg = format!("In `.ids[{index}]`: {error}"); ResponseError::from_msg(msg, Code::InvalidDocumentIds) })?; parsed_ids.push(id) diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index 2b30d15b1..bcd81043f 100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -658,7 +658,7 @@ async fn get_document_invalid_ids() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" { - "message": "In `.ids[1]`:Document identifier `\"illegal/docid\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "message": "In `.ids[1]`: Document identifier `\"illegal/docid\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_document_ids", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_ids" From 60ff1b19a89b8a7f989c4b757b3b43741d51d958 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Mar 2025 11:50:39 +0100 Subject: [PATCH 7/7] Searching for a document that does not exist no longer raises an error --- crates/meilisearch-types/src/error.rs | 1 - .../src/routes/indexes/documents.rs | 6 ++---- .../tests/documents/get_documents.rs | 20 ++++++++++++++----- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 77c324d97..94ca538fe 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -384,7 +384,6 @@ UnsupportedMediaType , InvalidRequest , UNSUPPORTED_MEDIA // Experimental features VectorEmbeddingError , InvalidRequest , BAD_REQUEST ; NotFoundSimilarId , InvalidRequest , BAD_REQUEST ; -NotFoundDocumentId , InvalidRequest , BAD_REQUEST ; InvalidDocumentEditionContext , InvalidRequest , BAD_REQUEST ; InvalidDocumentEditionFunctionFilter , InvalidRequest , BAD_REQUEST ; EditDocumentsByFunctionError , InvalidRequest , BAD_REQUEST diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index fb173ef3e..50eec46fe 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1513,11 +1513,9 @@ fn retrieve_documents>( let mut candidates = if let Some(ids) = ids { let external_document_ids = index.external_documents_ids(); let mut candidates = RoaringBitmap::new(); - for (index, id) in ids.iter().enumerate() { + for id in ids.iter() { let Some(docid) = external_document_ids.get(&rtxn, id)? else { - let error = MeilisearchHttpError::DocumentNotFound(id.clone().into_inner()); - let msg = format!("In `.ids[{index}]`: {error}"); - return Err(ResponseError::from_msg(msg, Code::NotFoundDocumentId)); + continue; }; candidates.insert(docid); } diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index bcd81043f..f87a18b9f 100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -687,13 +687,23 @@ async fn get_document_not_found_ids() { let (response, code) = index.fetch_documents(json!({"ids": ["0", 3, 42] })).await; let (response2, code2) = index.get_all_documents_raw("?ids=0,3,42").await; - snapshot!(code, @"400 Bad Request"); + // the document with id 42 is not in the results since it doesn't exist + // however, no error is raised + snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" { - "message": "In `.ids[2]`: Document `42` not found.", - "code": "not_found_document_id", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#not_found_document_id" + "results": [ + { + "id": 0, + "color": "red" + }, + { + "id": 3 + } + ], + "offset": 0, + "limit": 20, + "total": 2 } "###); assert_eq!(code, code2);