Merge #5384

5384: Get multiple documents by ids r=irevoire a=dureuill # Pull Request ## Related issue Fixes #5345 ## What does this PR do? - Implements [public usage](https://www.notion.so/meilisearch/Get-documents-by-ID-1994b06b651f805ba273e1c6b75ce4d8) - Slightly refactors error messages for the `/similar` route Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2025-07-03 03:47:02 +02:00 · 2025-03-12 17:26:49 +00:00 · 2025-03-12 17:26:49 +00:00 · e2d0ce52ba
commit e2d0ce52ba
parent 995f8962bd 60ff1b19a8
8 changed files with 424 additions and 54 deletions
--- a/crates/meilisearch/src/routes/indexes/documents.rs
+++ b/crates/meilisearch/src/routes/indexes/documents.rs
@ -20,11 +20,13 @@ use meilisearch_types::index_uid::IndexUid;
 use meilisearch_types::milli::update::IndexDocumentsMethod;
 use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
 use meilisearch_types::milli::DocumentId;
+use meilisearch_types::serde_cs::vec::CS;
 use meilisearch_types::star_or::OptionStarOrList;
 use meilisearch_types::tasks::KindWithContent;
 use meilisearch_types::{milli, Document, Index};
 use mime::Mime;
 use once_cell::sync::Lazy;
+use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use tempfile::tempfile;
@ -43,7 +45,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
 use crate::routes::{
    get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
 };
-use crate::search::{parse_filter, RetrieveVectors};
+use crate::search::{parse_filter, ExternalDocumentId, RetrieveVectors};
 use crate::{aggregate_methods, Opt};

 static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
@ -137,6 +139,9 @@ pub struct DocumentsFetchAggregator<Method: AggregateMethod> {
    #[serde(rename = "vector.retrieve_vectors")]
    retrieve_vectors: bool,

+    // maximum size of `ids` array. 0 if always empty or `null`
+    max_document_ids: usize,
+
    // pagination
    #[serde(rename = "pagination.max_limit")]
    max_limit: usize,
@ -149,7 +154,7 @@ pub struct DocumentsFetchAggregator<Method: AggregateMethod> {
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 pub enum DocumentFetchKind {
    PerDocumentId { retrieve_vectors: bool },
-    Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
+    Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool, ids: usize },
 }

 impl<Method: AggregateMethod> DocumentsFetchAggregator<Method> {
@ -161,12 +166,18 @@ impl<Method: AggregateMethod> DocumentsFetchAggregator<Method> {
            }
        };

+        let ids = match query {
+            DocumentFetchKind::Normal { ids, .. } => *ids,
+            DocumentFetchKind::PerDocumentId { .. } => 0,
+        };
+
        Self {
            per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }),
            per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
            max_limit: limit,
            max_offset: offset,
            retrieve_vectors,
+            max_document_ids: ids,

            marker: PhantomData,
        }
@ -185,6 +196,7 @@ impl<Method: AggregateMethod> Aggregate for DocumentsFetchAggregator<Method> {
            retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors,
            max_limit: self.max_limit.max(new.max_limit),
            max_offset: self.max_offset.max(new.max_offset),
+            max_document_ids: self.max_document_ids.max(new.max_document_ids),
            marker: PhantomData,
        })
    }
@ -266,6 +278,7 @@ pub async fn get_document(
            per_filter: false,
            max_limit: 0,
            max_offset: 0,
+            max_document_ids: 0,
            marker: PhantomData,
        },
        &req,
@ -387,6 +400,9 @@ pub struct BrowseQueryGet {
    #[param(default, value_type = Option<bool>)]
    #[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
    retrieve_vectors: Param<bool>,
+    #[param(default, value_type = Option<Vec<String>>)]
+    #[deserr(default, error = DeserrQueryParamError<InvalidDocumentIds>)]
+    ids: Option<CS<String>>,
    #[param(default, value_type = Option<String>, example = "popularity > 1000")]
    #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)]
    filter: Option<String>,
@ -408,6 +424,9 @@ pub struct BrowseQuery {
    #[schema(default, example = true)]
    #[deserr(default, error = DeserrJsonError<InvalidDocumentRetrieveVectors>)]
    retrieve_vectors: bool,
+    #[schema(value_type = Option<Vec<String>>, example = json!(["cody", "finn", "brandy", "gambit"]))]
+    #[deserr(default, error = DeserrJsonError<InvalidDocumentIds>)]
+    ids: Option<Vec<serde_json::Value>>,
    #[schema(default, value_type = Option<Value>, example = "popularity > 1000")]
    #[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
    filter: Option<Value>,
@ -479,6 +498,7 @@ pub async fn documents_by_query_post(
            retrieve_vectors: body.retrieve_vectors,
            max_limit: body.limit,
            max_offset: body.offset,
+            max_document_ids: body.ids.as_ref().map(Vec::len).unwrap_or_default(),
            per_document_id: false,
            marker: PhantomData,
        },
@ -551,7 +571,8 @@ pub async fn get_documents(
 ) -> Result<HttpResponse, ResponseError> {
    debug!(parameters = ?params, "Get documents GET");

-    let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner();
+    let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter, ids } =
+        params.into_inner();

    let filter = match filter {
        Some(f) => match serde_json::from_str(&f) {
@ -561,12 +582,15 @@ pub async fn get_documents(
        None => None,
    };

+    let ids = ids.map(|ids| ids.into_iter().map(Into::into).collect());
+
    let query = BrowseQuery {
        offset: offset.0,
        limit: limit.0,
        fields: fields.merge_star_and_none(),
        retrieve_vectors: retrieve_vectors.0,
        filter,
+        ids,
    };

    analytics.publish(
@ -575,6 +599,7 @@ pub async fn get_documents(
            retrieve_vectors: query.retrieve_vectors,
            max_limit: query.limit,
            max_offset: query.offset,
+            max_document_ids: query.ids.as_ref().map(Vec::len).unwrap_or_default(),
            per_document_id: false,
            marker: PhantomData,
        },
@ -590,15 +615,30 @@ fn documents_by_query(
    query: BrowseQuery,
 ) -> Result<HttpResponse, ResponseError> {
    let index_uid = IndexUid::try_from(index_uid.into_inner())?;
-    let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query;
+    let BrowseQuery { offset, limit, fields, retrieve_vectors, filter, ids } = query;

    let retrieve_vectors = RetrieveVectors::new(retrieve_vectors);

+    let ids = if let Some(ids) = ids {
+        let mut parsed_ids = Vec::with_capacity(ids.len());
+        for (index, id) in ids.into_iter().enumerate() {
+            let id = id.try_into().map_err(|error| {
+                let msg = format!("In `.ids[{index}]`: {error}");
+                ResponseError::from_msg(msg, Code::InvalidDocumentIds)
+            })?;
+            parsed_ids.push(id)
+        }
+        Some(parsed_ids)
+    } else {
+        None
+    };
+
    let index = index_scheduler.index(&index_uid)?;
    let (total, documents) = retrieve_documents(
        &index,
        offset,
        limit,
+        ids,
        filter,
        fields,
        retrieve_vectors,
@ -1451,10 +1491,12 @@ fn some_documents<'a, 't: 'a>(
    }))
 }

+#[allow(clippy::too_many_arguments)]
 fn retrieve_documents<S: AsRef<str>>(
    index: &Index,
    offset: usize,
    limit: usize,
+    ids: Option<Vec<ExternalDocumentId>>,
    filter: Option<Value>,
    attributes_to_retrieve: Option<Vec<S>>,
    retrieve_vectors: RetrieveVectors,
@ -1468,16 +1510,28 @@ fn retrieve_documents<S: AsRef<str>>(
        None
    };

-    let candidates = if let Some(filter) = filter {
-        filter.evaluate(&rtxn, index).map_err(|err| match err {
+    let mut candidates = if let Some(ids) = ids {
+        let external_document_ids = index.external_documents_ids();
+        let mut candidates = RoaringBitmap::new();
+        for id in ids.iter() {
+            let Some(docid) = external_document_ids.get(&rtxn, id)? else {
+                continue;
+            };
+            candidates.insert(docid);
+        }
+        candidates
+    } else {
+        index.documents_ids(&rtxn)?
+    };
+
+    if let Some(filter) = filter {
+        candidates &= filter.evaluate(&rtxn, index).map_err(|err| match err {
            milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                ResponseError::from_msg(err.to_string(), Code::InvalidDocumentFilter)
            }
            e => e.into(),
        })?
-    } else {
-        index.documents_ids(&rtxn)?
-    };
+    }

    let (it, number_of_documents) = {
        let number_of_documents = candidates.len();
--- a/crates/meilisearch/src/routes/indexes/similar.rs
+++ b/crates/meilisearch/src/routes/indexes/similar.rs
@ -5,7 +5,7 @@ use index_scheduler::IndexScheduler;
 use meilisearch_types::deserr::query_params::Param;
 use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
 use meilisearch_types::error::deserr_codes::*;
-use meilisearch_types::error::{ErrorCode as _, ResponseError};
+use meilisearch_types::error::ResponseError;
 use meilisearch_types::index_uid::IndexUid;
 use meilisearch_types::keys::actions;
 use meilisearch_types::serde_cs::vec::CS;
@ -111,7 +111,7 @@ pub async fn similar_get(
 ) -> Result<HttpResponse, ResponseError> {
    let index_uid = IndexUid::try_from(index_uid.into_inner())?;

-    let query = params.0.try_into()?;
+    let query = params.0.into();

    let mut aggregate = SimilarAggregator::<SimilarGET>::from_query(&query);

@ -295,10 +295,8 @@ impl std::convert::TryFrom<String> for RankingScoreThresholdGet {
    }
 }

-impl TryFrom<SimilarQueryGet> for SimilarQuery {
-    type Error = ResponseError;
-
-    fn try_from(
+impl From<SimilarQueryGet> for SimilarQuery {
+    fn from(
        SimilarQueryGet {
            id,
            offset,
@ -311,7 +309,7 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
            embedder,
            ranking_score_threshold,
        }: SimilarQueryGet,
-    ) -> Result<Self, Self::Error> {
+    ) -> Self {
        let filter = match filter {
            Some(f) => match serde_json::from_str(&f) {
                Ok(v) => Some(v),
@ -320,10 +318,8 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
            None => None,
        };

-        Ok(SimilarQuery {
-            id: id.0.try_into().map_err(|code: InvalidSimilarId| {
-                ResponseError::from_msg(code.to_string(), code.error_code())
-            })?,
+        SimilarQuery {
+            id: serde_json::Value::String(id.0),
            offset: offset.0,
            limit: limit.0,
            filter,
@ -333,6 +329,6 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
            show_ranking_score: show_ranking_score.0,
            show_ranking_score_details: show_ranking_score_details.0,
            ranking_score_threshold: ranking_score_threshold.map(|x| x.0),
-        })
+        }
    }
 }