mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Merge #4649
4649: Don't store the vectors in the documents database r=dureuill a=irevoire # Pull Request ## Related issue Fixes https://github.com/meilisearch/meilisearch/issues/4607 ## What does this PR do? - Ensure that anything falling under `_vectors` is NOT searchable, filterable or sortable - [x] per embedder, add a roaring bitmap of documents that provide "userProvided" embeddings - [x] in the indexing process in extract_vector_points, set the bit corresponding to the document depending on the "userProvided" subfield in the _vectors field. - [x] in the document DB in typed chunks, when writing the _vectors field, remove all keys corresponding to an embedder Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
commit
e9bf4c43a4
60 changed files with 3920 additions and 1126 deletions
|
@ -74,8 +74,8 @@ pub enum DocumentDeletionKind {
|
|||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
pub enum DocumentFetchKind {
|
||||
PerDocumentId,
|
||||
Normal { with_filter: bool, limit: usize, offset: usize },
|
||||
PerDocumentId { retrieve_vectors: bool },
|
||||
Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
|
||||
}
|
||||
|
||||
pub trait Analytics: Sync + Send {
|
||||
|
|
|
@ -622,6 +622,7 @@ pub struct SearchAggregator {
|
|||
// Whether a non-default embedder was specified
|
||||
embedder: bool,
|
||||
hybrid: bool,
|
||||
retrieve_vectors: bool,
|
||||
|
||||
// every time a search is done, we increment the counter linked to the used settings
|
||||
matching_strategy: HashMap<String, usize>,
|
||||
|
@ -662,6 +663,7 @@ impl SearchAggregator {
|
|||
page,
|
||||
hits_per_page,
|
||||
attributes_to_retrieve: _,
|
||||
retrieve_vectors,
|
||||
attributes_to_crop: _,
|
||||
crop_length,
|
||||
attributes_to_highlight: _,
|
||||
|
@ -728,6 +730,7 @@ impl SearchAggregator {
|
|||
if let Some(ref vector) = vector {
|
||||
ret.max_vector_size = vector.len();
|
||||
}
|
||||
ret.retrieve_vectors |= retrieve_vectors;
|
||||
|
||||
if query.is_finite_pagination() {
|
||||
let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT);
|
||||
|
@ -803,6 +806,7 @@ impl SearchAggregator {
|
|||
attributes_to_search_on_total_number_of_uses,
|
||||
max_terms_number,
|
||||
max_vector_size,
|
||||
retrieve_vectors,
|
||||
matching_strategy,
|
||||
max_limit,
|
||||
max_offset,
|
||||
|
@ -873,6 +877,7 @@ impl SearchAggregator {
|
|||
|
||||
// vector
|
||||
self.max_vector_size = self.max_vector_size.max(max_vector_size);
|
||||
self.retrieve_vectors |= retrieve_vectors;
|
||||
self.semantic_ratio |= semantic_ratio;
|
||||
self.hybrid |= hybrid;
|
||||
self.embedder |= embedder;
|
||||
|
@ -929,6 +934,7 @@ impl SearchAggregator {
|
|||
attributes_to_search_on_total_number_of_uses,
|
||||
max_terms_number,
|
||||
max_vector_size,
|
||||
retrieve_vectors,
|
||||
matching_strategy,
|
||||
max_limit,
|
||||
max_offset,
|
||||
|
@ -991,6 +997,7 @@ impl SearchAggregator {
|
|||
},
|
||||
"vector": {
|
||||
"max_vector_size": max_vector_size,
|
||||
"retrieve_vectors": retrieve_vectors,
|
||||
},
|
||||
"hybrid": {
|
||||
"enabled": hybrid,
|
||||
|
@ -1079,6 +1086,7 @@ impl MultiSearchAggregator {
|
|||
page: _,
|
||||
hits_per_page: _,
|
||||
attributes_to_retrieve: _,
|
||||
retrieve_vectors: _,
|
||||
attributes_to_crop: _,
|
||||
crop_length: _,
|
||||
attributes_to_highlight: _,
|
||||
|
@ -1534,6 +1542,9 @@ pub struct DocumentsFetchAggregator {
|
|||
// if a filter was used
|
||||
per_filter: bool,
|
||||
|
||||
#[serde(rename = "vector.retrieve_vectors")]
|
||||
retrieve_vectors: bool,
|
||||
|
||||
// pagination
|
||||
#[serde(rename = "pagination.max_limit")]
|
||||
max_limit: usize,
|
||||
|
@ -1543,18 +1554,21 @@ pub struct DocumentsFetchAggregator {
|
|||
|
||||
impl DocumentsFetchAggregator {
|
||||
pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self {
|
||||
let (limit, offset) = match query {
|
||||
DocumentFetchKind::PerDocumentId => (1, 0),
|
||||
DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset),
|
||||
let (limit, offset, retrieve_vectors) = match query {
|
||||
DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors),
|
||||
DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => {
|
||||
(*limit, *offset, *retrieve_vectors)
|
||||
}
|
||||
};
|
||||
Self {
|
||||
timestamp: Some(OffsetDateTime::now_utc()),
|
||||
user_agents: extract_user_agents(request).into_iter().collect(),
|
||||
total_received: 1,
|
||||
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId),
|
||||
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }),
|
||||
per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
|
||||
max_limit: limit,
|
||||
max_offset: offset,
|
||||
retrieve_vectors,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1568,6 +1582,7 @@ impl DocumentsFetchAggregator {
|
|||
per_filter,
|
||||
max_limit,
|
||||
max_offset,
|
||||
retrieve_vectors,
|
||||
} = other;
|
||||
|
||||
if self.timestamp.is_none() {
|
||||
|
@ -1583,6 +1598,8 @@ impl DocumentsFetchAggregator {
|
|||
|
||||
self.max_limit = self.max_limit.max(max_limit);
|
||||
self.max_offset = self.max_offset.max(max_offset);
|
||||
|
||||
self.retrieve_vectors |= retrieve_vectors;
|
||||
}
|
||||
|
||||
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
|
||||
|
@ -1623,6 +1640,7 @@ pub struct SimilarAggregator {
|
|||
|
||||
// Whether a non-default embedder was specified
|
||||
embedder: bool,
|
||||
retrieve_vectors: bool,
|
||||
|
||||
// pagination
|
||||
max_limit: usize,
|
||||
|
@ -1646,6 +1664,7 @@ impl SimilarAggregator {
|
|||
offset,
|
||||
limit,
|
||||
attributes_to_retrieve: _,
|
||||
retrieve_vectors,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
filter,
|
||||
|
@ -1690,6 +1709,7 @@ impl SimilarAggregator {
|
|||
ret.ranking_score_threshold = ranking_score_threshold.is_some();
|
||||
|
||||
ret.embedder = embedder.is_some();
|
||||
ret.retrieve_vectors = *retrieve_vectors;
|
||||
|
||||
ret
|
||||
}
|
||||
|
@ -1722,6 +1742,7 @@ impl SimilarAggregator {
|
|||
show_ranking_score_details,
|
||||
embedder,
|
||||
ranking_score_threshold,
|
||||
retrieve_vectors,
|
||||
} = other;
|
||||
|
||||
if self.timestamp.is_none() {
|
||||
|
@ -1751,6 +1772,7 @@ impl SimilarAggregator {
|
|||
}
|
||||
|
||||
self.embedder |= embedder;
|
||||
self.retrieve_vectors |= retrieve_vectors;
|
||||
|
||||
// pagination
|
||||
self.max_limit = self.max_limit.max(max_limit);
|
||||
|
@ -1785,6 +1807,7 @@ impl SimilarAggregator {
|
|||
show_ranking_score_details,
|
||||
embedder,
|
||||
ranking_score_threshold,
|
||||
retrieve_vectors,
|
||||
} = self;
|
||||
|
||||
if total_received == 0 {
|
||||
|
@ -1811,6 +1834,9 @@ impl SimilarAggregator {
|
|||
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
|
||||
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
|
||||
},
|
||||
"vector": {
|
||||
"retrieve_vectors": retrieve_vectors,
|
||||
},
|
||||
"hybrid": {
|
||||
"embedder": embedder,
|
||||
},
|
||||
|
|
|
@ -16,6 +16,7 @@ use meilisearch_types::error::{Code, ResponseError};
|
|||
use meilisearch_types::heed::RoTxn;
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::milli::update::IndexDocumentsMethod;
|
||||
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
|
||||
use meilisearch_types::milli::DocumentId;
|
||||
use meilisearch_types::star_or::OptionStarOrList;
|
||||
use meilisearch_types::tasks::KindWithContent;
|
||||
|
@ -39,7 +40,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
|
|||
use crate::routes::{
|
||||
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
|
||||
};
|
||||
use crate::search::parse_filter;
|
||||
use crate::search::{parse_filter, RetrieveVectors};
|
||||
use crate::Opt;
|
||||
|
||||
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
|
||||
|
@ -94,6 +95,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
|||
pub struct GetDocument {
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
|
||||
fields: OptionStarOrList<String>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
|
||||
retrieve_vectors: Param<bool>,
|
||||
}
|
||||
|
||||
pub async fn get_document(
|
||||
|
@ -107,13 +110,20 @@ pub async fn get_document(
|
|||
debug!(parameters = ?params, "Get document");
|
||||
let index_uid = IndexUid::try_from(index_uid)?;
|
||||
|
||||
analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req);
|
||||
|
||||
let GetDocument { fields } = params.into_inner();
|
||||
let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner();
|
||||
let attributes_to_retrieve = fields.merge_star_and_none();
|
||||
|
||||
let features = index_scheduler.features();
|
||||
let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
|
||||
|
||||
analytics.get_fetch_documents(
|
||||
&DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 },
|
||||
&req,
|
||||
);
|
||||
|
||||
let index = index_scheduler.index(&index_uid)?;
|
||||
let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?;
|
||||
let document =
|
||||
retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?;
|
||||
debug!(returns = ?document, "Get document");
|
||||
Ok(HttpResponse::Ok().json(document))
|
||||
}
|
||||
|
@ -153,6 +163,8 @@ pub struct BrowseQueryGet {
|
|||
limit: Param<usize>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
|
||||
fields: OptionStarOrList<String>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
|
||||
retrieve_vectors: Param<bool>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)]
|
||||
filter: Option<String>,
|
||||
}
|
||||
|
@ -166,6 +178,8 @@ pub struct BrowseQuery {
|
|||
limit: usize,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)]
|
||||
fields: Option<Vec<String>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidDocumentRetrieveVectors>)]
|
||||
retrieve_vectors: bool,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
|
||||
filter: Option<Value>,
|
||||
}
|
||||
|
@ -185,6 +199,7 @@ pub async fn documents_by_query_post(
|
|||
with_filter: body.filter.is_some(),
|
||||
limit: body.limit,
|
||||
offset: body.offset,
|
||||
retrieve_vectors: body.retrieve_vectors,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
|
@ -201,7 +216,7 @@ pub async fn get_documents(
|
|||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?params, "Get documents GET");
|
||||
|
||||
let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner();
|
||||
let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner();
|
||||
|
||||
let filter = match filter {
|
||||
Some(f) => match serde_json::from_str(&f) {
|
||||
|
@ -215,6 +230,7 @@ pub async fn get_documents(
|
|||
offset: offset.0,
|
||||
limit: limit.0,
|
||||
fields: fields.merge_star_and_none(),
|
||||
retrieve_vectors: retrieve_vectors.0,
|
||||
filter,
|
||||
};
|
||||
|
||||
|
@ -223,6 +239,7 @@ pub async fn get_documents(
|
|||
with_filter: query.filter.is_some(),
|
||||
limit: query.limit,
|
||||
offset: query.offset,
|
||||
retrieve_vectors: query.retrieve_vectors,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
|
@ -236,10 +253,14 @@ fn documents_by_query(
|
|||
query: BrowseQuery,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
let BrowseQuery { offset, limit, fields, filter } = query;
|
||||
let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query;
|
||||
|
||||
let features = index_scheduler.features();
|
||||
let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?;
|
||||
|
||||
let index = index_scheduler.index(&index_uid)?;
|
||||
let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?;
|
||||
let (total, documents) =
|
||||
retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?;
|
||||
|
||||
let ret = PaginationView::new(offset, limit, total as usize, documents);
|
||||
|
||||
|
@ -579,13 +600,44 @@ fn some_documents<'a, 't: 'a>(
|
|||
index: &'a Index,
|
||||
rtxn: &'t RoTxn,
|
||||
doc_ids: impl IntoIterator<Item = DocumentId> + 'a,
|
||||
retrieve_vectors: RetrieveVectors,
|
||||
) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> {
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||||
let embedding_configs = index.embedding_configs(rtxn)?;
|
||||
|
||||
Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| {
|
||||
ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> {
|
||||
Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?)
|
||||
ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> {
|
||||
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
|
||||
match retrieve_vectors {
|
||||
RetrieveVectors::Ignore => {}
|
||||
RetrieveVectors::Hide => {
|
||||
document.remove("_vectors");
|
||||
}
|
||||
RetrieveVectors::Retrieve => {
|
||||
let mut vectors = match document.remove("_vectors") {
|
||||
Some(Value::Object(map)) => map,
|
||||
_ => Default::default(),
|
||||
};
|
||||
for (name, vector) in index.embeddings(rtxn, key)? {
|
||||
let user_provided = embedding_configs
|
||||
.iter()
|
||||
.find(|conf| conf.name == name)
|
||||
.is_some_and(|conf| conf.user_provided.contains(key));
|
||||
let embeddings = ExplicitVectors {
|
||||
embeddings: Some(vector.into()),
|
||||
regenerate: !user_provided,
|
||||
};
|
||||
vectors.insert(
|
||||
name,
|
||||
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
|
||||
);
|
||||
}
|
||||
document.insert("_vectors".into(), vectors.into());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(document)
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
@ -596,6 +648,7 @@ fn retrieve_documents<S: AsRef<str>>(
|
|||
limit: usize,
|
||||
filter: Option<Value>,
|
||||
attributes_to_retrieve: Option<Vec<S>>,
|
||||
retrieve_vectors: RetrieveVectors,
|
||||
) -> Result<(u64, Vec<Document>), ResponseError> {
|
||||
let rtxn = index.read_txn()?;
|
||||
let filter = &filter;
|
||||
|
@ -620,53 +673,57 @@ fn retrieve_documents<S: AsRef<str>>(
|
|||
let (it, number_of_documents) = {
|
||||
let number_of_documents = candidates.len();
|
||||
(
|
||||
some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?,
|
||||
some_documents(
|
||||
index,
|
||||
&rtxn,
|
||||
candidates.into_iter().skip(offset).take(limit),
|
||||
retrieve_vectors,
|
||||
)?,
|
||||
number_of_documents,
|
||||
)
|
||||
};
|
||||
|
||||
let documents: Result<Vec<_>, ResponseError> = it
|
||||
let documents: Vec<_> = it
|
||||
.map(|document| {
|
||||
Ok(match &attributes_to_retrieve {
|
||||
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
|
||||
&document?,
|
||||
attributes_to_retrieve.iter().map(|s| s.as_ref()),
|
||||
attributes_to_retrieve.iter().map(|s| s.as_ref()).chain(
|
||||
(retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"),
|
||||
),
|
||||
),
|
||||
None => document?,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
.collect::<Result<_, ResponseError>>()?;
|
||||
|
||||
Ok((number_of_documents, documents?))
|
||||
Ok((number_of_documents, documents))
|
||||
}
|
||||
|
||||
fn retrieve_document<S: AsRef<str>>(
|
||||
index: &Index,
|
||||
doc_id: &str,
|
||||
attributes_to_retrieve: Option<Vec<S>>,
|
||||
retrieve_vectors: RetrieveVectors,
|
||||
) -> Result<Document, ResponseError> {
|
||||
let txn = index.read_txn()?;
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(&txn)?;
|
||||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||||
|
||||
let internal_id = index
|
||||
.external_documents_ids()
|
||||
.get(&txn, doc_id)?
|
||||
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
|
||||
|
||||
let document = index
|
||||
.documents(&txn, std::iter::once(internal_id))?
|
||||
.into_iter()
|
||||
let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)?
|
||||
.next()
|
||||
.map(|(_, d)| d)
|
||||
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
|
||||
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??;
|
||||
|
||||
let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
|
||||
let document = match &attributes_to_retrieve {
|
||||
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
|
||||
&document,
|
||||
attributes_to_retrieve.iter().map(|s| s.as_ref()),
|
||||
attributes_to_retrieve
|
||||
.iter()
|
||||
.map(|s| s.as_ref())
|
||||
.chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")),
|
||||
),
|
||||
None => document,
|
||||
};
|
||||
|
|
|
@ -115,6 +115,7 @@ impl From<FacetSearchQuery> for SearchQuery {
|
|||
page: None,
|
||||
hits_per_page: None,
|
||||
attributes_to_retrieve: None,
|
||||
retrieve_vectors: false,
|
||||
attributes_to_crop: None,
|
||||
crop_length: DEFAULT_CROP_LENGTH(),
|
||||
attributes_to_highlight: None,
|
||||
|
|
|
@ -20,9 +20,9 @@ use crate::extractors::sequential_extractor::SeqHandler;
|
|||
use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
|
||||
use crate::search::{
|
||||
add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
|
||||
SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
|
||||
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
|
||||
DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
|
||||
RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH,
|
||||
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
|
||||
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
|
||||
};
|
||||
use crate::search_queue::SearchQueue;
|
||||
|
||||
|
@ -51,6 +51,8 @@ pub struct SearchQueryGet {
|
|||
hits_per_page: Option<Param<usize>>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToRetrieve>)]
|
||||
attributes_to_retrieve: Option<CS<String>>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSearchRetrieveVectors>)]
|
||||
retrieve_vectors: Param<bool>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToCrop>)]
|
||||
attributes_to_crop: Option<CS<String>>,
|
||||
#[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError<InvalidSearchCropLength>)]
|
||||
|
@ -153,6 +155,7 @@ impl From<SearchQueryGet> for SearchQuery {
|
|||
page: other.page.as_deref().copied(),
|
||||
hits_per_page: other.hits_per_page.as_deref().copied(),
|
||||
attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()),
|
||||
retrieve_vectors: other.retrieve_vectors.0,
|
||||
attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()),
|
||||
crop_length: other.crop_length.0,
|
||||
attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()),
|
||||
|
@ -222,10 +225,12 @@ pub async fn search_with_url_query(
|
|||
let features = index_scheduler.features();
|
||||
|
||||
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
|
||||
|
||||
let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?;
|
||||
let _permit = search_queue.try_get_search_permit().await?;
|
||||
let search_result =
|
||||
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?;
|
||||
let search_result = tokio::task::spawn_blocking(move || {
|
||||
perform_search(&index, query, search_kind, retrieve_vector)
|
||||
})
|
||||
.await?;
|
||||
if let Ok(ref search_result) = search_result {
|
||||
aggregate.succeed(search_result);
|
||||
}
|
||||
|
@ -262,10 +267,13 @@ pub async fn search_with_post(
|
|||
let features = index_scheduler.features();
|
||||
|
||||
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
|
||||
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
|
||||
|
||||
let _permit = search_queue.try_get_search_permit().await?;
|
||||
let search_result =
|
||||
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?;
|
||||
let search_result = tokio::task::spawn_blocking(move || {
|
||||
perform_search(&index, query, search_kind, retrieve_vectors)
|
||||
})
|
||||
.await?;
|
||||
if let Ok(ref search_result) = search_result {
|
||||
aggregate.succeed(search_result);
|
||||
if search_result.degraded {
|
||||
|
@ -287,11 +295,10 @@ pub fn search_kind(
|
|||
features: RoFeatures,
|
||||
) -> Result<SearchKind, ResponseError> {
|
||||
if query.vector.is_some() {
|
||||
features.check_vector("Passing `vector` as a query parameter")?;
|
||||
features.check_vector("Passing `vector` as a parameter")?;
|
||||
}
|
||||
|
||||
if query.hybrid.is_some() {
|
||||
features.check_vector("Passing `hybrid` as a query parameter")?;
|
||||
features.check_vector("Passing `hybrid` as a parameter")?;
|
||||
}
|
||||
|
||||
// regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing
|
||||
|
|
|
@ -4,11 +4,7 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter};
|
|||
use index_scheduler::IndexScheduler;
|
||||
use meilisearch_types::deserr::query_params::Param;
|
||||
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
|
||||
use meilisearch_types::error::deserr_codes::{
|
||||
InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId,
|
||||
InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarRankingScoreThreshold,
|
||||
InvalidSimilarShowRankingScore, InvalidSimilarShowRankingScoreDetails,
|
||||
};
|
||||
use meilisearch_types::error::deserr_codes::*;
|
||||
use meilisearch_types::error::{ErrorCode as _, ResponseError};
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::keys::actions;
|
||||
|
@ -21,8 +17,8 @@ use crate::analytics::{Analytics, SimilarAggregator};
|
|||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::search::{
|
||||
add_search_rules, perform_similar, RankingScoreThresholdSimilar, SearchKind, SimilarQuery,
|
||||
SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
||||
add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind,
|
||||
SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
||||
};
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
|
@ -97,6 +93,8 @@ async fn similar(
|
|||
|
||||
features.check_vector("Using the similar API")?;
|
||||
|
||||
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
|
||||
|
||||
// Tenant token search_rules.
|
||||
if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) {
|
||||
add_search_rules(&mut query.filter, search_rules);
|
||||
|
@ -107,8 +105,10 @@ async fn similar(
|
|||
let (embedder_name, embedder) =
|
||||
SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?;
|
||||
|
||||
tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder))
|
||||
.await?
|
||||
tokio::task::spawn_blocking(move || {
|
||||
perform_similar(&index, query, embedder_name, embedder, retrieve_vectors)
|
||||
})
|
||||
.await?
|
||||
}
|
||||
|
||||
#[derive(Debug, deserr::Deserr)]
|
||||
|
@ -122,6 +122,8 @@ pub struct SimilarQueryGet {
|
|||
limit: Param<usize>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarAttributesToRetrieve>)]
|
||||
attributes_to_retrieve: Option<CS<String>>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarRetrieveVectors>)]
|
||||
retrieve_vectors: Param<bool>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarFilter>)]
|
||||
filter: Option<String>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScore>)]
|
||||
|
@ -156,6 +158,7 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
|
|||
offset,
|
||||
limit,
|
||||
attributes_to_retrieve,
|
||||
retrieve_vectors,
|
||||
filter,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
|
@ -180,6 +183,7 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
|
|||
filter,
|
||||
embedder,
|
||||
attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()),
|
||||
retrieve_vectors: retrieve_vectors.0,
|
||||
show_ranking_score: show_ranking_score.0,
|
||||
show_ranking_score_details: show_ranking_score_details.0,
|
||||
ranking_score_threshold: ranking_score_threshold.map(|x| x.0),
|
||||
|
|
|
@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
|||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::routes::indexes::search::search_kind;
|
||||
use crate::search::{
|
||||
add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex,
|
||||
add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex,
|
||||
};
|
||||
use crate::search_queue::SearchQueue;
|
||||
|
||||
|
@ -83,11 +83,14 @@ pub async fn multi_search_with_post(
|
|||
|
||||
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)
|
||||
.with_index(query_index)?;
|
||||
let retrieve_vector =
|
||||
RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?;
|
||||
|
||||
let search_result =
|
||||
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind))
|
||||
.await
|
||||
.with_index(query_index)?;
|
||||
let search_result = tokio::task::spawn_blocking(move || {
|
||||
perform_search(&index, query, search_kind, retrieve_vector)
|
||||
})
|
||||
.await
|
||||
.with_index(query_index)?;
|
||||
|
||||
search_results.push(SearchResultWithIndex {
|
||||
index_uid: index_uid.into_inner(),
|
||||
|
|
|
@ -15,6 +15,7 @@ use meilisearch_types::error::{Code, ResponseError};
|
|||
use meilisearch_types::heed::RoTxn;
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
|
||||
use meilisearch_types::milli::vector::Embedder;
|
||||
use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget};
|
||||
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
|
||||
|
@ -59,6 +60,8 @@ pub struct SearchQuery {
|
|||
pub hits_per_page: Option<usize>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
|
||||
pub attributes_to_retrieve: Option<BTreeSet<String>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
|
||||
pub retrieve_vectors: bool,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
|
||||
pub attributes_to_crop: Option<Vec<String>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
|
||||
|
@ -141,6 +144,7 @@ impl fmt::Debug for SearchQuery {
|
|||
page,
|
||||
hits_per_page,
|
||||
attributes_to_retrieve,
|
||||
retrieve_vectors,
|
||||
attributes_to_crop,
|
||||
crop_length,
|
||||
attributes_to_highlight,
|
||||
|
@ -173,6 +177,9 @@ impl fmt::Debug for SearchQuery {
|
|||
if let Some(q) = q {
|
||||
debug.field("q", &q);
|
||||
}
|
||||
if *retrieve_vectors {
|
||||
debug.field("retrieve_vectors", &retrieve_vectors);
|
||||
}
|
||||
if let Some(v) = vector {
|
||||
if v.len() < 10 {
|
||||
debug.field("vector", &v);
|
||||
|
@ -370,6 +377,8 @@ pub struct SearchQueryWithIndex {
|
|||
pub hits_per_page: Option<usize>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
|
||||
pub attributes_to_retrieve: Option<BTreeSet<String>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
|
||||
pub retrieve_vectors: bool,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
|
||||
pub attributes_to_crop: Option<Vec<String>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
|
||||
|
@ -413,6 +422,7 @@ impl SearchQueryWithIndex {
|
|||
page,
|
||||
hits_per_page,
|
||||
attributes_to_retrieve,
|
||||
retrieve_vectors,
|
||||
attributes_to_crop,
|
||||
crop_length,
|
||||
attributes_to_highlight,
|
||||
|
@ -440,6 +450,7 @@ impl SearchQueryWithIndex {
|
|||
page,
|
||||
hits_per_page,
|
||||
attributes_to_retrieve,
|
||||
retrieve_vectors,
|
||||
attributes_to_crop,
|
||||
crop_length,
|
||||
attributes_to_highlight,
|
||||
|
@ -478,6 +489,8 @@ pub struct SimilarQuery {
|
|||
pub embedder: Option<String>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSimilarAttributesToRetrieve>)]
|
||||
pub attributes_to_retrieve: Option<BTreeSet<String>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSimilarRetrieveVectors>)]
|
||||
pub retrieve_vectors: bool,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScore>, default)]
|
||||
pub show_ranking_score: bool,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScoreDetails>, default)]
|
||||
|
@ -810,6 +823,7 @@ pub fn perform_search(
|
|||
index: &Index,
|
||||
query: SearchQuery,
|
||||
search_kind: SearchKind,
|
||||
retrieve_vectors: RetrieveVectors,
|
||||
) -> Result<SearchResult, MeilisearchHttpError> {
|
||||
let before_search = Instant::now();
|
||||
let rtxn = index.read_txn()?;
|
||||
|
@ -847,6 +861,8 @@ pub fn perform_search(
|
|||
page,
|
||||
hits_per_page,
|
||||
attributes_to_retrieve,
|
||||
// use the enum passed as parameter
|
||||
retrieve_vectors: _,
|
||||
attributes_to_crop,
|
||||
crop_length,
|
||||
attributes_to_highlight,
|
||||
|
@ -870,6 +886,7 @@ pub fn perform_search(
|
|||
|
||||
let format = AttributesFormat {
|
||||
attributes_to_retrieve,
|
||||
retrieve_vectors,
|
||||
attributes_to_highlight,
|
||||
attributes_to_crop,
|
||||
crop_length,
|
||||
|
@ -953,6 +970,7 @@ pub fn perform_search(
|
|||
|
||||
struct AttributesFormat {
|
||||
attributes_to_retrieve: Option<BTreeSet<String>>,
|
||||
retrieve_vectors: RetrieveVectors,
|
||||
attributes_to_highlight: Option<HashSet<String>>,
|
||||
attributes_to_crop: Option<Vec<String>>,
|
||||
crop_length: usize,
|
||||
|
@ -965,6 +983,36 @@ struct AttributesFormat {
|
|||
show_ranking_score_details: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum RetrieveVectors {
|
||||
/// Do not touch the `_vectors` field
|
||||
///
|
||||
/// this is the behavior when the vectorStore feature is disabled
|
||||
Ignore,
|
||||
/// Remove the `_vectors` field
|
||||
///
|
||||
/// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false`
|
||||
Hide,
|
||||
/// Retrieve vectors from the DB and merge them into the `_vectors` field
|
||||
///
|
||||
/// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true`
|
||||
Retrieve,
|
||||
}
|
||||
|
||||
impl RetrieveVectors {
|
||||
pub fn new(
|
||||
retrieve_vector: bool,
|
||||
features: index_scheduler::RoFeatures,
|
||||
) -> Result<Self, index_scheduler::Error> {
|
||||
match (retrieve_vector, features.check_vector("Passing `retrieveVectors` as a parameter")) {
|
||||
(true, Ok(())) => Ok(Self::Retrieve),
|
||||
(true, Err(error)) => Err(error),
|
||||
(false, Ok(())) => Ok(Self::Hide),
|
||||
(false, Err(_)) => Ok(Self::Ignore),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn make_hits(
|
||||
index: &Index,
|
||||
rtxn: &RoTxn<'_>,
|
||||
|
@ -974,10 +1022,32 @@ fn make_hits(
|
|||
document_scores: Vec<Vec<ScoreDetails>>,
|
||||
) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
|
||||
let fields_ids_map = index.fields_ids_map(rtxn).unwrap();
|
||||
let displayed_ids = index
|
||||
.displayed_fields_ids(rtxn)?
|
||||
.map(|fields| fields.into_iter().collect::<BTreeSet<_>>())
|
||||
.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
|
||||
let displayed_ids =
|
||||
index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>());
|
||||
|
||||
let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
|
||||
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
|
||||
(None, _) => false,
|
||||
// displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
|
||||
(Some(_), None) => true,
|
||||
// displayed_ids is a finit list, so hide if `_vectors` is not part of it
|
||||
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
|
||||
};
|
||||
|
||||
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
|
||||
if vectors_is_hidden {
|
||||
RetrieveVectors::Hide
|
||||
} else {
|
||||
RetrieveVectors::Retrieve
|
||||
}
|
||||
} else {
|
||||
format.retrieve_vectors
|
||||
};
|
||||
|
||||
let displayed_ids =
|
||||
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
|
||||
let fids = |attrs: &BTreeSet<String>| {
|
||||
let mut ids = BTreeSet::new();
|
||||
for attr in attrs {
|
||||
|
@ -1000,6 +1070,7 @@ fn make_hits(
|
|||
.intersection(&displayed_ids)
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
|
||||
let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
|
||||
let formatted_options = compute_formatted_options(
|
||||
|
@ -1033,18 +1104,48 @@ fn make_hits(
|
|||
formatter_builder.highlight_prefix(format.highlight_pre_tag);
|
||||
formatter_builder.highlight_suffix(format.highlight_post_tag);
|
||||
let mut documents = Vec::new();
|
||||
let embedding_configs = index.embedding_configs(rtxn)?;
|
||||
let documents_iter = index.documents(rtxn, documents_ids)?;
|
||||
for ((_id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
|
||||
for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
|
||||
// First generate a document with all the displayed fields
|
||||
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;
|
||||
|
||||
let add_vectors_fid =
|
||||
vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve);
|
||||
|
||||
// select the attributes to retrieve
|
||||
let attributes_to_retrieve = to_retrieve_ids
|
||||
.iter()
|
||||
// skip the vectors_fid if RetrieveVectors::Hide
|
||||
.filter(|fid| match vectors_fid {
|
||||
Some(vectors_fid) => {
|
||||
!(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
|
||||
}
|
||||
None => true,
|
||||
})
|
||||
// need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve`
|
||||
.chain(add_vectors_fid.iter())
|
||||
.map(|&fid| fields_ids_map.name(fid).expect("Missing field name"));
|
||||
let mut document =
|
||||
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
|
||||
|
||||
if retrieve_vectors == RetrieveVectors::Retrieve {
|
||||
let mut vectors = match document.remove("_vectors") {
|
||||
Some(Value::Object(map)) => map,
|
||||
_ => Default::default(),
|
||||
};
|
||||
for (name, vector) in index.embeddings(rtxn, id)? {
|
||||
let user_provided = embedding_configs
|
||||
.iter()
|
||||
.find(|conf| conf.name == name)
|
||||
.is_some_and(|conf| conf.user_provided.contains(id));
|
||||
let embeddings =
|
||||
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
|
||||
vectors.insert(name, serde_json::to_value(embeddings)?);
|
||||
}
|
||||
document.insert("_vectors".into(), vectors.into());
|
||||
}
|
||||
|
||||
let (matches_position, formatted) = format_fields(
|
||||
&displayed_document,
|
||||
&fields_ids_map,
|
||||
|
@ -1114,6 +1215,7 @@ pub fn perform_similar(
|
|||
query: SimilarQuery,
|
||||
embedder_name: String,
|
||||
embedder: Arc<Embedder>,
|
||||
retrieve_vectors: RetrieveVectors,
|
||||
) -> Result<SimilarResult, ResponseError> {
|
||||
let before_search = Instant::now();
|
||||
let rtxn = index.read_txn()?;
|
||||
|
@ -1125,6 +1227,7 @@ pub fn perform_similar(
|
|||
filter: _,
|
||||
embedder: _,
|
||||
attributes_to_retrieve,
|
||||
retrieve_vectors: _,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
ranking_score_threshold,
|
||||
|
@ -1171,6 +1274,7 @@ pub fn perform_similar(
|
|||
|
||||
let format = AttributesFormat {
|
||||
attributes_to_retrieve,
|
||||
retrieve_vectors,
|
||||
attributes_to_highlight: None,
|
||||
attributes_to_crop: None,
|
||||
crop_length: DEFAULT_CROP_LENGTH(),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue