diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index 4367650c5..e3839855b 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -35,7 +35,7 @@ pub struct Server { pub static TEST_TEMP_DIR: Lazy = Lazy::new(|| TempDir::new().unwrap()); impl Server { - fn into_shared(self) -> Server { + pub fn into_shared(self) -> Server { Server { service: self.service, _dir: self._dir, _marker: PhantomData } } @@ -327,7 +327,7 @@ impl Server { self.service.get(url).await } - pub(super) fn _index(&self, uid: impl AsRef) -> Index<'_> { + pub fn _index(&self, uid: impl AsRef) -> Index<'_> { Index { uid: uid.as_ref().to_string(), service: &self.service, diff --git a/crates/meilisearch/tests/vector/fragments.rs b/crates/meilisearch/tests/vector/fragments.rs new file mode 100644 index 000000000..337f01ca6 --- /dev/null +++ b/crates/meilisearch/tests/vector/fragments.rs @@ -0,0 +1,2248 @@ +use std::collections::BTreeMap; + +use meili_snap::{json_string, snapshot}; +use tokio::sync::OnceCell; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, Request, ResponseTemplate}; + +use crate::common::index::Index; +use crate::common::{Owned, Shared}; +use crate::json; +use crate::vector::{GetAllDocumentsOptions, Server}; + +async fn shared_index_for_fragments() -> Index<'static, Shared> { + static INDEX: OnceCell<(Server, String)> = OnceCell::const_new(); + let (server, uid) = INDEX + .get_or_init(|| async { + let (server, uid, _) = init_fragments_index().await; + (server.into_shared(), uid) + }) + .await; + server._index(uid).to_shared() +} + +pub async fn init_fragments_index() -> (Server, String, crate::common::Value) { + let mock_server = MockServer::start().await; + + let text_to_embedding: BTreeMap<_, _> = vec![ + ("kefir", [0.5, -0.5, 0.0]), + ("intel", [1.0, 1.0, 0.0]), + ("dustin", [-0.5, 0.5, 0.0]), + ("bulldog", [0.0, 0.0, 1.0]), + ("labrador", [0.0, 0.0, 
-1.0]), + ("{{ doc.", [-9999.0, -9999.0, -9999.0]), // If a template didn't render + ] + .into_iter() + .collect(); + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + let text = String::from_utf8_lossy(&req.body).to_string(); + let mut data = [0.0, 0.0, 0.0]; + for (inner_text, inner_data) in &text_to_embedding { + if text.contains(inner_text) { + for (i, &value) in inner_data.iter().enumerate() { + data[i] += value; + } + } + } + ResponseTemplate::new(200).set_body_json(json!({ "data": data })) + }) + .mount(&mock_server) + .await; + let url = mock_server.uri(); + + let server = Server::new().await; + let index = server.unique_index(); + + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + snapshot!(code, @"200 OK"); + + // Configure the index to use our mock embedder + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "withBreed": {"value": "{{ doc.name }} is a {{ doc.breed }}"}, + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "justBreed": {"value": "It's a {{ media.breed }}"}, + "justName": {"value": "{{ media.name }} is a dog"}, + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + + // Send documents + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel", "breed": "labrador"}, + {"id": 3, "name": "dustin", "breed": "bulldog"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + + let task = 
index.wait_task(value.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + + let uid = index.uid.clone(); + (server, uid, settings) +} + +#[actix_rt::test] +async fn experimental_feature_not_enabled() { + let server = Server::new().await; + let index = server.unique_index(); + + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": "http://localhost:1337", + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r#" + { + "message": "setting `indexingFragments` requires enabling the `multimodal` experimental feature. See https://github.com/orgs/meilisearch/discussions/846", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "#); +} + +#[actix_rt::test] +async fn indexing_fragments() { + let index = shared_index_for_fragments().await; + + // Make sure the documents have been indexed and their embeddings retrieved + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true 
+ } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn replace_document() { + let (server, uid, _settings) = init_fragments_index().await; + let index = server.index(uid); + + let documents = json!([ + { "id": 0, "name": "kefir", "breed": "sorry-I-forgot" }, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + + let task = index.wait_task(value.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + + // Make sure kefir now has 2 vectors + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "breed": "sorry-I-forgot", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ], + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn search_with_vector() { + let index = shared_index_for_fragments().await; + + let (value, code) = index.search_post( + json!({"vector": [1.0, 1.0, 1.0], "hybrid": 
{"semanticRatio": 1.0, "embedder": "rest"}, "limit": 1} + )).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 1, + "name": "echo" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn search_with_media() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 2, + "name": "intel", + "breed": "labrador" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_and_vector() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "vector": [1.0, 1.0, 1.0], + "media": { "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Invalid request: both `media` and `vector` parameters are present.", + "code": "invalid_search_media_and_vector", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_media_and_vector" + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_matching_multiple_fragments() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "name": "dustin", "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Error while generating embeddings: user error: Query matches multiple 
search fragments.\n - Note: First matched fragment `justBreed`.\n - Note: Second matched fragment `justName`.\n - Note: {\"q\":null,\"media\":{\"name\":\"dustin\",\"breed\":\"labrador\"}}", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_matching_no_fragment() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "ticker": "GME", "section": "portfolio" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Error while generating embeddings: user error: Query matches no search fragment.\n - Note: {\"q\":null,\"media\":{\"ticker\":\"GME\",\"section\":\"portfolio\"}}", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[actix_rt::test] +async fn search_with_query() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "q": "bulldog", + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 3, + "name": "dustin", + "breed": "bulldog" + } + ], + "query": "bulldog", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn deleting_fragments_deletes_vectors() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = 
server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": null, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (value, code) = index.settings().await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(value["embedders"], { + ".rest.url" => "[url]", + }), @r#" + { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "headers": {} + } + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + 
"regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); +} + +#[actix_rt::test] +async fn modifying_fragments_modifies_vectors() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"]["value"] = + serde_json::Value::String("{{ doc.name }} is a dog (maybe bulldog?)".to_string()); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog (maybe bulldog?)" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + 
snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn swapping_fragments() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + let basic = settings["embedders"]["rest"]["indexingFragments"]["basic"].clone(); + let with_breed = settings["embedders"]["rest"]["indexingFragments"]["withBreed"].clone(); + settings["embedders"]["rest"]["indexingFragments"]["basic"] = with_breed; + settings["embedders"]["rest"]["indexingFragments"]["withBreed"] = basic; + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + }, + "withBreed": { + "value": "{{ doc.name }} is a dog" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + 
"value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ], + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ], + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn ommitted_fragment_isnt_removed() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; // basic is removed + settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().remove("withBreed"); // withBreed isn't specified + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", 
+ "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": null + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (value, code) = index.settings().await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(value["embedders"], { + ".rest.url" => "[url]", + }), @r#" + { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "headers": {} + } + } + "#); +} + +#[actix_rt::test] +async fn fragment_insertion() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().insert( + String::from("useless"), + serde_json::json!({ + "value": "This fragment is useless" + }), + ); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { 
+ "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "useless": { + "value": "This fragment is useless" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn multiple_embedders() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + let url = 
settings["embedders"]["rest"]["url"].as_str().unwrap(); + + let settings2 = json!({ + "embedders": { + "rest2": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "withBreed": {"value": "{{ doc.name }} is a {{ doc.breed }}"}, + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + "rest3": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings2).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest2": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + }, + "rest3": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + } + }, + "searchFragments": { + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + 
"finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest2": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); + + // Remove Rest2 + + settings["embedders"]["rest2"] = serde_json::Value::Null; + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let 
value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value["status"], @r###""succeeded""###); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); + + // Remove rest's basic fragment + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; + //settings["embedders"].as_object_mut().unwrap().remove("rest2"); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value["status"], @r###""succeeded""###); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() 
}) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn remove_non_existant_embedder() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"] + .as_object_mut() + .unwrap() + .insert(String::from("non-existant"), serde_json::Value::Null); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "non-existant": null, + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "withBreed": { + "value": "{{ doc.name }} is a 
{{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); +} + +#[actix_rt::test] +async fn double_remove_embedder() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"] + .as_object_mut() + .unwrap() + .insert(String::from("rest"), serde_json::Value::Null); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": null + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": null + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); +} + +#[actix_rt::test] +async fn complex_fragment() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + 
settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().insert( + String::from("complex"), + serde_json::json!({ + "value": { + "breed": "{{ doc.breed }}", + "breeds": [ + "{{ doc.breed }}", + { + "breed": "{{ doc.breed }}", + } + ] + } + }), + ); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "complex": { + "value": { + "breed": "{{ doc.breed }}", + "breeds": [ + "{{ doc.breed }}", + { + "breed": "{{ doc.breed }}" + } + ] + } + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": 
"labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ], + [ + 0.0, + 0.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ], + [ + 0.0, + 0.0, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn both_fragments_and_document_template() { + let server = Server::new().await; + let index = server.unique_index(); + + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + snapshot!(code, @"200 OK"); + + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": "http://localhost:1337", + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "justBreed": {"value": "It's a {{ media.breed }}"}, + }, + "documentTemplate": "{{ doc.name }} is a dog", + }, + }, + }); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r#" + { + "message": "Error while generating embeddings: user error: cannot pass both fragments and a document template.\n - Note: 1 fragments declared in `indexingFragments` and 1 fragments declared in `search_fragments_len`.\n - Hint: remove the declared fragments or remove the `documentTemplate`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[actix_rt::test] +async fn set_fragments_then_document_template() { + let (server, uid, settings) = init_fragments_index().await; + let index = server.index(uid); + + let url = settings["embedders"]["rest"]["url"].as_str().unwrap(); + + let 
settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "documentTemplate": "{{ doc.name }} is a dog", + }, + }, + }); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r""); + + let (settings, code) = index.settings().await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(settings, { ".embedders.*.url" => "[url]" }), @r#" + { + "displayedAttributes": [ + "*" + ], + "searchableAttributes": [ + "*" + ], + "filterableAttributes": [], + "sortableAttributes": [], + "rankingRules": [ + "words", + "typo", + "proximity", + "attribute", + "sort", + "exactness" + ], + "stopWords": [], + "nonSeparatorTokens": [], + "separatorTokens": [], + "dictionary": [], + "synonyms": {}, + "distinctAttribute": null, + "proximityPrecision": "byWord", + "typoTolerance": { + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [], + "disableOnNumbers": false + }, + "faceting": { + "maxValuesPerFacet": 100, + "sortFacetValuesBy": { + "*": "alpha" + } + }, + "pagination": { + "maxTotalHits": 1000 + }, + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "headers": {} + } + }, + "searchCutoffMs": null, + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" + } + "#); +} diff --git 
a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 98555dfac..7f54489b6 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -1,4 +1,5 @@ mod binary_quantized; +mod fragments; #[cfg(feature = "test-ollama")] mod ollama; mod openai; diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 911f51865..4124aa540 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -101,6 +101,10 @@ impl Setting { matches!(self, Self::NotSet) } + pub const fn is_reset(&self) -> bool { + matches!(self, Self::Reset) + } + /// If `Self` is `Reset`, then map self to `Set` with the provided `val`. pub fn or_reset(self, val: T) -> Self { match self { @@ -1213,6 +1217,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // new config EitherOrBoth::Right((name, mut setting)) => { tracing::debug!(embedder = name, "new embedder"); + // if we are asked to reset an embedder that doesn't exist, just ignore it + if setting.is_reset() { + continue; + } // apply the default source in case the source was not set so that it gets validated crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting); crate::vector::settings::EmbeddingSettings::apply_default_openai_model(