diff --git a/Cargo.lock b/Cargo.lock index bf0415915..c72053be7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,7 +55,7 @@ dependencies = [ "encoding_rs", "flate2", "futures-core", - "h2", + "h2 0.3.26", "http 0.2.11", "httparse", "httpdate", @@ -403,6 +403,16 @@ dependencies = [ "thiserror", ] +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "async-trait" version = "0.1.81" @@ -414,6 +424,12 @@ dependencies = [ "syn 2.0.60", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.2.0" @@ -1377,6 +1393,24 @@ dependencies = [ "syn 2.0.60", ] +[[package]] +name = "deadpool" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb84100978c1c7b37f09ed3ce3e5f843af02c2a2c431bae5b19230dad2c1b490" +dependencies = [ + "async-trait", + "deadpool-runtime", + "num_cpus", + "tokio", +] + +[[package]] +name = "deadpool-runtime" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" + [[package]] name = "debugid" version = "0.8.0" @@ -2231,6 +2265,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 1.1.0", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2441,9 +2494,11 @@ dependencies = 
[ "bytes", "futures-channel", "futures-util", + "h2 0.4.5", "http 1.1.0", "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -3423,6 +3478,7 @@ dependencies = [ "url", "urlencoding", "uuid", + "wiremock", "yaup", "zip 2.1.3", ] @@ -6281,6 +6337,30 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "wiremock" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec874e1eef0df2dcac546057fe5e29186f09c378181cd7b635b4b7bcc98e9d81" +dependencies = [ + "assert-json-diff", + "async-trait", + "base64 0.21.7", + "deadpool", + "futures", + "http 1.1.0", + "http-body-util", + "hyper", + "hyper-util", + "log", + "once_cell", + "regex", + "serde", + "serde_json", + "tokio", + "url", +] + [[package]] name = "wyz" version = "0.5.1" diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index ca9bca820..c9ea70bb8 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -3047,6 +3047,8 @@ mod tests { api_key: Setting::Set(S("My super secret")), url: Setting::Set(S("http://localhost:7777")), dimensions: Setting::Set(4), + request: Setting::Set(serde_json::json!("{{text}}")), + response: Setting::Set(serde_json::json!("{{embedding}}")), ..Default::default() }; embedders.insert(S("default"), Setting::Set(embedding_settings)); @@ -5006,6 +5008,8 @@ mod tests { api_key: Setting::Set(S("My super secret")), url: Setting::Set(S("http://localhost:7777")), dimensions: Setting::Set(384), + request: Setting::Set(serde_json::json!("{{text}}")), + response: Setting::Set(serde_json::json!("{{embedding}}")), ..Default::default() }; embedders.insert(S("A_fakerest"), Setting::Set(embedding_settings)); diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap index bc16fc8be..629ea87dc 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap 
+++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap @@ -8,7 +8,9 @@ expression: task.details "source": "rest", "apiKey": "MyXXXX...", "dimensions": 384, - "url": "http://localhost:7777" + "url": "http://localhost:7777", + "request": "{{text}}", + "response": "{{embedding}}" }, "B_small_hf": { "source": "huggingFace", diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap index 013115a58..97a67ea0a 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap @@ -8,16 +8,7 @@ expression: fakerest_config.embedder_options "distribution": null, "dimensions": 384, "url": "http://localhost:7777", - "query": null, - "input_field": [ - "input" - ], - "path_to_embeddings": [ - "data" - ], - "embedding_object": [ - "embedding" - ], - "input_type": "text" + "request": "{{text}}", + "response": "{{embedding}}" } } diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap index bc16fc8be..629ea87dc 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap @@ -8,7 +8,9 @@ expression: task.details "source": "rest", "apiKey": "MyXXXX...", "dimensions": 384, - "url": "http://localhost:7777" + "url": "http://localhost:7777", + "request": "{{text}}", + "response": "{{embedding}}" }, "B_small_hf": { "source": "huggingFace", diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap index 72a25f915..2b76f46a6 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap +++ 
b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap @@ -8,7 +8,9 @@ expression: task.details "source": "rest", "apiKey": "MyXXXX...", "dimensions": 4, - "url": "http://localhost:7777" + "url": "http://localhost:7777", + "request": "{{text}}", + "response": "{{embedding}}" } } } diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap index f7ae1c00a..a88f94df3 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap @@ -1,6 +1,6 @@ --- source: index-scheduler/src/lib.rs -expression: embedding_config.embedder_options +expression: config.embedder_options --- { "Rest": { @@ -8,16 +8,7 @@ expression: embedding_config.embedder_options "distribution": null, "dimensions": 4, "url": "http://localhost:7777", - "query": null, - "input_field": [ - "input" - ], - "path_to_embeddings": [ - "data" - ], - "embedding_object": [ - "embedding" - ], - "input_type": "text" + "request": "{{text}}", + "response": "{{embedding}}" } } diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap index 72a25f915..2b76f46a6 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap @@ -8,7 +8,9 @@ expression: task.details "source": "rest", "apiKey": "MyXXXX...", "dimensions": 4, - "url": "http://localhost:7777" + "url": "http://localhost:7777", + "request": "{{text}}", + "response": "{{embedding}}" } } } diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap index 6b285ba56..6f2da1f17 100644 --- 
a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, 
distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, 
dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 
00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- @@ -46,4 +46,3 @@ doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1 ### File Store: ---------------------------------------------------------------------- - diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap index 6f23d96fd..569556a19 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: 
NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: 
NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 
{uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- @@ -45,4 +45,3 @@ doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1 00000000-0000-0000-0000-000000000001 ---------------------------------------------------------------------- - diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap index 5dcb5a4f7..b626d8bc5 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), 
dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: 
NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: 
Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: @@ -42,4 +42,3 @@ doggos: { number_of_documents: 1, field_distribution: {"_vectors": 1, "breed": 1 ### File Store: ---------------------------------------------------------------------- - diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap index 80521df42..65f758f32 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: 
NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, 
document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: 
NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: @@ -41,4 +41,3 @@ doggos: { number_of_documents: 0, field_distribution: {} } 00000000-0000-0000-0000-000000000000 ---------------------------------------------------------------------- - diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap index 97b669f44..9c628461d 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: 
Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: 
NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: 
WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] @@ -33,4 +33,3 @@ doggos [0,] ### File Store: ---------------------------------------------------------------------- - diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap index f3ce4b104..eddf6d7e8 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: 
WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, 
embedding_object: NotSet, input_type: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: 
NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] @@ -37,4 +37,3 @@ doggos: { number_of_documents: 0, field_distribution: {} } ### File Store: ---------------------------------------------------------------------- - diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap index dad082667..7873fb6cf 100644 --- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: 
WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, 
non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] @@ -33,4 +33,3 @@ doggos [0,] ### File Store: ---------------------------------------------------------------------- - diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap index 271db8765..8a4838094 100644 --- 
a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, 
input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] @@ -37,4 +37,3 @@ 
doggos: { number_of_documents: 0, field_distribution: {} } ### File Store: ---------------------------------------------------------------------- - diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 7db47f7ad..e614ecc6a 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -114,6 +114,7 @@ maplit = "1.0.2" meili-snap = { path = "../meili-snap" } temp-env = "0.3.6" urlencoding = "2.1.3" +wiremock = "0.6.0" yaup = "0.3.1" [build-dependencies] diff --git a/meilisearch/tests/common/mod.rs b/meilisearch/tests/common/mod.rs index adcef9fe6..ee1d8aa6e 100644 --- a/meilisearch/tests/common/mod.rs +++ b/meilisearch/tests/common/mod.rs @@ -80,7 +80,14 @@ impl Display for Value { write!( f, "{}", - json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]", ".processingTimeMs" => "[duration]" }) + json_string!(self, { + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".duration" => "[duration]", + ".processingTimeMs" => "[duration]", + ".details.embedders.*.url" => "[url]" + }) ) } } diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 95d3bb933..ee4181694 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -497,7 +497,7 @@ async fn query_combination() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: attempt to embed the following text in a configuration where embeddings must be user provided: \"Captain\"", + "message": "Error while generating embeddings: user error: attempt to embed the following text in a configuration where embeddings must be user provided:\n - `Captain`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/meilisearch/tests/settings/get_settings.rs 
b/meilisearch/tests/settings/get_settings.rs index 5571a1e03..239151197 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -116,6 +116,8 @@ async fn secrets_are_hidden_in_settings() { "url": "https://localhost:7777", "apiKey": "My super secret value you will never guess", "dimensions": 4, + "request": "{{text}}", + "response": "{{embedding}}" } } })) @@ -189,17 +191,8 @@ async fn secrets_are_hidden_in_settings() { "dimensions": 4, "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", "url": "https://localhost:7777", - "query": null, - "inputField": [ - "input" - ], - "pathToEmbeddings": [ - "data" - ], - "embeddingObject": [ - "embedding" - ], - "inputType": "text" + "request": "{{text}}", + "response": "{{embedding}}" } }, "searchCutoffMs": null @@ -215,7 +208,9 @@ async fn secrets_are_hidden_in_settings() { "source": "rest", "apiKey": "My suXXXXXX...", "dimensions": 4, - "url": "https://localhost:7777" + "url": "https://localhost:7777", + "request": "{{text}}", + "response": "{{embedding}}" } } } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index 0419f4533..43c2ff606 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -1,3 +1,4 @@ +mod rest; mod settings; use meili_snap::{json_string, snapshot}; @@ -505,7 +506,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: user error: attempt to embed the following text in a configuration where embeddings must be user provided: \" id: 42\\n name: kefir\\n _vectors: \\n _vectors.manual: \\n _vectors.manual.regenerate: \\n _vectors.manual.embeddings: \\n\"\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", + "message": "While embedding 
documents for embedder `manual`: user error: attempt to embed the following text in a configuration where embeddings must be user provided:\n - ` id: 42\n name: kefir\n _vectors: \n _vectors.manual: \n _vectors.manual.regenerate: \n _vectors.manual.embeddings: \n`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -534,7 +535,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: user error: attempt to embed the following text in a configuration where embeddings must be user provided: \" id: 42\\n name: kefir\\n _vectors: \\n _vectors.manual: \\n _vectors.manual.regenerate: \\n _vectors.manual.embeddings: \\n _vector: manaul000\\n _vector.manaul: \\n\"\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", + "message": "While embedding documents for embedder `manual`: user error: attempt to embed the following text in a configuration where embeddings must be user provided:\n - ` id: 42\n name: kefir\n _vectors: \n _vectors.manual: \n _vectors.manual.regenerate: \n _vectors.manual.embeddings: \n _vector: manaul000\n _vector.manaul: \n`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -563,7 +564,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder 
`manual`: user error: attempt to embed the following text in a configuration where embeddings must be user provided: \" id: 42\\n name: kefir\\n _vectors: manaul000\\n _vectors.manual: \\n _vectors.manual.regenerate: \\n _vectors.manual.embeddings: \\n _vectors.manaul: \\n\"\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", + "message": "While embedding documents for embedder `manual`: user error: attempt to embed the following text in a configuration where embeddings must be user provided:\n - ` id: 42\n name: kefir\n _vectors: manaul000\n _vectors.manual: \n _vectors.manual.regenerate: \n _vectors.manual.embeddings: \n _vectors.manaul: \n`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/meilisearch/tests/vector/rest.rs b/meilisearch/tests/vector/rest.rs new file mode 100644 index 000000000..dd22baad2 --- /dev/null +++ b/meilisearch/tests/vector/rest.rs @@ -0,0 +1,1734 @@ +use std::sync::atomic::{AtomicUsize, Ordering}; + +use meili_snap::{json_string, snapshot}; +use reqwest::IntoUrl; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, Request, ResponseTemplate}; + +use crate::common::{Server, Value}; +use crate::json; +use crate::vector::GetAllDocumentsOptions; + +async fn create_mock() -> (MockServer, Value) { + let mock_server = MockServer::start().await; + + let counter = AtomicUsize::new(0); + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |_req: &Request| { + let counter = counter.fetch_add(1, Ordering::Relaxed); + ResponseTemplate::new(200).set_body_json(json!({ "data": 
vec![counter; 3] })) + }) + .mount(&mock_server) + .await; + let url = mock_server.uri(); + + let embedder_settings = json!({ + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{text}}", + "response": { + "data": "{{embedding}}" + } + }); + + (mock_server, embedder_settings) +} + +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +struct MultipleRequest { + input: Vec, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +struct MultipleResponse { + output: Vec, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +struct SingleResponse { + text: String, + embedding: Vec, +} + +async fn create_mock_multiple() -> (MockServer, Value) { + let mock_server = MockServer::start().await; + + let counter = AtomicUsize::new(0); + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + let req: MultipleRequest = match req.body_json() { + Ok(req) => req, + Err(error) => { + return ResponseTemplate::new(400).set_body_json(json!({ + "error": format!("Invalid request: {error}") + })); + } + }; + + let output = req + .input + .into_iter() + .map(|text| SingleResponse { + text, + embedding: vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3], + }) + .collect(); + + let response = MultipleResponse { output }; + + ResponseTemplate::new(200).set_body_json(response) + }) + .mount(&mock_server) + .await; + let url = mock_server.uri(); + + let embedder_settings = json!({ + "source": "rest", + "url": url, + "dimensions": 3, + "request": { + "input": ["{{text}}", "{{..}}"] + }, + "response": { + "output": [ + { + "embedding": "{{embedding}}" + }, + "{{..}}" + ] + } + }); + + (mock_server, embedder_settings) +} + +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +struct SingleRequest { + input: String, +} + +async fn create_mock_single_response_in_array() -> (MockServer, Value) { + let mock_server = MockServer::start().await; + + let counter = AtomicUsize::new(0); + + 
Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + let req: SingleRequest = match req.body_json() { + Ok(req) => req, + Err(error) => { + return ResponseTemplate::new(400).set_body_json(json!({ + "error": format!("Invalid request: {error}") + })); + } + }; + + let output = vec![SingleResponse { + text: req.input, + embedding: vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3], + }]; + + let response = MultipleResponse { output }; + + ResponseTemplate::new(200).set_body_json(response) + }) + .mount(&mock_server) + .await; + let url = mock_server.uri(); + + let embedder_settings = json!({ + "source": "rest", + "url": url, + "dimensions": 3, + "request": { + "input": "{{text}}" + }, + "response": { + "output": [ + { + "embedding": "{{embedding}}" + } + ] + } + }); + + (mock_server, embedder_settings) +} + +async fn create_mock_raw() -> (MockServer, Value) { + let mock_server = MockServer::start().await; + + let counter = AtomicUsize::new(0); + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + let _req: String = match req.body_json() { + Ok(req) => req, + Err(error) => { + return ResponseTemplate::new(400).set_body_json(json!({ + "error": format!("Invalid request: {error}") + })); + } + }; + + let output = vec![counter.fetch_add(1, Ordering::Relaxed) as f32; 3]; + + ResponseTemplate::new(200).set_body_json(output) + }) + .mount(&mock_server) + .await; + let url = mock_server.uri(); + + let embedder_settings = json!({ + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{text}}", + "response": "{{embedding}}" + }); + + (mock_server, embedder_settings) +} + +pub async fn post(url: T) -> reqwest::Result { + reqwest::Client::builder().build()?.post(url).send().await +} + +#[actix_rt::test] +async fn dummy_testing_the_mock() { + let (mock, _setting) = create_mock().await; + let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); + snapshot!(body, 
@r###"{"data":[0,0,0]}"###); + let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[1,1,1]}"###); + let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[2,2,2]}"###); + let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[3,3,3]}"###); + let body = post(&mock.uri()).await.unwrap().text().await.unwrap(); + snapshot!(body, @r###"{"data":[4,4,4]}"###); +} + +async fn get_server_vector() -> Server { + let server = Server::new().await; + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false, + "editDocumentsByFunction": false, + "containsFilter": false + } + "###); + server +} + +#[actix_rt::test] +async fn bad_request() { + let (mock, _setting) = create_mock().await; + + let server = get_server_vector().await; + let index = server.index("doggo"); + + // No placeholder string appear in the template + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "54", "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A repeat string appears inside a repeated value + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": { + "input": [ + { + "input": [ + "{{text}}", + "{{..}}" + ] + }, + "{{..}}" + ] + }, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + 
snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request.input.input`: \"{{..}}\" appears nested inside of a value that is itself repeated", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A repeat string appears outside of an array + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": { + "input": { + "input": "{{text}}", + "repeat": "{{..}}" + } + }, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request.input.repeat`: \"{{..}}\" appears outside of an array", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A repeat string appears in an array, but not in the second position + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": { + "input": [ + "{{..}}", + "{{text}}" + ] + }, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #0", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": { + "input": [ + "{{text}}", + "42", + "{{..}}", + ] + }, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, 
@r###" + { + "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #2", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A repeated value lacks a placeholder + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": { + "input": [ + "42", + "{{..}}", + ] + }, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request.input[0]`: Expected \"{{text}}\" inside of the repeated value", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // Multiple repeat strings appear in the template + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": { + "input": [ + "{{text}}", + "{{..}}", + ], + "data": [ + "42", + "{{..}}", + ], + }, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{..}}\", but it was already present in `request.input`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // Multiple placeholder strings appear in the template + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": { + "input": "{{text}}", + "data": "{{text}}", + }, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + 
snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": + {"repeated": [{ + "input": "{{text}}", + "data": [42, "{{text}}"], + }, "{{..}}"]}, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request.repeated.data[1]`: Found \"{{text}}\", but it was already present in `request.repeated.input`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A placeholder appears both inside a repeated value and outside of it + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": { + "input": ["{{text}}", "{{..}}"], + "data": "{{text}}", + }, "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input[0]` (repeated)", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); +} + +#[actix_rt::test] +async fn bad_response() { + let (mock, _setting) = create_mock().await; + + let server = get_server_vector().await; + let index = server.index("doggo"); + + // No placeholder string appear in the template + let (response, code) = index + 
.update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": "42" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response`: \"{{embedding}}\" not found", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A repeat string appears inside a repeated value + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "output": [ + { + "output": [ + "{{embedding}}", + "{{..}}" + ] + }, + "{{..}}" + ] + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.output.output`: \"{{..}}\" appears nested inside of a value that is itself repeated", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A repeat string appears outside of an array + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "output": { + "output": "{{embedding}}", + "repeat": "{{..}}" + } + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.output.repeat`: \"{{..}}\" appears outside of an array", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A repeat string appears in an array, but not in the second position + let (response, code) = index + 
.update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "output": [ + "{{..}}", + "{{embedding}}" + ] + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.output`: \"{{..}}\" expected at position #1, but found at position #0", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "output": [ + "{{embedding}}", + "42", + "{{..}}", + ] + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.output`: \"{{..}}\" expected at position #1, but found at position #2", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A repeated value lacks a placeholder + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "output": [ + "42", + "{{..}}", + ] + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.output[0]`: Expected \"{{embedding}}\" inside of the repeated value", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // Multiple repeat strings appear in the template + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": 
"rest", "url": mock.uri(), "response": { + "output": [ + "{{embedding}}", + "{{..}}", + ], + "data": [ + "42", + "{{..}}", + ], + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.data`: Found \"{{..}}\", but it was already present in `response.output`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // Multiple placeholder strings appear in the template + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "output": [{"type": "data", "data": "{{embedding}}"}], + "data": "{{embedding}}", + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.data`: Found \"{{embedding}}\", but it was already present in `response.output[0].data`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": + {"repeated": [{ + "output": "{{embedding}}", + "data": [42, "{{embedding}}"], + }, "{{..}}"]}, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.repeated.data[1]`: Found \"{{embedding}}\", but it was already present in `response.repeated.output`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // A placeholder appears both 
inside a repeated value and outside of it + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "output": ["{{embedding}}", "{{..}}"], + "data": "{{embedding}}", + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response.data`: Found \"{{embedding}}\", but it was already present in `response.output[0]` (repeated)", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // request sends a single text but response expects multiple embeddings + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "data": ["{{embedding}}", "{{..}}"], + }, "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response`: `response` has multiple embeddings, but `request` has only one text to embed", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + // request sends multiple texts but response expects a single embedding + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": { + "data": "{{embedding}}", + }, "request": {"data": ["{{text}}", "{{..}}"]} }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response`: `response` has a single embedding, but `request` has multiple texts to embed", + "code": "vector_embedding_error", + "type": "invalid_request", + 
"link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); +} + +#[actix_rt::test] +async fn bad_settings() { + let (mock, _setting) = create_mock().await; + + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "request": 42, "response": 42 }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": "kefir", "request": 42, "response": 42 }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.rest.url`: could not parse `kefir`: relative URL without a base", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "response": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.rest`: Missing field `request` (note: this field is mandatory for source rest)", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}" }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + 
snapshot!(response, @r###" + { + "message": "`.embedders.rest`: Missing field `response` (note: this field is mandatory for source rest)", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": 42 }), + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Error while generating embeddings: user error: in `response`: \"{{embedding}}\" not found", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 0, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "url": "[url]", + "request": "{{text}}", + "response": "{{embedding}}" + } + } + }, + "error": { + "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + 
"startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Validate an embedder with a bad dimension of 2 instead of 3 + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": { "data": "{{embedding}}" }, "dimensions": 2 }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + + let (response, code) = index.add_documents(json!( { "id": 1, "name": "kefir" }), None).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 2, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn add_vector_and_user_provided() { + let (_mock, setting) = create_mock().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel"}, + ]); + let (value, code) = index.add_documents(documents, 
None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 3, + "indexedDocuments": 3 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 3 + } + "###); +} + +#[actix_rt::test] +async fn server_returns_bad_request() { + let (mock, _setting) = create_mock_multiple().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": "{{embedding}}" }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 0, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "url": "[url]", + "request": "{{text}}", + "response": "{{embedding}}" + } + } + }, + "error": { + "message": "Error while generating embeddings: 
runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": "{{embedding}}", "dimensions": 3 }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "request": "{{text}}", + "response": "{{embedding}}" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index.add_documents(json!( { "id": 1, "name": "kefir" }), None).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 2, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the 
remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\" id: 1\\\\n name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 24\"}`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn server_returns_bad_response() { + let (mock, _setting) = create_mock_multiple().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), + "request": { + "input": ["{{text}}", "{{..}}"] + }, + "response": ["{{embedding}}", "{{..}}"] }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 0, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "url": "[url]", + "request": { + "input": [ + "{{text}}", + "{{..}}" + ] + }, + "response": [ + "{{embedding}}", + "{{..}}" + ] + } + } + }, + "error": { + "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let 
(response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), + "request": { + "input": ["{{text}}", "{{..}}"] + }, + "response": { + "output": ["{{embedding}}", "{{..}}"] + } }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "url": "[url]", + "request": { + "input": [ + "{{text}}", + "{{..}}" + ] + }, + "response": { + "output": [ + "{{embedding}}", + "{{..}}" + ] + } + } + } + }, + "error": { + "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), + "request": { + "input": ["{{text}}"] + }, + "response": { + "output": "{{embedding}}" + } }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 2, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "url": "[url]", + "request": { + "input": [ + "{{text}}" + ] + }, + 
"response": { + "output": "{{embedding}}" + } + } + } + }, + "error": { + "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected f32", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), + "request": { + "input": ["{{text}}", "{{..}}"] + }, + "response": { + "output": [{ "embedding": + { + "data": "{{embedding}}" + } + }, "{{..}}"] + } }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 3, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "url": "[url]", + "request": { + "input": [ + "{{text}}", + "{{..}}" + ] + }, + "response": { + "output": [ + { + "embedding": { + "data": "{{embedding}}" + } + }, + "{{..}}" + ] + } + } + } + }, + "error": { + "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": 
"https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": json!({ "source": "rest", "url": mock.uri(), + "request": { + "input": ["{{text}}"] + }, + "response": { + "output": [ + { "embeddings": + { + "data": "{{embedding}}" + } + } + ] + } }), + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": 4, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "url": "[url]", + "request": { + "input": [ + "{{text}}" + ] + }, + "response": { + "output": [ + { + "embeddings": { + "data": "{{embedding}}" + } + } + ] + } + } + } + }, + "error": { + "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn server_returns_multiple() { + let (_mock, setting) = create_mock_multiple().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": setting, + }, + })) + .await; + 
snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 3, + "indexedDocuments": 3 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 3 + } + "###); +} + +#[actix_rt::test] +async fn server_single_input_returns_in_array() { + let (_mock, setting) = create_mock_single_response_in_array().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let 
documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 3, + "indexedDocuments": 3 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 3 + } + "###); +} + +#[actix_rt::test] +async fn server_raw() { + let (_mock, setting) = create_mock_raw().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel"}, + ]); + let (value, code) = 
index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 3, + "indexedDocuments": 3 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 3 + } + "###); +} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7f07dafed..2521b778f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2741,11 +2741,8 @@ mod tests { dimensions: Setting::Set(3), document_template: Setting::NotSet, url: Setting::NotSet, - query: Setting::NotSet, - input_field: Setting::NotSet, - path_to_embeddings: Setting::NotSet, - embedding_object: Setting::NotSet, - input_type: Setting::NotSet, + request: Setting::NotSet, + response: Setting::NotSet, distribution: Setting::NotSet, }), ); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 54a25abd5..448c74fd8 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1484,11 +1484,8 @@ fn 
validate_prompt( dimensions, document_template: Setting::Set(template), url, - query, - input_field, - path_to_embeddings, - embedding_object, - input_type, + request, + response, distribution, }) => { // validate @@ -1504,11 +1501,8 @@ fn validate_prompt( dimensions, document_template: Setting::Set(template), url, - query, - input_field, - path_to_embeddings, - embedding_object, - input_type, + request, + response, distribution, })) } @@ -1530,11 +1524,8 @@ pub fn validate_embedding_settings( dimensions, document_template, url, - query, - input_field, - path_to_embeddings, - embedding_object, - input_type, + request, + response, distribution, } = settings; @@ -1553,6 +1544,15 @@ pub fn validate_embedding_settings( })?; } + if let Some(request) = request.as_ref().set() { + let request = crate::vector::rest::Request::new(request.to_owned()) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + if let Some(response) = response.as_ref().set() { + crate::vector::rest::Response::new(response.to_owned(), &request) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + } + } + let Some(inferred_source) = source.set() else { return Ok(Setting::Set(EmbeddingSettings { source, @@ -1562,11 +1562,8 @@ pub fn validate_embedding_settings( dimensions, document_template, url, - query, - input_field, - path_to_embeddings, - embedding_object, - input_type, + request, + response, distribution, })); }; @@ -1574,21 +1571,8 @@ pub fn validate_embedding_settings( EmbedderSource::OpenAi => { check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; - check_unset(&query, EmbeddingSettings::QUERY, inferred_source, name)?; - check_unset(&input_field, EmbeddingSettings::INPUT_FIELD, inferred_source, name)?; - check_unset( - &path_to_embeddings, - EmbeddingSettings::PATH_TO_EMBEDDINGS, - inferred_source, - name, - )?; - check_unset( - &embedding_object, - EmbeddingSettings::EMBEDDING_OBJECT, - inferred_source, - name, - )?; - 
check_unset(&input_type, EmbeddingSettings::INPUT_TYPE, inferred_source, name)?; + check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; + check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; if let Setting::Set(model) = &model { let model = crate::vector::openai::EmbeddingModel::from_name(model.as_str()) @@ -1626,42 +1610,16 @@ pub fn validate_embedding_settings( check_set(&model, EmbeddingSettings::MODEL, inferred_source, name)?; check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; - check_unset(&query, EmbeddingSettings::QUERY, inferred_source, name)?; - check_unset(&input_field, EmbeddingSettings::INPUT_FIELD, inferred_source, name)?; - check_unset( - &path_to_embeddings, - EmbeddingSettings::PATH_TO_EMBEDDINGS, - inferred_source, - name, - )?; - check_unset( - &embedding_object, - EmbeddingSettings::EMBEDDING_OBJECT, - inferred_source, - name, - )?; - check_unset(&input_type, EmbeddingSettings::INPUT_TYPE, inferred_source, name)?; + check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; + check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; } EmbedderSource::HuggingFace => { check_unset(&api_key, EmbeddingSettings::API_KEY, inferred_source, name)?; check_unset(&dimensions, EmbeddingSettings::DIMENSIONS, inferred_source, name)?; check_unset(&url, EmbeddingSettings::URL, inferred_source, name)?; - check_unset(&query, EmbeddingSettings::QUERY, inferred_source, name)?; - check_unset(&input_field, EmbeddingSettings::INPUT_FIELD, inferred_source, name)?; - check_unset( - &path_to_embeddings, - EmbeddingSettings::PATH_TO_EMBEDDINGS, - inferred_source, - name, - )?; - check_unset( - &embedding_object, - EmbeddingSettings::EMBEDDING_OBJECT, - inferred_source, - name, - )?; - check_unset(&input_type, EmbeddingSettings::INPUT_TYPE, inferred_source, name)?; + check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; + check_unset(&response, 
EmbeddingSettings::RESPONSE, inferred_source, name)?; } EmbedderSource::UserProvided => { check_unset(&model, EmbeddingSettings::MODEL, inferred_source, name)?; @@ -1676,26 +1634,15 @@ pub fn validate_embedding_settings( check_set(&dimensions, EmbeddingSettings::DIMENSIONS, inferred_source, name)?; check_unset(&url, EmbeddingSettings::URL, inferred_source, name)?; - check_unset(&query, EmbeddingSettings::QUERY, inferred_source, name)?; - check_unset(&input_field, EmbeddingSettings::INPUT_FIELD, inferred_source, name)?; - check_unset( - &path_to_embeddings, - EmbeddingSettings::PATH_TO_EMBEDDINGS, - inferred_source, - name, - )?; - check_unset( - &embedding_object, - EmbeddingSettings::EMBEDDING_OBJECT, - inferred_source, - name, - )?; - check_unset(&input_type, EmbeddingSettings::INPUT_TYPE, inferred_source, name)?; + check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; + check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; } EmbedderSource::Rest => { check_unset(&model, EmbeddingSettings::MODEL, inferred_source, name)?; check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; check_set(&url, EmbeddingSettings::URL, inferred_source, name)?; + check_set(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; + check_set(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; } } Ok(Setting::Set(EmbeddingSettings { @@ -1706,11 +1653,8 @@ pub fn validate_embedding_settings( dimensions, document_template, url, - query, - input_field, - path_to_embeddings, - embedding_object, - input_type, + request, + response, distribution, })) } diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index 975561dc3..7e1cb8752 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -4,6 +4,7 @@ use std::path::PathBuf; use hf_hub::api::sync::ApiError; use super::parsed_vectors::ParsedVectorsDiff; +use super::rest::ConfigurationSource; use crate::error::FaultSource; use 
crate::{FieldDistribution, PanicCatched}; @@ -45,48 +46,57 @@ pub struct EmbedError { #[derive(Debug, thiserror::Error)] pub enum EmbedErrorKind { - #[error("could not tokenize: {0}")] + #[error("could not tokenize:\n - {0}")] Tokenize(Box), - #[error("unexpected tensor shape: {0}")] + #[error("unexpected tensor shape:\n - {0}")] TensorShape(candle_core::Error), - #[error("unexpected tensor value: {0}")] + #[error("unexpected tensor value:\n - {0}")] TensorValue(candle_core::Error), - #[error("could not run model: {0}")] + #[error("could not run model:\n - {0}")] ModelForward(candle_core::Error), - #[error("attempt to embed the following text in a configuration where embeddings must be user provided: {0:?}")] + #[error("attempt to embed the following text in a configuration where embeddings must be user provided:\n - `{0}`")] ManualEmbed(String), - #[error("model not found. Meilisearch will not automatically download models from the Ollama library, please pull the model manually: {0:?}")] + #[error("model not found. 
Meilisearch will not automatically download models from the Ollama library, please pull the model manually{}", option_info(.0.as_deref(), "server replied with "))] OllamaModelNotFoundError(Option), - #[error("error deserialization the response body as JSON: {0}")] + #[error("error deserialization the response body as JSON:\n - {0}")] RestResponseDeserialization(std::io::Error), - #[error("component `{0}` not found in path `{1}` in response: `{2}`")] - RestResponseMissingEmbeddings(String, String, String), - #[error("unexpected format of the embedding response: {0}")] - RestResponseFormat(serde_json::Error), #[error("expected a response containing {0} embeddings, got only {1}")] RestResponseEmbeddingCount(usize, usize), - #[error("could not authenticate against embedding server: {0:?}")] + #[error("could not authenticate against embedding server{}", option_info(.0.as_deref(), "server replied with "))] RestUnauthorized(Option), - #[error("sent too many requests to embedding server: {0:?}")] + #[error("sent too many requests to embedding server{}", option_info(.0.as_deref(), "server replied with "))] RestTooManyRequests(Option), - #[error("sent a bad request to embedding server: {0:?}")] - RestBadRequest(Option), - #[error("received internal error from embedding server: {0:?}")] + #[error("sent a bad request to embedding server{}{}", + if ConfigurationSource::User == *.1 { + "\n - Hint: check that the `request` in the embedder configuration matches the remote server's API" + } else { + "" + }, + option_info(.0.as_deref(), "server replied with "))] + RestBadRequest(Option, ConfigurationSource), + #[error("received internal error HTTP {0} from embedding server{}", option_info(.1.as_deref(), "server replied with "))] RestInternalServerError(u16, Option), - #[error("received HTTP {0} from embedding server: {0:?}")] + #[error("received unexpected HTTP {0} from embedding server{}", option_info(.1.as_deref(), "server replied with "))] RestOtherStatusCode(u16, Option), - 
#[error("could not reach embedding server: {0}")] + #[error("could not reach embedding server:\n - {0}")] RestNetwork(ureq::Transport), - #[error("was expected '{}' to be an object in query '{0}'", .1.join("."))] - RestNotAnObject(serde_json::Value, Vec), - #[error("while embedding tokenized, was expecting embeddings of dimension `{0}`, got embeddings of dimensions `{1}`")] - OpenAiUnexpectedDimension(usize, usize), + #[error("error extracting embeddings from the response:\n - {0}")] + RestExtractionError(String), + #[error("was expecting embeddings of dimension `{0}`, got embeddings of dimensions `{1}`")] + UnexpectedDimension(usize, usize), #[error("no embedding was produced")] MissingEmbedding, #[error(transparent)] PanicInThreadPool(#[from] PanicCatched), } +fn option_info(info: Option<&str>, prefix: &str) -> String { + match info { + Some(info) => format!("\n - {prefix}`{info}`"), + None => String::new(), + } +} + impl EmbedError { pub fn tokenize(inner: Box) -> Self { Self { kind: EmbedErrorKind::Tokenize(inner), fault: FaultSource::Runtime } @@ -119,28 +129,6 @@ impl EmbedError { } } - pub(crate) fn rest_response_missing_embeddings>( - response: serde_json::Value, - component: &str, - response_field: &[S], - ) -> EmbedError { - let response_field: Vec<&str> = response_field.iter().map(AsRef::as_ref).collect(); - let response_field = response_field.join("."); - - Self { - kind: EmbedErrorKind::RestResponseMissingEmbeddings( - component.to_owned(), - response_field, - serde_json::to_string_pretty(&response).unwrap_or_default(), - ), - fault: FaultSource::Undecided, - } - } - - pub(crate) fn rest_response_format(error: serde_json::Error) -> EmbedError { - Self { kind: EmbedErrorKind::RestResponseFormat(error), fault: FaultSource::Undecided } - } - pub(crate) fn rest_response_embedding_count(expected: usize, got: usize) -> EmbedError { Self { kind: EmbedErrorKind::RestResponseEmbeddingCount(expected, got), @@ -159,8 +147,14 @@ impl EmbedError { } } - pub(crate) 
fn rest_bad_request(error_response: Option) -> EmbedError { - Self { kind: EmbedErrorKind::RestBadRequest(error_response), fault: FaultSource::User } + pub(crate) fn rest_bad_request( + error_response: Option, + configuration_source: ConfigurationSource, + ) -> EmbedError { + Self { + kind: EmbedErrorKind::RestBadRequest(error_response, configuration_source), + fault: FaultSource::User, + } } pub(crate) fn rest_internal_server_error( @@ -184,22 +178,19 @@ impl EmbedError { Self { kind: EmbedErrorKind::RestNetwork(transport), fault: FaultSource::Runtime } } - pub(crate) fn rest_not_an_object( - query: serde_json::Value, - input_path: Vec, - ) -> EmbedError { - Self { kind: EmbedErrorKind::RestNotAnObject(query, input_path), fault: FaultSource::User } - } - - pub(crate) fn openai_unexpected_dimension(expected: usize, got: usize) -> EmbedError { + pub(crate) fn rest_unexpected_dimension(expected: usize, got: usize) -> EmbedError { Self { - kind: EmbedErrorKind::OpenAiUnexpectedDimension(expected, got), + kind: EmbedErrorKind::UnexpectedDimension(expected, got), fault: FaultSource::Runtime, } } pub(crate) fn missing_embedding() -> EmbedError { Self { kind: EmbedErrorKind::MissingEmbedding, fault: FaultSource::Undecided } } + + pub(crate) fn rest_extraction_error(error: String) -> EmbedError { + Self { kind: EmbedErrorKind::RestExtractionError(error), fault: FaultSource::Runtime } + } } #[derive(Debug, thiserror::Error)] @@ -290,10 +281,17 @@ impl NewEmbedderError { fault: FaultSource::Runtime, } } + + pub(crate) fn rest_could_not_parse_template(message: String) -> NewEmbedderError { + Self { + kind: NewEmbedderErrorKind::CouldNotParseTemplate(message), + fault: FaultSource::User, + } + } } #[derive(Debug, thiserror::Error)] -#[error("could not open config at {filename:?}: {inner}")] +#[error("could not open config at {filename}: {inner}")] pub struct OpenConfig { pub filename: PathBuf, pub inner: std::io::Error, @@ -339,18 +337,20 @@ pub enum NewEmbedderErrorKind { 
UnsupportedModel(UnsupportedModel), #[error(transparent)] OpenTokenizer(OpenTokenizer), - #[error("could not build weights from Pytorch weights: {0}")] + #[error("could not build weights from Pytorch weights:\n - {0}")] PytorchWeight(candle_core::Error), - #[error("could not build weights from Safetensor weights: {0}")] + #[error("could not build weights from Safetensor weights:\n - {0}")] SafetensorWeight(candle_core::Error), - #[error("could not spawn HG_HUB API client: {0}")] + #[error("could not spawn HG_HUB API client:\n - {0}")] NewApiFail(ApiError), - #[error("fetching file from HG_HUB failed: {0}")] + #[error("fetching file from HG_HUB failed:\n - {0}")] ApiGet(ApiError), - #[error("could not determine model dimensions: test embedding failed with {0}")] + #[error("could not determine model dimensions:\n - test embedding failed with {0}")] CouldNotDetermineDimension(EmbedError), - #[error("loading model failed: {0}")] + #[error("loading model failed:\n - {0}")] LoadModel(candle_core::Error), + #[error("{0}")] + CouldNotParseTemplate(String), } pub struct PossibleEmbeddingMistakes { diff --git a/milli/src/vector/json_template.rs b/milli/src/vector/json_template.rs new file mode 100644 index 000000000..454f23251 --- /dev/null +++ b/milli/src/vector/json_template.rs @@ -0,0 +1,970 @@ +//! Module to manipulate JSON templates. +//! +//! This module allows two main operations: +//! 1. Render JSON values from a template and a context value. +//! 2. Retrieve data from a template and JSON values. + +#![warn(rustdoc::broken_intra_doc_links)] +#![warn(missing_docs)] + +use serde::Deserialize; +use serde_json::{Map, Value}; + +type ValuePath = Vec; + +/// Encapsulates a JSON template and allows injecting and extracting values from it. 
+#[derive(Debug)] +pub struct ValueTemplate { + template: Value, + value_kind: ValueKind, +} + +#[derive(Debug)] +enum ValueKind { + Single(ValuePath), + Array(ArrayPath), +} + +#[derive(Debug)] +struct ArrayPath { + repeated_value: Value, + path_to_array: ValuePath, + value_path_in_array: ValuePath, +} + +/// Component of a path to a Value +#[derive(Debug, Clone)] +pub enum PathComponent { + /// A key inside of an object + MapKey(String), + /// An index inside of an array + ArrayIndex(usize), +} + +impl PartialEq for PathComponent { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0, + (Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0, + _ => false, + } + } +} + +impl Eq for PathComponent {} + +/// Error that occurs when no value was provided to a template for injection. +#[derive(Debug)] +pub struct MissingValue; + +/// Error that occurs when trying to parse a template in [`ValueTemplate::new`] +#[derive(Debug)] +pub enum TemplateParsingError { + /// A repeat string appears inside a repeated value + NestedRepeatString(ValuePath), + /// A repeat string appears outside of an array + RepeatStringNotInArray(ValuePath), + /// A repeat string appears in an array, but not in the second position + BadIndexForRepeatString(ValuePath, usize), + /// A repeated value lacks a placeholder + MissingPlaceholderInRepeatedValue(ValuePath), + /// Multiple repeat strings appear in the template + MultipleRepeatString(ValuePath, ValuePath), + /// Multiple placeholder strings appear in the template + MultiplePlaceholderString(ValuePath, ValuePath), + /// No placeholder string appears in the template + MissingPlaceholderString, + /// A placeholder appears both inside a repeated value and outside of it + BothArrayAndSingle { + /// Path to the single value + single_path: ValuePath, + /// Path to the array of repeated values + path_to_array: ValuePath, + /// Path to placeholder inside each repeated value, starting from
the array + array_to_placeholder: ValuePath, + }, +} + +impl TemplateParsingError { + /// Produce an error message from the error kind, the name of the root object, the placeholder string and the repeat string + pub fn error_message(&self, root: &str, placeholder: &str, repeat: &str) -> String { + match self { + TemplateParsingError::NestedRepeatString(path) => { + format!( + r#"in {}: "{repeat}" appears nested inside of a value that is itself repeated"#, + path_with_root(root, path) + ) + } + TemplateParsingError::RepeatStringNotInArray(path) => format!( + r#"in {}: "{repeat}" appears outside of an array"#, + path_with_root(root, path) + ), + TemplateParsingError::BadIndexForRepeatString(path, index) => format!( + r#"in {}: "{repeat}" expected at position #1, but found at position #{index}"#, + path_with_root(root, path) + ), + TemplateParsingError::MissingPlaceholderInRepeatedValue(path) => format!( + r#"in {}: Expected "{placeholder}" inside of the repeated value"#, + path_with_root(root, path) + ), + TemplateParsingError::MultipleRepeatString(current, previous) => format!( + r#"in {}: Found "{repeat}", but it was already present in {}"#, + path_with_root(root, current), + path_with_root(root, previous) + ), + TemplateParsingError::MultiplePlaceholderString(current, previous) => format!( + r#"in {}: Found "{placeholder}", but it was already present in {}"#, + path_with_root(root, current), + path_with_root(root, previous) + ), + TemplateParsingError::MissingPlaceholderString => { + format!(r#"in `{root}`: "{placeholder}" not found"#) + } + TemplateParsingError::BothArrayAndSingle { + single_path, + path_to_array, + array_to_placeholder, + } => { + let path_to_first_repeated = path_to_array + .iter() + .chain(std::iter::once(&PathComponent::ArrayIndex(0))) + .chain(array_to_placeholder.iter()); + format!( + r#"in {}: Found "{placeholder}", but it was already present in {} (repeated)"#, + path_with_root(root, single_path), + path_with_root(root, 
path_to_first_repeated) + ) + } + } + } + + fn prepend_path(self, mut prepended_path: ValuePath) -> Self { + match self { + TemplateParsingError::NestedRepeatString(mut path) => { + prepended_path.append(&mut path); + TemplateParsingError::NestedRepeatString(prepended_path) + } + TemplateParsingError::RepeatStringNotInArray(mut path) => { + prepended_path.append(&mut path); + TemplateParsingError::RepeatStringNotInArray(prepended_path) + } + TemplateParsingError::BadIndexForRepeatString(mut path, index) => { + prepended_path.append(&mut path); + TemplateParsingError::BadIndexForRepeatString(prepended_path, index) + } + TemplateParsingError::MissingPlaceholderInRepeatedValue(mut path) => { + prepended_path.append(&mut path); + TemplateParsingError::MissingPlaceholderInRepeatedValue(prepended_path) + } + TemplateParsingError::MultipleRepeatString(mut path, older_path) => { + let older_prepended_path = + prepended_path.iter().cloned().chain(older_path).collect(); + prepended_path.append(&mut path); + TemplateParsingError::MultipleRepeatString(prepended_path, older_prepended_path) + } + TemplateParsingError::MultiplePlaceholderString(mut path, older_path) => { + let older_prepended_path = + prepended_path.iter().cloned().chain(older_path).collect(); + prepended_path.append(&mut path); + TemplateParsingError::MultiplePlaceholderString( + prepended_path, + older_prepended_path, + ) + } + TemplateParsingError::MissingPlaceholderString => { + TemplateParsingError::MissingPlaceholderString + } + TemplateParsingError::BothArrayAndSingle { + single_path, + mut path_to_array, + array_to_placeholder, + } => { + // note, this case is not super logical, but is also likely to be dead code + let single_prepended_path = + prepended_path.iter().cloned().chain(single_path).collect(); + prepended_path.append(&mut path_to_array); + // we don't prepend the array_to_placeholder path as it is the array path that is prepended + TemplateParsingError::BothArrayAndSingle { + single_path: 
single_prepended_path, + path_to_array: prepended_path, + array_to_placeholder, + } + } + } + } +} + +/// Error that occurs when [`ValueTemplate::extract`] fails. +#[derive(Debug)] +pub struct ExtractionError { + /// The cause of the failure + pub kind: ExtractionErrorKind, + /// The context where the failure happened: the operation that failed + pub context: ExtractionErrorContext, +} + +impl ExtractionError { + /// Produce an error message from the error, the name of the root object, the placeholder string and the expected value type + pub fn error_message( + &self, + root: &str, + placeholder: &str, + expected_value_type: &str, + ) -> String { + let context = match &self.context { + ExtractionErrorContext::ExtractingSingleValue => { + format!(r#"extracting a single "{placeholder}""#) + } + ExtractionErrorContext::FindingPathToArray => { + format!(r#"extracting the array of "{placeholder}"s"#) + } + ExtractionErrorContext::ExtractingArrayItem(index) => { + format!(r#"extracting item #{index} from the array of "{placeholder}"s"#) + } + }; + match &self.kind { + ExtractionErrorKind::MissingPathComponent { missing_index, path, key_suggestion } => { + let last_named_object = last_named_object(root, path.iter().take(*missing_index)); + format!( + "in {}, while {context}, configuration expects {}, which is missing in response{}", + path_with_root(root, path.iter().take(*missing_index)), + missing_component(path.get(*missing_index)), + match key_suggestion { + Some(key_suggestion) => format!("\n - Hint: {last_named_object} has key `{key_suggestion}`, did you mean {} in embedder configuration?", + path_with_root(root, path.iter().take(*missing_index).chain(std::iter::once(&PathComponent::MapKey(key_suggestion.to_owned()))))), + None => "".to_owned(), + } + ) + } + ExtractionErrorKind::WrongPathComponent { wrong_component, index, path } => { + let last_named_object = last_named_object(root, path.iter().take(*index)); + format!( + "in {}, while {context}, configuration 
expects {last_named_object} to be {} but server sent {wrong_component}", + path_with_root(root, path.iter().take(*index)), + expected_component(path.get(*index)) + ) + } + ExtractionErrorKind::DeserializationError { error, path } => { + let last_named_object = last_named_object(root, path); + format!( + "in {}, while {context}, expected {last_named_object} to be {expected_value_type}, but failed to parse server response:\n - {error}", + path_with_root(root, path) + ) + } + } + } +} + +fn missing_component(component: Option<&PathComponent>) -> String { + match component { + Some(PathComponent::ArrayIndex(index)) => { + format!(r#"item #{index}"#) + } + Some(PathComponent::MapKey(key)) => { + format!(r#"key "{key}""#) + } + None => "unknown".to_string(), + } +} + +fn expected_component(component: Option<&PathComponent>) -> String { + match component { + Some(PathComponent::ArrayIndex(index)) => { + format!(r#"an array with at least {} item(s)"#, index.saturating_add(1)) + } + Some(PathComponent::MapKey(key)) => { + format!("an object with key `{}`", key) + } + None => "unknown".to_string(), + } +} + +fn last_named_object<'a>( + root: &'a str, + path: impl IntoIterator + 'a, +) -> LastNamedObject<'a> { + let mut last_named_object = LastNamedObject::Object { name: root }; + for component in path.into_iter() { + last_named_object = match (component, last_named_object) { + (PathComponent::MapKey(name), _) => LastNamedObject::Object { name }, + (PathComponent::ArrayIndex(index), LastNamedObject::Object { name }) => { + LastNamedObject::ArrayInsideObject { object_name: name, index: *index } + } + ( + PathComponent::ArrayIndex(index), + LastNamedObject::ArrayInsideObject { object_name, index: _ }, + ) => LastNamedObject::NestedArrayInsideObject { + object_name, + index: *index, + nesting_level: 0, + }, + ( + PathComponent::ArrayIndex(index), + LastNamedObject::NestedArrayInsideObject { object_name, index: _, nesting_level }, + ) => LastNamedObject::NestedArrayInsideObject { 
+ object_name, + index: *index, + nesting_level: nesting_level.saturating_add(1), + }, + } + } + last_named_object +} + +impl<'a> std::fmt::Display for LastNamedObject<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LastNamedObject::Object { name } => write!(f, "`{name}`"), + LastNamedObject::ArrayInsideObject { object_name, index } => { + write!(f, "item #{index} inside `{object_name}`") + } + LastNamedObject::NestedArrayInsideObject { object_name, index, nesting_level } => { + if *nesting_level == 0 { + write!(f, "item #{index} inside nested array in `{object_name}`") + } else { + write!(f, "item #{index} inside nested array ({} levels of nesting) in `{object_name}`", nesting_level + 1) + } + } + } + } +} + +#[derive(Debug, Clone, Copy)] +enum LastNamedObject<'a> { + Object { name: &'a str }, + ArrayInsideObject { object_name: &'a str, index: usize }, + NestedArrayInsideObject { object_name: &'a str, index: usize, nesting_level: usize }, +} + +/// Builds a string representation of a path, prepending the name of the root value.
+pub fn path_with_root<'a>( + root: &str, + path: impl IntoIterator + 'a, +) -> String { + use std::fmt::Write as _; + let mut res = format!("`{root}"); + for component in path.into_iter() { + match component { + PathComponent::MapKey(key) => { + let _ = write!(&mut res, ".{key}"); + } + PathComponent::ArrayIndex(index) => { + let _ = write!(&mut res, "[{index}]"); + } + } + } + res.push('`'); + res +} + +/// Context where an extraction failure happened +/// +/// The operation that failed +#[derive(Debug, Clone, Copy)] +pub enum ExtractionErrorContext { + /// Failure happened while extracting a value at a single location + ExtractingSingleValue, + /// Failure happened while extracting an array of values + FindingPathToArray, + /// Failure happened while extracting a value inside of an array + ExtractingArrayItem(usize), +} + +/// Kind of errors that can happen during extraction +#[derive(Debug)] +pub enum ExtractionErrorKind { + /// An expected path component is missing + MissingPathComponent { + /// Index of the missing component in the path + missing_index: usize, + /// Path where a component is missing + path: ValuePath, + /// Possible matching key in object + key_suggestion: Option, + }, + /// An expected path component cannot be found because its container is the wrong type + WrongPathComponent { + /// String representation of the wrong component + wrong_component: String, + /// Index of the wrong component in the path + index: usize, + /// Path where a component has the wrong type + path: ValuePath, + }, + /// Could not deserialize an extracted value to its requested type + DeserializationError { + /// Inner deserialization error + error: serde_json::Error, + /// Path to extracted value + path: ValuePath, + }, +} + +enum ArrayParsingContext<'a> { + Nested, + NotNested(&'a mut Option), +} + +impl ValueTemplate { + /// Prepare a template for injection or extraction. + /// + /// # Parameters + /// + /// - `template`: JSON value that acts as a template.
Its placeholder values will be replaced by actual values during injection, + /// and actual values will be recovered from their location during extraction. + /// - `placeholder_string`: Value that a JSON string should assume to act as a placeholder value that can be injected into or + /// extracted from. + /// - `repeat_string`: Sentinel value that can be placed as the second value in an array to indicate that the first value can be repeated + /// any number of times. The first value should contain exactly one placeholder string. + /// + /// # Errors + /// + /// - [`TemplateParsingError`]: refer to the documentation of this type + pub fn new( + template: Value, + placeholder_string: &str, + repeat_string: &str, + ) -> Result { + let mut value_path = None; + let mut array_path = None; + let mut current_path = Vec::new(); + Self::parse_value( + &template, + placeholder_string, + repeat_string, + &mut value_path, + &mut ArrayParsingContext::NotNested(&mut array_path), + &mut current_path, + )?; + + let value_kind = match (array_path, value_path) { + (None, None) => return Err(TemplateParsingError::MissingPlaceholderString), + (None, Some(value_path)) => ValueKind::Single(value_path), + (Some(array_path), None) => ValueKind::Array(array_path), + (Some(array_path), Some(value_path)) => { + return Err(TemplateParsingError::BothArrayAndSingle { + single_path: value_path, + path_to_array: array_path.path_to_array, + array_to_placeholder: array_path.value_path_in_array, + }) + } + }; + + Ok(Self { template, value_kind }) + } + + /// Whether there is a placeholder that can be repeated. + /// + /// - During injection, all values are injected in the array placeholder, + /// - During extraction, all repeatable placeholders are extracted from the array. + pub fn has_array_value(&self) -> bool { + matches!(self.value_kind, ValueKind::Array(_)) + } + + /// Render a value from the template and context values. 
+ /// + /// # Error + /// + /// - [`MissingValue`]: if the number of injected values is 0. + pub fn inject(&self, values: impl IntoIterator) -> Result { + let mut rendered = self.template.clone(); + let mut values = values.into_iter(); + + match &self.value_kind { + ValueKind::Single(injection_path) => { + let Some(injected_value) = values.next() else { return Err(MissingValue) }; + inject_value(&mut rendered, injection_path, injected_value); + } + ValueKind::Array(ArrayPath { repeated_value, path_to_array, value_path_in_array }) => { + // 1. build the array of repeated values + let mut array = Vec::new(); + for injected_value in values { + let mut repeated_value = repeated_value.clone(); + inject_value(&mut repeated_value, value_path_in_array, injected_value); + array.push(repeated_value); + } + + if array.is_empty() { + return Err(MissingValue); + } + // 2. inject at the injection point in the rendered value + inject_value(&mut rendered, path_to_array, Value::Array(array)); + } + } + + Ok(rendered) + } + + /// Extract sub values from the template and a value. + /// + /// # Errors + /// + /// - if a single placeholder is missing. 
+ /// - if there is no value corresponding to an array placeholder + /// - if the value corresponding to an array placeholder is not an array + pub fn extract(&self, mut value: Value) -> Result, ExtractionError> + where + T: for<'de> Deserialize<'de>, + { + Ok(match &self.value_kind { + ValueKind::Single(extraction_path) => { + let extracted_value = + extract_value(extraction_path, &mut value).with_context(|kind| { + ExtractionError { + kind, + context: ExtractionErrorContext::ExtractingSingleValue, + } + })?; + vec![extracted_value] + } + ValueKind::Array(ArrayPath { + repeated_value: _, + path_to_array, + value_path_in_array, + }) => { + // get the array + let array = extract_value(path_to_array, &mut value).with_context(|kind| { + ExtractionError { kind, context: ExtractionErrorContext::FindingPathToArray } + })?; + let array = match array { + Value::Array(array) => array, + not_array => { + let mut path = path_to_array.clone(); + path.push(PathComponent::ArrayIndex(0)); + return Err(ExtractionError { + kind: ExtractionErrorKind::WrongPathComponent { + wrong_component: format_value(¬_array), + index: path_to_array.len(), + path, + }, + context: ExtractionErrorContext::FindingPathToArray, + }); + } + }; + let mut extracted_values = Vec::with_capacity(array.len()); + + for (index, mut item) in array.into_iter().enumerate() { + let extracted_value = extract_value(value_path_in_array, &mut item) + .with_context(|kind| ExtractionError { + kind, + context: ExtractionErrorContext::ExtractingArrayItem(index), + })?; + extracted_values.push(extracted_value); + } + + extracted_values + } + }) + } + + fn parse_array( + array: &[Value], + placeholder_string: &str, + repeat_string: &str, + value_path: &mut Option, + mut array_path: &mut ArrayParsingContext, + current_path: &mut ValuePath, + ) -> Result<(), TemplateParsingError> { + // two modes for parsing array. + match array { + // 1. array contains a repeat string in second position + [first, second, rest @ ..] 
if second == repeat_string => { + let ArrayParsingContext::NotNested(array_path) = &mut array_path else { + return Err(TemplateParsingError::NestedRepeatString(current_path.clone())); + }; + if let Some(array_path) = array_path { + return Err(TemplateParsingError::MultipleRepeatString( + current_path.clone(), + array_path.path_to_array.clone(), + )); + } + if first == repeat_string { + return Err(TemplateParsingError::BadIndexForRepeatString( + current_path.clone(), + 0, + )); + } + if let Some(position) = rest.iter().position(|value| value == repeat_string) { + let position = position + 2; + return Err(TemplateParsingError::BadIndexForRepeatString( + current_path.clone(), + position, + )); + } + + let value_path_in_array = { + let mut value_path = None; + let mut current_path_in_array = Vec::new(); + + Self::parse_value( + first, + placeholder_string, + repeat_string, + &mut value_path, + &mut ArrayParsingContext::Nested, + &mut current_path_in_array, + ) + .map_err(|error| error.prepend_path(current_path.to_vec()))?; + + value_path.ok_or_else(|| { + let mut repeated_value_path = current_path.clone(); + repeated_value_path.push(PathComponent::ArrayIndex(0)); + TemplateParsingError::MissingPlaceholderInRepeatedValue(repeated_value_path) + })? + }; + **array_path = Some(ArrayPath { + repeated_value: first.to_owned(), + path_to_array: current_path.clone(), + value_path_in_array, + }); + } + // 2. 
array does not contain a repeat string + array => { + if let Some(position) = array.iter().position(|value| value == repeat_string) { + return Err(TemplateParsingError::BadIndexForRepeatString( + current_path.clone(), + position, + )); + } + for (index, value) in array.iter().enumerate() { + current_path.push(PathComponent::ArrayIndex(index)); + Self::parse_value( + value, + placeholder_string, + repeat_string, + value_path, + array_path, + current_path, + )?; + current_path.pop(); + } + } + } + Ok(()) + } + + fn parse_object( + object: &Map, + placeholder_string: &str, + repeat_string: &str, + value_path: &mut Option, + array_path: &mut ArrayParsingContext, + current_path: &mut ValuePath, + ) -> Result<(), TemplateParsingError> { + for (key, value) in object.iter() { + current_path.push(PathComponent::MapKey(key.to_owned())); + Self::parse_value( + value, + placeholder_string, + repeat_string, + value_path, + array_path, + current_path, + )?; + current_path.pop(); + } + Ok(()) + } + + fn parse_value( + value: &Value, + placeholder_string: &str, + repeat_string: &str, + value_path: &mut Option, + array_path: &mut ArrayParsingContext, + current_path: &mut ValuePath, + ) -> Result<(), TemplateParsingError> { + match value { + Value::String(str) => { + if placeholder_string == str { + if let Some(value_path) = value_path { + return Err(TemplateParsingError::MultiplePlaceholderString( + current_path.clone(), + value_path.clone(), + )); + } + + *value_path = Some(current_path.clone()); + } + if repeat_string == str { + return Err(TemplateParsingError::RepeatStringNotInArray(current_path.clone())); + } + } + Value::Null | Value::Bool(_) | Value::Number(_) => {} + Value::Array(array) => Self::parse_array( + array, + placeholder_string, + repeat_string, + value_path, + array_path, + current_path, + )?, + Value::Object(object) => Self::parse_object( + object, + placeholder_string, + repeat_string, + value_path, + array_path, + current_path, + )?, + } + Ok(()) + } +} + +fn 
inject_value(rendered: &mut Value, injection_path: &Vec, injected_value: Value) { + let mut current_value = rendered; + for injection_component in injection_path { + current_value = match injection_component { + PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(), + PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(), + } + } + *current_value = injected_value; +} + +fn format_value(value: &Value) -> String { + match value { + Value::Array(array) => format!("an array of size {}", array.len()), + Value::Object(object) => { + format!("an object with {} field(s)", object.len()) + } + value => value.to_string(), + } +} + +fn extract_value( + extraction_path: &[PathComponent], + initial_value: &mut Value, +) -> Result +where + T: for<'de> Deserialize<'de>, +{ + let mut current_value = initial_value; + for (path_index, extraction_component) in extraction_path.iter().enumerate() { + current_value = { + match extraction_component { + PathComponent::MapKey(key) => { + if !current_value.is_object() { + return Err(ExtractionErrorKind::WrongPathComponent { + wrong_component: format_value(current_value), + index: path_index, + path: extraction_path.to_vec(), + }); + } + if let Some(object) = current_value.as_object_mut() { + if !object.contains_key(key) { + let typos = + levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true) + .build_dfa(key); + let mut key_suggestion = None; + 'check_typos: for (key, _) in object.iter() { + match typos.eval(key) { + levenshtein_automata::Distance::Exact(0) => { /* ??? 
*/ } + levenshtein_automata::Distance::Exact(_) => { + key_suggestion = Some(key.to_owned()); + break 'check_typos; + } + levenshtein_automata::Distance::AtLeast(_) => continue, + } + } + return Err(ExtractionErrorKind::MissingPathComponent { + missing_index: path_index, + path: extraction_path.to_vec(), + key_suggestion, + }); + } + if let Some(value) = object.get_mut(key) { + value + } else { + // borrow checking limit: the borrow checker cannot be convinced that `object` is no longer mutably borrowed on the + // `else` branch of the `if let`, so we cannot return MissingPathComponent here. + // As a workaround, we checked that the object does not contain the key above, making this `else` unreachable. + unreachable!() + } + } else { + // borrow checking limit: the borrow checker cannot be convinced that `current_value` is no longer mutably borrowed + // on the `else` branch of the `if let`, so we cannot return WrongPathComponent here. + // As a workaround, we checked that the value was not a map above, making this `else` unreachable. 
+ unreachable!() + } + } + PathComponent::ArrayIndex(index) => { + if !current_value.is_array() { + return Err(ExtractionErrorKind::WrongPathComponent { + wrong_component: format_value(current_value), + index: path_index, + path: extraction_path.to_vec(), + }); + } + match current_value.get_mut(index) { + Some(value) => value, + None => { + return Err(ExtractionErrorKind::MissingPathComponent { + missing_index: path_index, + path: extraction_path.to_vec(), + key_suggestion: None, + }); + } + } + } + } + }; + } + serde_json::from_value(current_value.take()).map_err(|error| { + ExtractionErrorKind::DeserializationError { error, path: extraction_path.to_vec() } + }) +} + +trait ExtractionResultErrorContext { + fn with_context(self, f: F) -> Result + where + F: FnOnce(ExtractionErrorKind) -> ExtractionError; +} + +impl ExtractionResultErrorContext for Result { + fn with_context(self, f: F) -> Result + where + F: FnOnce(ExtractionErrorKind) -> ExtractionError, + { + match self { + Ok(t) => Ok(t), + Err(kind) => Err(f(kind)), + } + } +} + +#[cfg(test)] +mod test { + use serde_json::{json, Value}; + + use super::{PathComponent, TemplateParsingError, ValueTemplate}; + + fn new_template(template: Value) -> Result { + ValueTemplate::new(template, "{{text}}", "{{..}}") + } + + #[test] + fn empty_template() { + let template = json!({ + "toto": "no template at all", + "titi": ["this", "will", "not", "work"], + "tutu": null + }); + + let error = new_template(template.clone()).unwrap_err(); + assert!(matches!(error, TemplateParsingError::MissingPlaceholderString)) + } + + #[test] + fn single_template() { + let template = json!({ + "toto": "text", + "titi": ["this", "will", "still", "{{text}}"], + "tutu": null + }); + + let basic = new_template(template.clone()).unwrap(); + + assert!(!basic.has_array_value()); + + assert_eq!( + basic.inject(vec!["work".into(), Value::Null, "test".into()]).unwrap(), + json!({ + "toto": "text", + "titi": ["this", "will", "still", "work"], + "tutu": 
null + }) + ); + } + + #[test] + fn too_many_placeholders() { + let template = json!({ + "toto": "{{text}}", + "titi": ["this", "will", "still", "{{text}}"], + "tutu": "text" + }); + + match new_template(template.clone()) { + Err(TemplateParsingError::MultiplePlaceholderString(left, right)) => { + assert_eq!( + left, + vec![PathComponent::MapKey("titi".into()), PathComponent::ArrayIndex(3)] + ); + + assert_eq!(right, vec![PathComponent::MapKey("toto".into())]) + } + _ => panic!("should error"), + } + } + + #[test] + fn dynamic_template() { + let template = json!({ + "toto": "text", + "titi": [{ + "type": "text", + "data": "{{text}}" + }, "{{..}}"], + "tutu": null + }); + + let basic = new_template(template.clone()).unwrap(); + + assert!(basic.has_array_value()); + + let injected_values = vec![ + "work".into(), + Value::Null, + 42.into(), + "test".into(), + "tata".into(), + "titi".into(), + "tutu".into(), + ]; + + let rendered = basic.inject(injected_values.clone()).unwrap(); + + assert_eq!( + rendered, + json!({ + "toto": "text", + "titi": [ + { + "type": "text", + "data": "work" + }, + { + "type": "text", + "data": Value::Null + }, + { + "type": "text", + "data": 42 + }, + { + "type": "text", + "data": "test" + }, + { + "type": "text", + "data": "tata" + }, + { + "type": "text", + "data": "titi" + }, + { + "type": "text", + "data": "tutu" + } + ], + "tutu": null + }) + ); + + let extracted_values: Vec = basic.extract(rendered).unwrap(); + assert_eq!(extracted_values, injected_values); + } +} diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index bfe99149b..a1c937d24 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -11,6 +11,7 @@ use crate::ThreadPoolNoAbort; pub mod error; pub mod hf; +pub mod json_template; pub mod manual; pub mod openai; pub mod parsed_vectors; @@ -227,7 +228,9 @@ impl Embedder { EmbedderOptions::UserProvided(options) => { Self::UserProvided(manual::Embedder::new(options)) } - EmbedderOptions::Rest(options) => 
Self::Rest(rest::Embedder::new(options)?), + EmbedderOptions::Rest(options) => { + Self::Rest(rest::Embedder::new(options, rest::ConfigurationSource::User)?) + } }) } diff --git a/milli/src/vector/ollama.rs b/milli/src/vector/ollama.rs index 2c29cc816..84baac1ba 100644 --- a/milli/src/vector/ollama.rs +++ b/milli/src/vector/ollama.rs @@ -28,19 +28,22 @@ impl EmbedderOptions { impl Embedder { pub fn new(options: EmbedderOptions) -> Result { let model = options.embedding_model.as_str(); - let rest_embedder = match RestEmbedder::new(RestEmbedderOptions { - api_key: options.api_key, - dimensions: None, - distribution: options.distribution, - url: options.url.unwrap_or_else(get_ollama_path), - query: serde_json::json!({ - "model": model, - }), - input_field: vec!["prompt".to_owned()], - path_to_embeddings: Default::default(), - embedding_object: vec!["embedding".to_owned()], - input_type: super::rest::InputType::Text, - }) { + let rest_embedder = match RestEmbedder::new( + RestEmbedderOptions { + api_key: options.api_key, + dimensions: None, + distribution: options.distribution, + url: options.url.unwrap_or_else(get_ollama_path), + request: serde_json::json!({ + "model": model, + "prompt": super::rest::REQUEST_PLACEHOLDER, + }), + response: serde_json::json!({ + "embedding": super::rest::RESPONSE_PLACEHOLDER, + }), + }, + super::rest::ConfigurationSource::Ollama, + ) { Ok(embedder) => embedder, Err(NewEmbedderError { kind: diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index ade9e51fc..514ad4a3b 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -26,20 +26,21 @@ impl EmbedderOptions { } } - pub fn query(&self) -> serde_json::Value { + pub fn request(&self) -> serde_json::Value { let model = self.embedding_model.name(); - let mut query = serde_json::json!({ + let mut request = serde_json::json!({ "model": model, + "input": [super::rest::REQUEST_PLACEHOLDER, super::rest::REPEAT_PLACEHOLDER] }); if 
self.embedding_model.supports_overriding_dimensions() { if let Some(dimensions) = self.dimensions { - query["dimensions"] = dimensions.into(); + request["dimensions"] = dimensions.into(); } } - query + request } pub fn distribution(&self) -> Option { @@ -180,17 +181,23 @@ impl Embedder { let url = options.url.as_deref().unwrap_or(OPENAI_EMBEDDINGS_URL).to_owned(); - let rest_embedder = RestEmbedder::new(RestEmbedderOptions { - api_key: Some(api_key.clone()), - distribution: None, - dimensions: Some(options.dimensions()), - url, - query: options.query(), - input_field: vec!["input".to_owned()], - input_type: crate::vector::rest::InputType::TextArray, - path_to_embeddings: vec!["data".to_owned()], - embedding_object: vec!["embedding".to_owned()], - })?; + let rest_embedder = RestEmbedder::new( + RestEmbedderOptions { + api_key: Some(api_key.clone()), + distribution: None, + dimensions: Some(options.dimensions()), + url, + request: options.request(), + response: serde_json::json!({ + "data": [{ + "embedding": super::rest::RESPONSE_PLACEHOLDER + }, + super::rest::REPEAT_PLACEHOLDER + ] + }), + }, + super::rest::ConfigurationSource::OpenAi, + )?; // looking at the code it is very unclear that this can actually fail. let tokenizer = tiktoken_rs::cl100k_base().unwrap(); @@ -201,7 +208,7 @@ impl Embedder { pub fn embed(&self, texts: Vec) -> Result>, EmbedError> { match self.rest_embedder.embed_ref(&texts) { Ok(embeddings) => Ok(embeddings), - Err(EmbedError { kind: EmbedErrorKind::RestBadRequest(error), fault: _ }) => { + Err(EmbedError { kind: EmbedErrorKind::RestBadRequest(error, _), fault: _ }) => { tracing::warn!(error=?error, "OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. 
For best performance, limit the size of your document template."); self.try_embed_tokenized(&texts) } @@ -225,7 +232,7 @@ impl Embedder { let embedding = self.rest_embedder.embed_tokens(tokens)?; embeddings_for_prompt.append(embedding.into_inner()).map_err(|got| { - EmbedError::openai_unexpected_dimension(self.dimensions(), got.len()) + EmbedError::rest_unexpected_dimension(self.dimensions(), got.len()) })?; all_embeddings.push(embeddings_for_prompt); diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index b651cba63..35a7ebc41 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -4,6 +4,7 @@ use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use serde::{Deserialize, Serialize}; use super::error::EmbedErrorKind; +use super::json_template::ValueTemplate; use super::{ DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM, }; @@ -11,12 +12,18 @@ use crate::error::FaultSource; use crate::ThreadPoolNoAbort; // retrying in case of failure - pub struct Retry { pub error: EmbedError, strategy: RetryStrategy, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ConfigurationSource { + OpenAi, + Ollama, + User, +} + pub enum RetryStrategy { GiveUp, Retry, @@ -63,10 +70,20 @@ impl Retry { #[derive(Debug)] pub struct Embedder { - client: ureq::Agent, - options: EmbedderOptions, - bearer: Option, + data: EmbedderData, dimensions: usize, + distribution: Option, +} + +/// All data needed to perform requests and parse responses +#[derive(Debug)] +struct EmbedderData { + client: ureq::Agent, + bearer: Option, + url: String, + request: Request, + response: Response, + configuration_source: ConfigurationSource, } #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] @@ -75,29 +92,8 @@ pub struct EmbedderOptions { pub distribution: Option, pub dimensions: Option, pub url: String, - pub query: serde_json::Value, - pub input_field: Vec, - // path to the array of embeddings - pub 
path_to_embeddings: Vec, - // shape of a single embedding - pub embedding_object: Vec, - pub input_type: InputType, -} - -impl Default for EmbedderOptions { - fn default() -> Self { - Self { - url: Default::default(), - query: Default::default(), - input_field: vec!["input".into()], - path_to_embeddings: vec!["data".into()], - embedding_object: vec!["embedding".into()], - input_type: InputType::Text, - api_key: None, - distribution: None, - dimensions: None, - } - } + pub request: serde_json::Value, + pub response: serde_json::Value, } impl std::hash::Hash for EmbedderOptions { @@ -106,26 +102,25 @@ impl std::hash::Hash for EmbedderOptions { self.distribution.hash(state); self.dimensions.hash(state); self.url.hash(state); - // skip hashing the query + // skip hashing the request and response // collisions in regular usage should be minimal, // and the list is limited to 256 values anyway - self.input_field.hash(state); - self.path_to_embeddings.hash(state); - self.embedding_object.hash(state); - self.input_type.hash(state); } } #[derive(Debug, Clone, Copy, Deserialize, Serialize, PartialEq, Eq, Hash, Deserr)] #[serde(rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] -pub enum InputType { +enum InputType { Text, TextArray, } impl Embedder { - pub fn new(options: EmbedderOptions) -> Result { + pub fn new( + options: EmbedderOptions, + configuration_source: ConfigurationSource, + ) -> Result { let bearer = options.api_key.as_deref().map(|api_key| format!("Bearer {api_key}")); let client = ureq::AgentBuilder::new() @@ -133,28 +128,40 @@ impl Embedder { .max_idle_connections_per_host(REQUEST_PARALLELISM * 2) .build(); + let request = Request::new(options.request)?; + let response = Response::new(options.response, &request)?; + + let data = EmbedderData { + client, + bearer, + url: options.url, + request, + response, + configuration_source, + }; + let dimensions = if let Some(dimensions) = options.dimensions { dimensions } else { - 
infer_dimensions(&client, &options, bearer.as_deref())? + infer_dimensions(&data)? }; - Ok(Self { client, dimensions, options, bearer }) + Ok(Self { data, dimensions, distribution: options.distribution }) } pub fn embed(&self, texts: Vec) -> Result>, EmbedError> { - embed(&self.client, &self.options, self.bearer.as_deref(), texts.as_slice(), texts.len()) + embed(&self.data, texts.as_slice(), texts.len(), Some(self.dimensions)) } pub fn embed_ref(&self, texts: &[S]) -> Result>, EmbedError> where S: AsRef + Serialize, { - embed(&self.client, &self.options, self.bearer.as_deref(), texts, texts.len()) + embed(&self.data, texts, texts.len(), Some(self.dimensions)) } pub fn embed_tokens(&self, tokens: &[usize]) -> Result, EmbedError> { - let mut embeddings = embed(&self.client, &self.options, self.bearer.as_deref(), tokens, 1)?; + let mut embeddings = embed(&self.data, tokens, 1, Some(self.dimensions))?; // unwrap: guaranteed that embeddings.len() == 1, otherwise the previous line terminated in error Ok(embeddings.pop().unwrap()) } @@ -179,7 +186,7 @@ impl Embedder { } pub fn prompt_count_in_chunk_hint(&self) -> usize { - match self.options.input_type { + match self.data.request.input_type() { InputType::Text => 1, InputType::TextArray => 10, } @@ -190,87 +197,44 @@ impl Embedder { } pub fn distribution(&self) -> Option { - self.options.distribution + self.distribution } } -fn infer_dimensions( - client: &ureq::Agent, - options: &EmbedderOptions, - bearer: Option<&str>, -) -> Result { - let v = embed(client, options, bearer, ["test"].as_slice(), 1) +fn infer_dimensions(data: &EmbedderData) -> Result { + let v = embed(data, ["test"].as_slice(), 1, None) .map_err(NewEmbedderError::could_not_determine_dimension)?; // unwrap: guaranteed that v.len() == 1, otherwise the previous line terminated in error Ok(v.first().unwrap().dimension()) } fn embed( - client: &ureq::Agent, - options: &EmbedderOptions, - bearer: Option<&str>, + data: &EmbedderData, inputs: &[S], 
expected_count: usize, + expected_dimension: Option, ) -> Result>, EmbedError> where S: Serialize, { - let request = client.post(&options.url); - let request = - if let Some(bearer) = bearer { request.set("Authorization", bearer) } else { request }; + let request = data.client.post(&data.url); + let request = if let Some(bearer) = &data.bearer { + request.set("Authorization", bearer) + } else { + request + }; let request = request.set("Content-Type", "application/json"); - let input_value = match options.input_type { - InputType::Text => serde_json::json!(inputs.first()), - InputType::TextArray => serde_json::json!(inputs), - }; - - let body = match options.input_field.as_slice() { - [] => { - // inject input in body - input_value - } - [input] => { - let mut body = options.query.clone(); - - body.as_object_mut() - .ok_or_else(|| { - EmbedError::rest_not_an_object( - options.query.clone(), - options.input_field.clone(), - ) - })? - .insert(input.clone(), input_value); - body - } - [path @ .., input] => { - let mut body = options.query.clone(); - - let mut current_value = &mut body; - for component in path { - current_value = current_value - .as_object_mut() - .ok_or_else(|| { - EmbedError::rest_not_an_object( - options.query.clone(), - options.input_field.clone(), - ) - })? 
- .entry(component.clone()) - .or_insert(serde_json::json!({})); - } - - current_value.as_object_mut().unwrap().insert(input.clone(), input_value); - body - } - }; + let body = data.request.inject_texts(inputs); for attempt in 0..10 { let response = request.clone().send_json(&body); - let result = check_response(response); + let result = check_response(response, data.configuration_source); let retry_duration = match result { - Ok(response) => return response_to_embedding(response, options, expected_count), + Ok(response) => { + return response_to_embedding(response, data, expected_count, expected_dimension) + } Err(retry) => { tracing::warn!("Failed: {}", retry.error); retry.into_duration(attempt) @@ -288,13 +252,16 @@ where } let response = request.send_json(&body); - let result = check_response(response); - result - .map_err(Retry::into_error) - .and_then(|response| response_to_embedding(response, options, expected_count)) + let result = check_response(response, data.configuration_source); + result.map_err(Retry::into_error).and_then(|response| { + response_to_embedding(response, data, expected_count, expected_dimension) + }) } -fn check_response(response: Result) -> Result { +fn check_response( + response: Result, + configuration_source: ConfigurationSource, +) -> Result { match response { Ok(response) => Ok(response), Err(ureq::Error::Status(code, response)) => { @@ -302,7 +269,10 @@ fn check_response(response: Result) -> Result Retry::give_up(EmbedError::rest_unauthorized(error_response)), 429 => Retry::rate_limited(EmbedError::rest_too_many_requests(error_response)), - 400 => Retry::give_up(EmbedError::rest_bad_request(error_response)), + 400 => Retry::give_up(EmbedError::rest_bad_request( + error_response, + configuration_source, + )), 500..=599 => { Retry::retry_later(EmbedError::rest_internal_server_error(code, error_response)) } @@ -320,68 +290,111 @@ fn check_response(response: Result) -> Result, ) -> Result>, EmbedError> { let response: 
serde_json::Value = response.into_json().map_err(EmbedError::rest_response_deserialization)?; - let mut current_value = &response; - for component in &options.path_to_embeddings { - let component = component.as_ref(); - current_value = current_value.get(component).ok_or_else(|| { - EmbedError::rest_response_missing_embeddings( - response.clone(), - component, - &options.path_to_embeddings, - ) - })?; - } - - let embeddings = match options.input_type { - InputType::Text => { - for component in &options.embedding_object { - current_value = current_value.get(component).ok_or_else(|| { - EmbedError::rest_response_missing_embeddings( - response.clone(), - component, - &options.embedding_object, - ) - })?; - } - let embeddings = current_value.to_owned(); - let embeddings: Embedding = - serde_json::from_value(embeddings).map_err(EmbedError::rest_response_format)?; - - vec![Embeddings::from_single_embedding(embeddings)] - } - InputType::TextArray => { - let empty = vec![]; - let values = current_value.as_array().unwrap_or(&empty); - let mut embeddings: Vec> = Vec::with_capacity(expected_count); - for value in values { - let mut current_value = value; - for component in &options.embedding_object { - current_value = current_value.get(component).ok_or_else(|| { - EmbedError::rest_response_missing_embeddings( - response.clone(), - component, - &options.embedding_object, - ) - })?; - } - let embedding = current_value.to_owned(); - let embedding: Embedding = - serde_json::from_value(embedding).map_err(EmbedError::rest_response_format)?; - embeddings.push(Embeddings::from_single_embedding(embedding)); - } - embeddings - } - }; + let embeddings = data.response.extract_embeddings(response)?; if embeddings.len() != expected_count { return Err(EmbedError::rest_response_embedding_count(expected_count, embeddings.len())); } + if let Some(dimensions) = expected_dimensions { + for embedding in &embeddings { + if embedding.dimension() != dimensions { + return 
Err(EmbedError::rest_unexpected_dimension( + dimensions, + embedding.dimension(), + )); + } + } + } + Ok(embeddings) } + +pub(super) const REQUEST_PLACEHOLDER: &str = "{{text}}"; +pub(super) const RESPONSE_PLACEHOLDER: &str = "{{embedding}}"; +pub(super) const REPEAT_PLACEHOLDER: &str = "{{..}}"; + +#[derive(Debug)] +pub struct Request { + template: ValueTemplate, +} + +impl Request { + pub fn new(template: serde_json::Value) -> Result { + let template = match ValueTemplate::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER) { + Ok(template) => template, + Err(error) => { + let message = + error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; + + Ok(Self { template }) + } + + fn input_type(&self) -> InputType { + if self.template.has_array_value() { + InputType::TextArray + } else { + InputType::Text + } + } + + pub fn inject_texts( + &self, + texts: impl IntoIterator, + ) -> serde_json::Value { + self.template.inject(texts.into_iter().map(|s| serde_json::json!(s))).unwrap() + } +} + +#[derive(Debug)] +pub struct Response { + template: ValueTemplate, +} + +impl Response { + pub fn new(template: serde_json::Value, request: &Request) -> Result { + let template = match ValueTemplate::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER) + { + Ok(template) => template, + Err(error) => { + let message = + error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER); + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; + + match (template.has_array_value(), request.template.has_array_value()) { + (true, true) | (false, false) => Ok(Self {template}), + (true, false) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has multiple embeddings, but `request` has only one text to embed".to_string())), + (false, true) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has 
a single embedding, but `request` has multiple texts to embed".to_string())), + } + } + + pub fn extract_embeddings( + &self, + response: serde_json::Value, + ) -> Result>, EmbedError> { + let extracted_values: Vec = match self.template.extract(response) { + Ok(extracted_values) => extracted_values, + Err(error) => { + let error_message = + error.error_message("response", "{{embedding}}", "an array of numbers"); + return Err(EmbedError::rest_extraction_error(error_message)); + } + }; + let embeddings: Vec> = + extracted_values.into_iter().map(Embeddings::from_single_embedding).collect(); + + Ok(embeddings) + } +} diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 4b04e3370..e15999d4f 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -2,7 +2,6 @@ use deserr::Deserr; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; -use super::rest::InputType; use super::{ollama, openai, DistributionShift}; use crate::prompt::PromptData; use crate::update::Setting; @@ -36,19 +35,10 @@ pub struct EmbeddingSettings { pub url: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] - pub query: Setting, + pub request: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] - pub input_field: Setting>, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub path_to_embeddings: Setting>, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub embedding_object: Setting>, - #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default)] - pub input_type: Setting, + pub response: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] pub distribution: Setting, @@ -112,11 +102,8 @@ impl SettingsDiff { mut dimensions, mut document_template, mut url, - mut query, - mut input_field, - mut path_to_embeddings, - mut 
embedding_object, - mut input_type, + mut request, + mut response, mut distribution, } = old; @@ -128,11 +115,8 @@ impl SettingsDiff { dimensions: new_dimensions, document_template: new_document_template, url: new_url, - query: new_query, - input_field: new_input_field, - path_to_embeddings: new_path_to_embeddings, - embedding_object: new_embedding_object, - input_type: new_input_type, + request: new_request, + response: new_response, distribution: new_distribution, } = new; @@ -148,11 +132,8 @@ impl SettingsDiff { &mut revision, &mut dimensions, &mut url, - &mut query, - &mut input_field, - &mut path_to_embeddings, - &mut embedding_object, - &mut input_type, + &mut request, + &mut response, &mut document_template, ) } @@ -177,19 +158,10 @@ impl SettingsDiff { } } } - if query.apply(new_query) { + if request.apply(new_request) { ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); } - if input_field.apply(new_input_field) { - ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); - } - if path_to_embeddings.apply(new_path_to_embeddings) { - ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); - } - if embedding_object.apply(new_embedding_object) { - ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); - } - if input_type.apply(new_input_type) { + if response.apply(new_response) { ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); } if document_template.apply(new_document_template) { @@ -210,11 +182,8 @@ impl SettingsDiff { dimensions, document_template, url, - query, - input_field, - path_to_embeddings, - embedding_object, - input_type, + request, + response, distribution, }; @@ -246,11 +215,8 @@ fn apply_default_for_source( revision: &mut Setting, dimensions: &mut Setting, url: &mut Setting, - query: &mut Setting, - input_field: &mut Setting>, - path_to_embeddings: &mut Setting>, - embedding_object: &mut Setting>, - input_type: &mut Setting, + 
request: &mut Setting, + response: &mut Setting, document_template: &mut Setting, ) { match source { @@ -259,55 +225,40 @@ fn apply_default_for_source( *revision = Setting::Reset; *dimensions = Setting::NotSet; *url = Setting::NotSet; - *query = Setting::NotSet; - *input_field = Setting::NotSet; - *path_to_embeddings = Setting::NotSet; - *embedding_object = Setting::NotSet; - *input_type = Setting::NotSet; + *request = Setting::NotSet; + *response = Setting::NotSet; } Setting::Set(EmbedderSource::Ollama) => { *model = Setting::Reset; *revision = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; - *query = Setting::NotSet; - *input_field = Setting::NotSet; - *path_to_embeddings = Setting::NotSet; - *embedding_object = Setting::NotSet; - *input_type = Setting::NotSet; + *request = Setting::NotSet; + *response = Setting::NotSet; } Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => { *model = Setting::Reset; *revision = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::Reset; - *query = Setting::NotSet; - *input_field = Setting::NotSet; - *path_to_embeddings = Setting::NotSet; - *embedding_object = Setting::NotSet; - *input_type = Setting::NotSet; + *request = Setting::NotSet; + *response = Setting::NotSet; } Setting::Set(EmbedderSource::Rest) => { *model = Setting::NotSet; *revision = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::Reset; - *query = Setting::Reset; - *input_field = Setting::Reset; - *path_to_embeddings = Setting::Reset; - *embedding_object = Setting::Reset; - *input_type = Setting::Reset; + *request = Setting::Reset; + *response = Setting::Reset; } Setting::Set(EmbedderSource::UserProvided) => { *model = Setting::NotSet; *revision = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; - *query = Setting::NotSet; - *input_field = Setting::NotSet; - *path_to_embeddings = Setting::NotSet; - *embedding_object = Setting::NotSet; - *input_type = Setting::NotSet; + *request = 
Setting::NotSet; + *response = Setting::NotSet; *document_template = Setting::NotSet; } Setting::NotSet => {} @@ -340,11 +291,8 @@ impl EmbeddingSettings { pub const DOCUMENT_TEMPLATE: &'static str = "documentTemplate"; pub const URL: &'static str = "url"; - pub const QUERY: &'static str = "query"; - pub const INPUT_FIELD: &'static str = "inputField"; - pub const PATH_TO_EMBEDDINGS: &'static str = "pathToEmbeddings"; - pub const EMBEDDING_OBJECT: &'static str = "embeddingObject"; - pub const INPUT_TYPE: &'static str = "inputType"; + pub const REQUEST: &'static str = "request"; + pub const RESPONSE: &'static str = "response"; pub const DISTRIBUTION: &'static str = "distribution"; @@ -374,11 +322,8 @@ impl EmbeddingSettings { EmbedderSource::Rest, ], Self::URL => &[EmbedderSource::Ollama, EmbedderSource::Rest, EmbedderSource::OpenAi], - Self::QUERY => &[EmbedderSource::Rest], - Self::INPUT_FIELD => &[EmbedderSource::Rest], - Self::PATH_TO_EMBEDDINGS => &[EmbedderSource::Rest], - Self::EMBEDDING_OBJECT => &[EmbedderSource::Rest], - Self::INPUT_TYPE => &[EmbedderSource::Rest], + Self::REQUEST => &[EmbedderSource::Rest], + Self::RESPONSE => &[EmbedderSource::Rest], Self::DISTRIBUTION => &[ EmbedderSource::HuggingFace, EmbedderSource::Ollama, @@ -423,11 +368,8 @@ impl EmbeddingSettings { Self::DIMENSIONS, Self::DOCUMENT_TEMPLATE, Self::URL, - Self::QUERY, - Self::INPUT_FIELD, - Self::PATH_TO_EMBEDDINGS, - Self::EMBEDDING_OBJECT, - Self::INPUT_TYPE, + Self::REQUEST, + Self::RESPONSE, Self::DISTRIBUTION, ], } @@ -496,11 +438,8 @@ impl From for EmbeddingSettings { dimensions: Setting::NotSet, document_template: Setting::Set(prompt.template), url: Setting::NotSet, - query: Setting::NotSet, - input_field: Setting::NotSet, - path_to_embeddings: Setting::NotSet, - embedding_object: Setting::NotSet, - input_type: Setting::NotSet, + request: Setting::NotSet, + response: Setting::NotSet, distribution: distribution.map(Setting::Set).unwrap_or_default(), }, 
super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions { @@ -517,11 +456,8 @@ impl From for EmbeddingSettings { dimensions: dimensions.map(Setting::Set).unwrap_or_default(), document_template: Setting::Set(prompt.template), url: url.map(Setting::Set).unwrap_or_default(), - query: Setting::NotSet, - input_field: Setting::NotSet, - path_to_embeddings: Setting::NotSet, - embedding_object: Setting::NotSet, - input_type: Setting::NotSet, + request: Setting::NotSet, + response: Setting::NotSet, distribution: distribution.map(Setting::Set).unwrap_or_default(), }, super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions { @@ -537,11 +473,8 @@ impl From for EmbeddingSettings { dimensions: Setting::NotSet, document_template: Setting::Set(prompt.template), url: url.map(Setting::Set).unwrap_or_default(), - query: Setting::NotSet, - input_field: Setting::NotSet, - path_to_embeddings: Setting::NotSet, - embedding_object: Setting::NotSet, - input_type: Setting::NotSet, + request: Setting::NotSet, + response: Setting::NotSet, distribution: distribution.map(Setting::Set).unwrap_or_default(), }, super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions { @@ -555,22 +488,16 @@ impl From for EmbeddingSettings { dimensions: Setting::Set(dimensions), document_template: Setting::NotSet, url: Setting::NotSet, - query: Setting::NotSet, - input_field: Setting::NotSet, - path_to_embeddings: Setting::NotSet, - embedding_object: Setting::NotSet, - input_type: Setting::NotSet, + request: Setting::NotSet, + response: Setting::NotSet, distribution: distribution.map(Setting::Set).unwrap_or_default(), }, super::EmbedderOptions::Rest(super::rest::EmbedderOptions { api_key, dimensions, url, - query, - input_field, - path_to_embeddings, - embedding_object, - input_type, + request, + response, distribution, }) => Self { source: Setting::Set(EmbedderSource::Rest), @@ -580,11 +507,8 @@ impl From for EmbeddingSettings { dimensions: dimensions.map(Setting::Set).unwrap_or_default(), 
document_template: Setting::Set(prompt.template), url: Setting::Set(url), - query: Setting::Set(query), - input_field: Setting::Set(input_field), - path_to_embeddings: Setting::Set(path_to_embeddings), - embedding_object: Setting::Set(embedding_object), - input_type: Setting::Set(input_type), + request: Setting::Set(request), + response: Setting::Set(response), distribution: distribution.map(Setting::Set).unwrap_or_default(), }, } @@ -602,11 +526,8 @@ impl From for EmbeddingConfig { dimensions, document_template, url, - query, - input_field, - path_to_embeddings, - embedding_object, - input_type, + request, + response, distribution, } = value; @@ -669,22 +590,13 @@ impl From for EmbeddingConfig { }); } EmbedderSource::Rest => { - let embedder_options = super::rest::EmbedderOptions::default(); - this.embedder_options = super::EmbedderOptions::Rest(super::rest::EmbedderOptions { api_key: api_key.set(), dimensions: dimensions.set(), url: url.set().unwrap(), - query: query.set().unwrap_or(embedder_options.query), - input_field: input_field.set().unwrap_or(embedder_options.input_field), - path_to_embeddings: path_to_embeddings - .set() - .unwrap_or(embedder_options.path_to_embeddings), - embedding_object: embedding_object - .set() - .unwrap_or(embedder_options.embedding_object), - input_type: input_type.set().unwrap_or(embedder_options.input_type), + request: request.set().unwrap(), + response: response.set().unwrap(), distribution: distribution.set(), }) }