Vector settings to add indexingFragments and searchFragments

This commit is contained in:
Louis Dureuil 2025-06-29 23:55:28 +02:00
parent d48baece51
commit f3d5c74c02
No known key found for this signature in database

View file

@ -2,6 +2,8 @@ use std::collections::BTreeMap;
use std::num::NonZeroUsize; use std::num::NonZeroUsize;
use deserr::Deserr; use deserr::Deserr;
use either::Either;
use itertools::Itertools;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utoipa::ToSchema; use utoipa::ToSchema;
@ -229,6 +231,35 @@ pub struct EmbeddingSettings {
/// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated
pub url: Setting<String>, pub url: Setting<String>,
/// Template fragments that will be reassembled and sent to the remote embedder at indexing time.
///
/// # Availability
///
/// - This parameter is available for sources `rest`.
///
/// # 🔄 Reindexing
///
/// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents.
/// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<BTreeMap<String, serde_json::Value>>)]
pub indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
/// Template fragments that will be reassembled and sent to the remote embedder at search time.
///
/// # Availability
///
/// - This parameter is available for sources `rest`.
///
/// # 🔄 Reindexing
///
/// - 🌱 Changing the value of this parameter never regenerates embeddings
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<BTreeMap<String, serde_json::Value>>)]
pub search_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")] #[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)] #[deserr(default)]
#[schema(value_type = Option<serde_json::Value>)] #[schema(value_type = Option<serde_json::Value>)]
@ -483,6 +514,36 @@ pub struct SubEmbeddingSettings {
/// - 🌱 When modified for source `openAi`, embeddings are never regenerated /// - 🌱 When modified for source `openAi`, embeddings are never regenerated
/// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated
pub url: Setting<String>, pub url: Setting<String>,
/// Template fragments that will be reassembled and sent to the remote embedder at indexing time.
///
/// # Availability
///
/// - This parameter is available for sources `rest`.
///
/// # 🔄 Reindexing
///
/// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents.
/// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<BTreeMap<String, serde_json::Value>>)]
pub indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
/// Template fragments that will be reassembled and sent to the remote embedder at search time.
///
/// # Availability
///
/// - This parameter is available for sources `rest`.
///
/// # 🔄 Reindexing
///
/// - 🌱 Changing the value of this parameter never regenerates embeddings
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<BTreeMap<String, serde_json::Value>>)]
pub search_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")] #[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)] #[deserr(default)]
#[schema(value_type = Option<serde_json::Value>)] #[schema(value_type = Option<serde_json::Value>)]
@ -555,16 +616,24 @@ pub struct SubEmbeddingSettings {
} }
/// Indicates what action should take place during a reindexing operation for an embedder /// Indicates what action should take place during a reindexing operation for an embedder
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ReindexAction { pub enum ReindexAction {
/// An indexing operation should take place for this embedder, keeping existing vectors /// An indexing operation should take place for this embedder, keeping existing vectors
/// and checking whether the document template changed or not /// and checking whether the document template changed or not
RegeneratePrompts, RegeneratePrompts,
RegenerateFragments(Vec<(String, RegenerateFragment)>),
/// An indexing operation should take place for all documents for this embedder, removing existing vectors /// An indexing operation should take place for all documents for this embedder, removing existing vectors
/// (except userProvided ones) /// (except userProvided ones)
FullReindex, FullReindex,
} }
/// The kind of change detected for a single indexing fragment when diffing embedder settings.
///
/// Paired with the fragment name in [`ReindexAction::RegenerateFragments`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RegenerateFragment {
/// The fragment name is present in both the old and the new settings, and its new
/// (non-null) value differs from the old one.
Update,
/// The fragment name is present in both the old and the new settings, and its new value
/// is `null` while the old one was not.
Remove,
/// The fragment name is only present in the new settings, with a non-null value.
Add,
}
pub enum SettingsDiff { pub enum SettingsDiff {
Remove, Remove,
Reindex { action: ReindexAction, updated_settings: EmbeddingSettings, quantize: bool }, Reindex { action: ReindexAction, updated_settings: EmbeddingSettings, quantize: bool },
@ -577,6 +646,12 @@ pub struct EmbedderAction {
pub is_being_quantized: bool, pub is_being_quantized: bool,
pub write_back: Option<WriteBackToDocuments>, pub write_back: Option<WriteBackToDocuments>,
pub reindex: Option<ReindexAction>, pub reindex: Option<ReindexAction>,
pub remove_fragments: Option<RemoveFragments>,
}
/// Fragment removal request that can be attached to an [`EmbedderAction`].
#[derive(Debug)]
pub struct RemoveFragments {
/// Identifiers of the fragments concerned by the removal.
// NOTE(review): presumably the internal numeric ids associated with fragment names —
// confirm against the code that builds this struct.
pub fragment_ids: Vec<u8>,
} }
impl EmbedderAction { impl EmbedderAction {
@ -592,6 +667,10 @@ impl EmbedderAction {
self.reindex.as_ref() self.reindex.as_ref()
} }
pub fn remove_fragments(&self) -> Option<&RemoveFragments> {
self.remove_fragments.as_ref()
}
pub fn with_is_being_quantized(mut self, quantize: bool) -> Self { pub fn with_is_being_quantized(mut self, quantize: bool) -> Self {
self.is_being_quantized = quantize; self.is_being_quantized = quantize;
self self
@ -603,11 +682,23 @@ impl EmbedderAction {
is_being_quantized: false, is_being_quantized: false,
write_back: Some(write_back), write_back: Some(write_back),
reindex: None, reindex: None,
remove_fragments: None,
} }
} }
pub fn with_reindex(reindex: ReindexAction, was_quantized: bool) -> Self { pub fn with_reindex(reindex: ReindexAction, was_quantized: bool) -> Self {
Self { was_quantized, is_being_quantized: false, write_back: None, reindex: Some(reindex) } Self {
was_quantized,
is_being_quantized: false,
write_back: None,
reindex: Some(reindex),
remove_fragments: None,
}
}
/// Attaches `remove_fragments` to this action, replacing any removal that was already attached.
///
/// Takes `self` by value and returns it, so the call can be chained after a constructor.
pub fn with_remove_fragments(mut self, remove_fragments: RemoveFragments) -> Self {
self.remove_fragments = Some(remove_fragments);
self
} }
} }
@ -634,6 +725,8 @@ impl SettingsDiff {
mut dimensions, mut dimensions,
mut document_template, mut document_template,
mut url, mut url,
mut indexing_fragments,
mut search_fragments,
mut request, mut request,
mut response, mut response,
mut search_embedder, mut search_embedder,
@ -653,6 +746,8 @@ impl SettingsDiff {
dimensions: new_dimensions, dimensions: new_dimensions,
document_template: new_document_template, document_template: new_document_template,
url: new_url, url: new_url,
indexing_fragments: new_indexing_fragments,
search_fragments: new_search_fragments,
request: new_request, request: new_request,
response: new_response, response: new_response,
search_embedder: new_search_embedder, search_embedder: new_search_embedder,
@ -684,6 +779,8 @@ impl SettingsDiff {
&mut document_template, &mut document_template,
&mut document_template_max_bytes, &mut document_template_max_bytes,
&mut url, &mut url,
&mut indexing_fragments,
&mut search_fragments,
&mut request, &mut request,
&mut response, &mut response,
&mut headers, &mut headers,
@ -696,6 +793,8 @@ impl SettingsDiff {
new_document_template, new_document_template,
new_document_template_max_bytes, new_document_template_max_bytes,
new_url, new_url,
new_indexing_fragments,
new_search_fragments,
new_request, new_request,
new_response, new_response,
new_headers, new_headers,
@ -722,6 +821,8 @@ impl SettingsDiff {
dimensions, dimensions,
document_template, document_template,
url, url,
indexing_fragments,
search_fragments,
request, request,
response, response,
search_embedder, search_embedder,
@ -769,6 +870,8 @@ impl SettingsDiff {
mut document_template, mut document_template,
mut document_template_max_bytes, mut document_template_max_bytes,
mut url, mut url,
mut indexing_fragments,
mut search_fragments,
mut request, mut request,
mut response, mut response,
mut headers, mut headers,
@ -794,6 +897,8 @@ impl SettingsDiff {
document_template: new_document_template, document_template: new_document_template,
document_template_max_bytes: new_document_template_max_bytes, document_template_max_bytes: new_document_template_max_bytes,
url: new_url, url: new_url,
indexing_fragments: new_indexing_fragments,
search_fragments: new_search_fragments,
request: new_request, request: new_request,
response: new_response, response: new_response,
headers: new_headers, headers: new_headers,
@ -814,6 +919,8 @@ impl SettingsDiff {
&mut document_template, &mut document_template,
&mut document_template_max_bytes, &mut document_template_max_bytes,
&mut url, &mut url,
&mut indexing_fragments,
&mut search_fragments,
&mut request, &mut request,
&mut response, &mut response,
&mut headers, &mut headers,
@ -826,6 +933,8 @@ impl SettingsDiff {
new_document_template, new_document_template,
new_document_template_max_bytes, new_document_template_max_bytes,
new_url, new_url,
new_indexing_fragments,
new_search_fragments,
new_request, new_request,
new_response, new_response,
new_headers, new_headers,
@ -846,6 +955,8 @@ impl SettingsDiff {
dimensions, dimensions,
document_template, document_template,
url, url,
indexing_fragments,
search_fragments,
request, request,
response, response,
headers, headers,
@ -875,6 +986,8 @@ impl SettingsDiff {
document_template: &mut Setting<String>, document_template: &mut Setting<String>,
document_template_max_bytes: &mut Setting<usize>, document_template_max_bytes: &mut Setting<usize>,
url: &mut Setting<String>, url: &mut Setting<String>,
indexing_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>,
search_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>,
request: &mut Setting<serde_json::Value>, request: &mut Setting<serde_json::Value>,
response: &mut Setting<serde_json::Value>, response: &mut Setting<serde_json::Value>,
headers: &mut Setting<BTreeMap<String, String>>, headers: &mut Setting<BTreeMap<String, String>>,
@ -887,6 +1000,8 @@ impl SettingsDiff {
new_document_template: Setting<String>, new_document_template: Setting<String>,
new_document_template_max_bytes: Setting<usize>, new_document_template_max_bytes: Setting<usize>,
new_url: Setting<String>, new_url: Setting<String>,
new_indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
new_search_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
new_request: Setting<serde_json::Value>, new_request: Setting<serde_json::Value>,
new_response: Setting<serde_json::Value>, new_response: Setting<serde_json::Value>,
new_headers: Setting<BTreeMap<String, String>>, new_headers: Setting<BTreeMap<String, String>>,
@ -902,6 +1017,8 @@ impl SettingsDiff {
pooling, pooling,
dimensions, dimensions,
url, url,
indexing_fragments,
search_fragments,
request, request,
response, response,
document_template, document_template,
@ -941,6 +1058,104 @@ impl SettingsDiff {
} }
} }
} }
// Merge the new search fragments into the current ones, name by name.
// Unlike the indexing fragments below, nothing is pushed to `reindex_action` here:
// the merged value is only written back to `search_fragments`.
*search_fragments = match (std::mem::take(search_fragments), new_search_fragments) {
(Setting::Set(search_fragments), Setting::Set(new_search_fragments)) => {
Setting::Set(
search_fragments
.into_iter()
// both sides are `BTreeMap`s, so they are iterated in ascending name order,
// which is what `merge_join_by` expects from its two inputs
.merge_join_by(new_search_fragments, |(left, _), (right, _)| {
left.cmp(right)
})
.map(|eob| {
match eob {
// name present on both sides: the new value (possibly `None`) replaces the old one
itertools::EitherOrBoth::Both((name, _), (_, right)) => {
(name, right)
}
// name only present in the current settings: kept unchanged
itertools::EitherOrBoth::Left(left) => left,
// name only present in the new settings: inserted as is
itertools::EitherOrBoth::Right(right) => right,
}
})
.collect(),
)
}
// explicit reset: drop whatever was there
(_, Setting::Reset) => Setting::Reset,
// nothing provided: keep the current value
(left, Setting::NotSet) => left,
// no fragments before: take the new ones wholesale
(Setting::NotSet | Setting::Reset, Setting::Set(new_search_fragments)) => {
Setting::Set(new_search_fragments)
}
};
// Collects one `(fragment name, kind of change)` entry per indexing fragment that was
// added, modified or removed by the name-by-name merge below.
let mut regenerate_fragments = Vec::new();
*indexing_fragments = match (std::mem::take(indexing_fragments), new_indexing_fragments) {
(Setting::Set(fragments), Setting::Set(new_fragments)) => {
Setting::Set(
fragments
.into_iter()
// both sides are `BTreeMap`s, so they are iterated in ascending name order,
// which is what `merge_join_by` expects from its two inputs
.merge_join_by(new_fragments, |(left, _), (right, _)| left.cmp(right))
.map(|eob| {
match eob {
// name present on both sides: compare the old and new values
itertools::EitherOrBoth::Both(
(name, left),
(other_name, right),
) => {
if left == right {
// identical value: nothing to regenerate
(name, left)
} else {
match right {
// new non-null value: record an update and store the new value
Some(right) => {
regenerate_fragments
.push((other_name, RegenerateFragment::Update));
(name, Some(right))
}
// `null` passed for an existing fragment: record a removal;
// the name stays in the map with a `None` value
None => {
regenerate_fragments
.push((other_name, RegenerateFragment::Remove));
(name, None)
}
}
}
}
// name only present in the current settings: kept unchanged
itertools::EitherOrBoth::Left(left) => left,
// name only present in the new settings
itertools::EitherOrBoth::Right((name, right)) => {
// only a non-null value counts as an addition; a `None` for an
// unknown name is stored but records nothing
if right.is_some() {
regenerate_fragments
.push((name.clone(), RegenerateFragment::Add));
}
(name, right)
}
}
})
.collect(),
)
}
// remove all fragments => move to document template
// (a full reindex is requested whatever the previous value was)
(_, Setting::Reset) => {
ReindexAction::push_action(reindex_action, ReindexAction::FullReindex);
Setting::Reset
}
// add all fragments: there were none before, so a full reindex is requested
(Setting::NotSet | Setting::Reset, Setting::Set(new_fragments)) => {
ReindexAction::push_action(reindex_action, ReindexAction::FullReindex);
Setting::Set(new_fragments)
}
// no change
(left, Setting::NotSet) => left,
};
// Only request a fragment regeneration when the merge recorded at least one change.
if !regenerate_fragments.is_empty() {
ReindexAction::push_action(
reindex_action,
ReindexAction::RegenerateFragments(regenerate_fragments),
);
}
if request.apply(new_request) { if request.apply(new_request) {
ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); ReindexAction::push_action(reindex_action, ReindexAction::FullReindex);
} }
@ -972,10 +1187,16 @@ impl SettingsDiff {
impl ReindexAction { impl ReindexAction {
fn push_action(this: &mut Option<Self>, other: Self) { fn push_action(this: &mut Option<Self>, other: Self) {
*this = match (*this, other) { use ReindexAction::*;
(_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex), *this = match (this.take(), other) {
(Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex), (_, FullReindex) => Some(FullReindex),
(_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts), (Some(FullReindex), _) => Some(FullReindex),
(_, RegenerateFragments(fragments)) => Some(RegenerateFragments(fragments)),
(Some(RegenerateFragments(fragments)), RegeneratePrompts) => {
Some(RegenerateFragments(fragments))
}
(Some(RegeneratePrompts), RegeneratePrompts) => Some(RegeneratePrompts),
(None, RegeneratePrompts) => Some(RegeneratePrompts),
} }
} }
} }
@ -988,6 +1209,8 @@ fn apply_default_for_source(
pooling: &mut Setting<OverridePooling>, pooling: &mut Setting<OverridePooling>,
dimensions: &mut Setting<usize>, dimensions: &mut Setting<usize>,
url: &mut Setting<String>, url: &mut Setting<String>,
indexing_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>,
search_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>,
request: &mut Setting<serde_json::Value>, request: &mut Setting<serde_json::Value>,
response: &mut Setting<serde_json::Value>, response: &mut Setting<serde_json::Value>,
document_template: &mut Setting<String>, document_template: &mut Setting<String>,
@ -1003,6 +1226,8 @@ fn apply_default_for_source(
*pooling = Setting::Reset; *pooling = Setting::Reset;
*dimensions = Setting::NotSet; *dimensions = Setting::NotSet;
*url = Setting::NotSet; *url = Setting::NotSet;
*indexing_fragments = Setting::NotSet;
*search_fragments = Setting::NotSet;
*request = Setting::NotSet; *request = Setting::NotSet;
*response = Setting::NotSet; *response = Setting::NotSet;
*headers = Setting::NotSet; *headers = Setting::NotSet;
@ -1015,6 +1240,8 @@ fn apply_default_for_source(
*pooling = Setting::NotSet; *pooling = Setting::NotSet;
*dimensions = Setting::Reset; *dimensions = Setting::Reset;
*url = Setting::NotSet; *url = Setting::NotSet;
*indexing_fragments = Setting::NotSet;
*search_fragments = Setting::NotSet;
*request = Setting::NotSet; *request = Setting::NotSet;
*response = Setting::NotSet; *response = Setting::NotSet;
*headers = Setting::NotSet; *headers = Setting::NotSet;
@ -1027,6 +1254,8 @@ fn apply_default_for_source(
*pooling = Setting::NotSet; *pooling = Setting::NotSet;
*dimensions = Setting::NotSet; *dimensions = Setting::NotSet;
*url = Setting::Reset; *url = Setting::Reset;
*indexing_fragments = Setting::NotSet;
*search_fragments = Setting::NotSet;
*request = Setting::NotSet; *request = Setting::NotSet;
*response = Setting::NotSet; *response = Setting::NotSet;
*headers = Setting::NotSet; *headers = Setting::NotSet;
@ -1039,6 +1268,8 @@ fn apply_default_for_source(
*pooling = Setting::NotSet; *pooling = Setting::NotSet;
*dimensions = Setting::Reset; *dimensions = Setting::Reset;
*url = Setting::Reset; *url = Setting::Reset;
*indexing_fragments = Setting::Reset;
*search_fragments = Setting::Reset;
*request = Setting::Reset; *request = Setting::Reset;
*response = Setting::Reset; *response = Setting::Reset;
*headers = Setting::Reset; *headers = Setting::Reset;
@ -1051,6 +1282,8 @@ fn apply_default_for_source(
*pooling = Setting::NotSet; *pooling = Setting::NotSet;
*dimensions = Setting::Reset; *dimensions = Setting::Reset;
*url = Setting::NotSet; *url = Setting::NotSet;
*indexing_fragments = Setting::NotSet;
*search_fragments = Setting::NotSet;
*request = Setting::NotSet; *request = Setting::NotSet;
*response = Setting::NotSet; *response = Setting::NotSet;
*document_template = Setting::NotSet; *document_template = Setting::NotSet;
@ -1065,6 +1298,8 @@ fn apply_default_for_source(
*pooling = Setting::NotSet; *pooling = Setting::NotSet;
*dimensions = Setting::NotSet; *dimensions = Setting::NotSet;
*url = Setting::NotSet; *url = Setting::NotSet;
*indexing_fragments = Setting::NotSet;
*search_fragments = Setting::NotSet;
*request = Setting::NotSet; *request = Setting::NotSet;
*response = Setting::NotSet; *response = Setting::NotSet;
*document_template = Setting::NotSet; *document_template = Setting::NotSet;
@ -1131,6 +1366,8 @@ pub enum MetaEmbeddingSetting {
DocumentTemplate, DocumentTemplate,
DocumentTemplateMaxBytes, DocumentTemplateMaxBytes,
Url, Url,
IndexingFragments,
SearchFragments,
Request, Request,
Response, Response,
Headers, Headers,
@ -1153,6 +1390,8 @@ impl MetaEmbeddingSetting {
DocumentTemplate => "documentTemplate", DocumentTemplate => "documentTemplate",
DocumentTemplateMaxBytes => "documentTemplateMaxBytes", DocumentTemplateMaxBytes => "documentTemplateMaxBytes",
Url => "url", Url => "url",
IndexingFragments => "indexingFragments",
SearchFragments => "searchFragments",
Request => "request", Request => "request",
Response => "response", Response => "response",
Headers => "headers", Headers => "headers",
@ -1176,6 +1415,8 @@ impl EmbeddingSettings {
dimensions: &Setting<usize>, dimensions: &Setting<usize>,
api_key: &Setting<String>, api_key: &Setting<String>,
url: &Setting<String>, url: &Setting<String>,
indexing_fragments: &Setting<BTreeMap<String, Option<Fragment>>>,
search_fragments: &Setting<BTreeMap<String, Option<Fragment>>>,
request: &Setting<serde_json::Value>, request: &Setting<serde_json::Value>,
response: &Setting<serde_json::Value>, response: &Setting<serde_json::Value>,
document_template: &Setting<String>, document_template: &Setting<String>,
@ -1210,6 +1451,20 @@ impl EmbeddingSettings {
)?; )?;
Self::check_setting(embedder_name, source, MetaEmbeddingSetting::ApiKey, context, api_key)?; Self::check_setting(embedder_name, source, MetaEmbeddingSetting::ApiKey, context, api_key)?;
Self::check_setting(embedder_name, source, MetaEmbeddingSetting::Url, context, url)?; Self::check_setting(embedder_name, source, MetaEmbeddingSetting::Url, context, url)?;
Self::check_setting(
embedder_name,
source,
MetaEmbeddingSetting::IndexingFragments,
context,
indexing_fragments,
)?;
Self::check_setting(
embedder_name,
source,
MetaEmbeddingSetting::SearchFragments,
context,
search_fragments,
)?;
Self::check_setting( Self::check_setting(
embedder_name, embedder_name,
source, source,
@ -1348,8 +1603,8 @@ impl EmbeddingSettings {
) => FieldStatus::Allowed, ) => FieldStatus::Allowed,
( (
OpenAi, OpenAi,
Revision | Pooling | Request | Response | Headers | SearchEmbedder Revision | Pooling | IndexingFragments | SearchFragments | Request | Response
| IndexingEmbedder, | Headers | SearchEmbedder | IndexingEmbedder,
_, _,
) => FieldStatus::Disallowed, ) => FieldStatus::Disallowed,
( (
@ -1359,8 +1614,8 @@ impl EmbeddingSettings {
) => FieldStatus::Allowed, ) => FieldStatus::Allowed,
( (
HuggingFace, HuggingFace,
ApiKey | Dimensions | Url | Request | Response | Headers | SearchEmbedder ApiKey | Dimensions | Url | IndexingFragments | SearchFragments | Request
| IndexingEmbedder, | Response | Headers | SearchEmbedder | IndexingEmbedder,
_, _,
) => FieldStatus::Disallowed, ) => FieldStatus::Disallowed,
(Ollama, Model, _) => FieldStatus::Mandatory, (Ollama, Model, _) => FieldStatus::Mandatory,
@ -1371,8 +1626,8 @@ impl EmbeddingSettings {
) => FieldStatus::Allowed, ) => FieldStatus::Allowed,
( (
Ollama, Ollama,
Revision | Pooling | Request | Response | Headers | SearchEmbedder Revision | Pooling | IndexingFragments | SearchFragments | Request | Response
| IndexingEmbedder, | Headers | SearchEmbedder | IndexingEmbedder,
_, _,
) => FieldStatus::Disallowed, ) => FieldStatus::Disallowed,
(UserProvided, Dimensions, _) => FieldStatus::Mandatory, (UserProvided, Dimensions, _) => FieldStatus::Mandatory,
@ -1386,6 +1641,8 @@ impl EmbeddingSettings {
| DocumentTemplate | DocumentTemplate
| DocumentTemplateMaxBytes | DocumentTemplateMaxBytes
| Url | Url
| IndexingFragments
| SearchFragments
| Request | Request
| Response | Response
| Headers | Headers
@ -1404,6 +1661,10 @@ impl EmbeddingSettings {
| Headers, | Headers,
_, _,
) => FieldStatus::Allowed, ) => FieldStatus::Allowed,
(Rest, IndexingFragments, NotNested | Indexing) => FieldStatus::Allowed,
(Rest, IndexingFragments, Search) => FieldStatus::Disallowed,
(Rest, SearchFragments, NotNested | Search) => FieldStatus::Allowed,
(Rest, SearchFragments, Indexing) => FieldStatus::Disallowed,
(Rest, Model | Revision | Pooling | SearchEmbedder | IndexingEmbedder, _) => { (Rest, Model | Revision | Pooling | SearchEmbedder | IndexingEmbedder, _) => {
FieldStatus::Disallowed FieldStatus::Disallowed
} }
@ -1419,6 +1680,8 @@ impl EmbeddingSettings {
| DocumentTemplate | DocumentTemplate
| DocumentTemplateMaxBytes | DocumentTemplateMaxBytes
| Url | Url
| IndexingFragments
| SearchFragments
| Request | Request
| Response | Response
| Headers, | Headers,
@ -1512,6 +1775,11 @@ impl std::fmt::Display for EmbedderSource {
} }
} }
/// A single template fragment, as stored under a name in the `indexingFragments` and
/// `searchFragments` embedder settings.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)]
pub struct Fragment {
/// JSON content of the fragment.
pub value: serde_json::Value,
}
impl EmbeddingSettings { impl EmbeddingSettings {
fn from_hugging_face( fn from_hugging_face(
super::hf::EmbedderOptions { super::hf::EmbedderOptions {
@ -1534,6 +1802,8 @@ impl EmbeddingSettings {
document_template, document_template,
document_template_max_bytes, document_template_max_bytes,
url: Setting::NotSet, url: Setting::NotSet,
indexing_fragments: Setting::NotSet,
search_fragments: Setting::NotSet,
request: Setting::NotSet, request: Setting::NotSet,
response: Setting::NotSet, response: Setting::NotSet,
headers: Setting::NotSet, headers: Setting::NotSet,
@ -1566,6 +1836,8 @@ impl EmbeddingSettings {
document_template, document_template,
document_template_max_bytes, document_template_max_bytes,
url: Setting::some_or_not_set(url), url: Setting::some_or_not_set(url),
indexing_fragments: Setting::NotSet,
search_fragments: Setting::NotSet,
request: Setting::NotSet, request: Setting::NotSet,
response: Setting::NotSet, response: Setting::NotSet,
headers: Setting::NotSet, headers: Setting::NotSet,
@ -1598,6 +1870,8 @@ impl EmbeddingSettings {
document_template, document_template,
document_template_max_bytes, document_template_max_bytes,
url: Setting::some_or_not_set(url), url: Setting::some_or_not_set(url),
indexing_fragments: Setting::NotSet,
search_fragments: Setting::NotSet,
request: Setting::NotSet, request: Setting::NotSet,
response: Setting::NotSet, response: Setting::NotSet,
headers: Setting::NotSet, headers: Setting::NotSet,
@ -1622,6 +1896,8 @@ impl EmbeddingSettings {
document_template: Setting::NotSet, document_template: Setting::NotSet,
document_template_max_bytes: Setting::NotSet, document_template_max_bytes: Setting::NotSet,
url: Setting::NotSet, url: Setting::NotSet,
indexing_fragments: Setting::NotSet,
search_fragments: Setting::NotSet,
request: Setting::NotSet, request: Setting::NotSet,
response: Setting::NotSet, response: Setting::NotSet,
headers: Setting::NotSet, headers: Setting::NotSet,
@ -1638,6 +1914,8 @@ impl EmbeddingSettings {
dimensions, dimensions,
url, url,
request, request,
indexing_fragments,
search_fragments,
response, response,
distribution, distribution,
headers, headers,
@ -1656,6 +1934,26 @@ impl EmbeddingSettings {
document_template, document_template,
document_template_max_bytes, document_template_max_bytes,
url: Setting::Set(url), url: Setting::Set(url),
indexing_fragments: if indexing_fragments.is_empty() {
Setting::NotSet
} else {
Setting::Set(
indexing_fragments
.into_iter()
.map(|(name, fragment)| (name, Some(Fragment { value: fragment })))
.collect(),
)
},
search_fragments: if search_fragments.is_empty() {
Setting::NotSet
} else {
Setting::Set(
search_fragments
.into_iter()
.map(|(name, fragment)| (name, Some(Fragment { value: fragment })))
.collect(),
)
},
request: Setting::Set(request), request: Setting::Set(request),
response: Setting::Set(response), response: Setting::Set(response),
distribution: Setting::some_or_not_set(distribution), distribution: Setting::some_or_not_set(distribution),
@ -1714,6 +2012,8 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
document_template: Setting::NotSet, document_template: Setting::NotSet,
document_template_max_bytes: Setting::NotSet, document_template_max_bytes: Setting::NotSet,
url: Setting::NotSet, url: Setting::NotSet,
indexing_fragments: Setting::NotSet,
search_fragments: Setting::NotSet,
request: Setting::NotSet, request: Setting::NotSet,
response: Setting::NotSet, response: Setting::NotSet,
headers: Setting::NotSet, headers: Setting::NotSet,
@ -1786,6 +2086,8 @@ impl From<EmbeddingSettings> for SubEmbeddingSettings {
document_template, document_template,
document_template_max_bytes, document_template_max_bytes,
url, url,
indexing_fragments,
search_fragments,
request, request,
response, response,
headers, headers,
@ -1804,6 +2106,8 @@ impl From<EmbeddingSettings> for SubEmbeddingSettings {
document_template, document_template,
document_template_max_bytes, document_template_max_bytes,
url, url,
indexing_fragments,
search_fragments,
request, request,
response, response,
headers, headers,
@ -1828,6 +2132,8 @@ impl From<EmbeddingSettings> for EmbeddingConfig {
document_template, document_template,
document_template_max_bytes, document_template_max_bytes,
url, url,
indexing_fragments,
search_fragments,
request, request,
response, response,
distribution, distribution,
@ -1879,6 +2185,8 @@ impl From<EmbeddingSettings> for EmbeddingConfig {
EmbedderSource::Rest => SubEmbedderOptions::rest( EmbedderSource::Rest => SubEmbedderOptions::rest(
url.set().unwrap(), url.set().unwrap(),
api_key, api_key,
indexing_fragments,
search_fragments,
request.set().unwrap(), request.set().unwrap(),
response.set().unwrap(), response.set().unwrap(),
headers, headers,
@ -1922,6 +2230,8 @@ impl SubEmbedderOptions {
document_template: _, document_template: _,
document_template_max_bytes: _, document_template_max_bytes: _,
url, url,
indexing_fragments,
search_fragments,
request, request,
response, response,
headers, headers,
@ -1944,6 +2254,8 @@ impl SubEmbedderOptions {
EmbedderSource::Rest => Self::rest( EmbedderSource::Rest => Self::rest(
url.set().unwrap(), url.set().unwrap(),
api_key, api_key,
indexing_fragments,
search_fragments,
request.set().unwrap(), request.set().unwrap(),
response.set().unwrap(), response.set().unwrap(),
headers, headers,
@ -2010,9 +2322,13 @@ impl SubEmbedderOptions {
distribution: distribution.set(), distribution: distribution.set(),
}) })
} }
#[allow(clippy::too_many_arguments)]
fn rest( fn rest(
url: String, url: String,
api_key: Setting<String>, api_key: Setting<String>,
indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
search_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
request: serde_json::Value, request: serde_json::Value,
response: serde_json::Value, response: serde_json::Value,
headers: Setting<BTreeMap<String, String>>, headers: Setting<BTreeMap<String, String>>,
@ -2027,6 +2343,22 @@ impl SubEmbedderOptions {
response, response,
distribution: distribution.set(), distribution: distribution.set(),
headers: headers.set().unwrap_or_default(), headers: headers.set().unwrap_or_default(),
search_fragments: search_fragments
.set()
.unwrap_or_default()
.into_iter()
.filter_map(|(name, fragment)| {
Some((name, fragment.map(|fragment| fragment.value)?))
})
.collect(),
indexing_fragments: indexing_fragments
.set()
.unwrap_or_default()
.into_iter()
.filter_map(|(name, fragment)| {
Some((name, fragment.map(|fragment| fragment.value)?))
})
.collect(),
}) })
} }
fn ollama( fn ollama(
@ -2066,3 +2398,20 @@ impl From<SubEmbedderOptions> for EmbedderOptions {
} }
} }
} }
/// Iterates over the names of the indexing fragments declared in an embedder's settings.
///
/// The embedder's own `indexing_fragments` are used when they are set; otherwise the
/// `indexing_fragments` of its `indexing_embedder` are used. Every name in the selected map is
/// yielded, including names whose fragment value is `None`.
///
/// The iterator is empty when `setting` itself is not set, or when neither location has its
/// fragments set.
pub(crate) fn fragments_from_settings(
    setting: &Setting<EmbeddingSettings>,
) -> impl Iterator<Item = String> + '_ {
    // Pick the fragment map to read from: top-level first, indexing sub-embedder second.
    let selected = setting.as_ref().set().and_then(|embedder| {
        embedder.indexing_fragments.as_ref().set().or_else(|| {
            embedder
                .indexing_embedder
                .as_ref()
                .set()
                .and_then(|sub_embedder| sub_embedder.indexing_fragments.as_ref().set())
        })
    });

    match selected {
        Some(fragments) => Either::Right(fragments.keys().cloned()),
        None => Either::Left(None.into_iter()),
    }
}