From 11759c4be4be37a5bf41d8ff62059b0639949cc3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 18 Feb 2025 14:16:41 +0100 Subject: [PATCH 1/6] Support pooling --- crates/milli/src/vector/error.rs | 44 ++++++++++ crates/milli/src/vector/hf.rs | 136 +++++++++++++++++++++++++++---- 2 files changed, 166 insertions(+), 14 deletions(-) diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index d1b2516f5..650249bff 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -262,6 +262,31 @@ impl NewEmbedderError { } } + pub fn open_pooling_config( + pooling_config_filename: PathBuf, + inner: std::io::Error, + ) -> NewEmbedderError { + let open_config = OpenPoolingConfig { filename: pooling_config_filename, inner }; + + Self { + kind: NewEmbedderErrorKind::OpenPoolingConfig(open_config), + fault: FaultSource::Runtime, + } + } + + pub fn deserialize_pooling_config( + model_name: String, + pooling_config_filename: PathBuf, + inner: serde_json::Error, + ) -> NewEmbedderError { + let deserialize_pooling_config = + DeserializePoolingConfig { model_name, filename: pooling_config_filename, inner }; + Self { + kind: NewEmbedderErrorKind::DeserializePoolingConfig(deserialize_pooling_config), + fault: FaultSource::Runtime, + } + } + pub fn open_tokenizer( tokenizer_filename: PathBuf, inner: Box, @@ -319,6 +344,13 @@ pub struct OpenConfig { pub inner: std::io::Error, } +#[derive(Debug, thiserror::Error)] +#[error("could not open pooling config at {filename}: {inner}")] +pub struct OpenPoolingConfig { + pub filename: PathBuf, + pub inner: std::io::Error, +} + #[derive(Debug, thiserror::Error)] #[error("for model '{model_name}', could not deserialize config at {filename} as JSON: {inner}")] pub struct DeserializeConfig { @@ -327,6 +359,14 @@ pub struct DeserializeConfig { pub inner: serde_json::Error, } +#[derive(Debug, thiserror::Error)] +#[error("for model '{model_name}', could not deserialize file at `{filename}` as a pooling config: {inner}")] +pub struct DeserializePoolingConfig { + pub model_name: String, + pub filename: PathBuf, + pub inner: serde_json::Error, +} + #[derive(Debug, thiserror::Error)] #[error("model `{model_name}` appears to be unsupported{}\n - inner error: {inner}", if architectures.is_empty() { @@ -354,8 +394,12 @@ pub enum NewEmbedderErrorKind { #[error(transparent)] OpenConfig(OpenConfig), #[error(transparent)] + OpenPoolingConfig(OpenPoolingConfig), + #[error(transparent)] DeserializeConfig(DeserializeConfig), #[error(transparent)] + DeserializePoolingConfig(DeserializePoolingConfig), + #[error(transparent)] UnsupportedModel(UnsupportedModel), #[error(transparent)] OpenTokenizer(OpenTokenizer), diff --git a/crates/milli/src/vector/hf.rs b/crates/milli/src/vector/hf.rs index 447a88f5d..9ec34daef 100644 --- a/crates/milli/src/vector/hf.rs +++ b/crates/milli/src/vector/hf.rs @@ -58,6 +58,7 @@ pub struct Embedder { tokenizer: Tokenizer, options: EmbedderOptions, dimensions: usize, + pooling: Pooling, } impl std::fmt::Debug for Embedder { @@ -66,10 +67,53 @@ impl std::fmt::Debug for Embedder { .field("model", &self.options.model) .field("tokenizer", &self.tokenizer) .field("options", &self.options) + .field("pooling", &self.pooling) .finish() } } +#[derive(Clone, Copy, serde::Deserialize)] +struct PoolingConfig { + #[serde(default)] + pub pooling_mode_cls_token: bool, + #[serde(default)] + pub pooling_mode_mean_tokens: bool, + #[serde(default)] + pub pooling_mode_max_tokens: bool, + #[serde(default)] + pub pooling_mode_mean_sqrt_len_tokens: bool, + #[serde(default)] + pub pooling_mode_lasttoken: bool, +} + +#[derive(Debug, Clone, Copy, Default)] +pub enum Pooling { + #[default] + Mean, + Cls, + Max, + MeanSqrtLen, + LastToken, +} + +impl From for Pooling { + fn from(value: PoolingConfig) -> Self { + if value.pooling_mode_cls_token { + Self::Cls + } else if value.pooling_mode_mean_tokens { + Self::Mean + } else if value.pooling_mode_lasttoken { + Self::LastToken + } else if value.pooling_mode_mean_sqrt_len_tokens { + Self::MeanSqrtLen + } else if value.pooling_mode_max_tokens { + Self::Max + } else { + Self::default() + } + } +} + impl Embedder { pub fn new(options: EmbedderOptions) -> std::result::Result { let device = match candle_core::Device::cuda_if_available(0) { @@ -83,7 +127,7 @@ impl Embedder { Some(revision) => Repo::with_revision(options.model.clone(), RepoType::Model, revision), None => Repo::model(options.model.clone()), }; - let (config_filename, tokenizer_filename, weights_filename, weight_source) = { + let (config_filename, tokenizer_filename, weights_filename, weight_source, pooling) = { let api = Api::new().map_err(NewEmbedderError::new_api_fail)?; let api = api.repo(repo); let config = api.get("config.json").map_err(NewEmbedderError::api_get)?; @@ -97,7 +141,36 @@ impl Embedder { }) .map_err(NewEmbedderError::api_get)? }; - (config, tokenizer, weights, source) + let pooling = match api.get("1_Pooling/config.json") { + Ok(pooling) => Some(pooling), + Err(hf_hub::api::sync::ApiError::RequestError(error)) + if matches!(*error, ureq::Error::Status(404, _,)) => + { + // ignore the error if the file simply doesn't exist + None + } + Err(error) => return Err(NewEmbedderError::api_get(error)), + }; + let pooling: Pooling = match pooling { + Some(pooling_filename) => { + let pooling = std::fs::read_to_string(&pooling_filename).map_err(|inner| { + NewEmbedderError::open_pooling_config(pooling_filename.clone(), inner) + })?; + + let pooling: PoolingConfig = + serde_json::from_str(&pooling).map_err(|inner| { + NewEmbedderError::deserialize_pooling_config( + options.model.clone(), + pooling_filename, + inner, + ) + })?; + pooling.into() + } + None => Pooling::default(), + }; + + (config, tokenizer, weights, source, pooling) }; let config = std::fs::read_to_string(&config_filename) @@ -122,6 +195,8 @@ impl Embedder { }, }; + tracing::debug!(model = options.model, weight=?weight_source, pooling=?pooling, "model config"); + let model = BertModel::load(vb, &config).map_err(NewEmbedderError::load_model)?; if let Some(pp) = tokenizer.get_padding_mut() { @@ -134,7 +209,7 @@ impl Embedder { tokenizer.with_padding(Some(pp)); } - let mut this = Self { model, tokenizer, options, dimensions: 0 }; + let mut this = Self { model, tokenizer, options, dimensions: 0, pooling }; let embeddings = this .embed(vec!["test".into()]) @@ -168,17 +243,53 @@ impl Embedder { .forward(&token_ids, &token_type_ids, None) .map_err(EmbedError::model_forward)?; - // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding) - let (_n_sentence, n_tokens, _hidden_size) = - embeddings.dims3().map_err(EmbedError::tensor_shape)?; - - let embeddings = (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64)) - .map_err(EmbedError::tensor_shape)?; + let embeddings = Self::pooling(embeddings, self.pooling)?; let embeddings: Vec = embeddings.to_vec2().map_err(EmbedError::tensor_shape)?; Ok(embeddings) } + fn pooling(embeddings: Tensor, pooling: Pooling) -> Result { + match pooling { + Pooling::Mean => Self::mean_pooling(embeddings), + Pooling::Cls => Self::cls_pooling(embeddings), + Pooling::Max => Self::max_pooling(embeddings), + Pooling::MeanSqrtLen => Self::mean_sqrt_pooling(embeddings), + Pooling::LastToken => Self::last_token_pooling(embeddings), + } + } + + fn cls_pooling(embeddings: Tensor) -> Result { + embeddings.get_on_dim(1, 0).map_err(EmbedError::tensor_value) + } + + fn mean_sqrt_pooling(embeddings: Tensor) -> Result { + let (_n_sentence, n_tokens, _hidden_size) = + embeddings.dims3().map_err(EmbedError::tensor_shape)?; + + (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64).sqrt()) + .map_err(EmbedError::tensor_shape) + } + + fn mean_pooling(embeddings: Tensor) -> Result { + let (_n_sentence, n_tokens, _hidden_size) = + embeddings.dims3().map_err(EmbedError::tensor_shape)?; + + (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64)) + .map_err(EmbedError::tensor_shape) + } + + fn max_pooling(embeddings: Tensor) -> Result { + embeddings.max(1).map_err(EmbedError::tensor_shape) + } + + fn last_token_pooling(embeddings: Tensor) -> Result { + let (_n_sentence, n_tokens, _hidden_size) = + embeddings.dims3().map_err(EmbedError::tensor_shape)?; + + embeddings.get_on_dim(1, n_tokens - 1).map_err(EmbedError::tensor_value) + } + pub fn embed_one(&self, text: &str) -> std::result::Result { let tokens = self.tokenizer.encode(text, true).map_err(EmbedError::tokenize)?; let token_ids = tokens.get_ids(); @@ -192,11 +303,8 @@ impl Embedder { .forward(&token_ids, &token_type_ids, None) .map_err(EmbedError::model_forward)?; - // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding) - let (_n_sentence, n_tokens, _hidden_size) = - embeddings.dims3().map_err(EmbedError::tensor_shape)?; - let embedding = (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64)) - .map_err(EmbedError::tensor_shape)?; + let embedding = Self::pooling(embeddings, self.pooling)?; + let embedding = embedding.squeeze(0).map_err(EmbedError::tensor_shape)?; let embedding: Embedding = embedding.to_vec1().map_err(EmbedError::tensor_shape)?; Ok(embedding) From 7b4ce468a6dce934fe0900d422eef9cd12428706 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 18 Feb 2025 17:12:23 +0100 Subject: [PATCH 2/6] Allow overriding pooling method --- .../milli/src/update/index_documents/mod.rs | 1 + crates/milli/src/update/settings.rs | 9 +++++ crates/milli/src/vector/hf.rs | 38 ++++++++++++++++++- crates/milli/src/vector/settings.rs | 31 +++++++++++++++ 4 files changed, 78 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 56c26ed29..d62128eaa 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -2763,6 +2763,7 @@ mod tests { source: Setting::Set(crate::vector::settings::EmbedderSource::UserProvided), model: Setting::NotSet, revision: Setting::NotSet, + pooling: Setting::NotSet, api_key: Setting::NotSet, dimensions: Setting::Set(3), document_template: Setting::NotSet, diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 85259c2d0..0d0648fc8 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1676,6 +1676,7 @@ fn validate_prompt( source, model, revision, + pooling, api_key, dimensions, document_template: Setting::Set(template), @@ -1709,6 +1710,7 @@ fn validate_prompt( source, model, revision, + pooling, api_key, dimensions, document_template: Setting::Set(template), @@ -1735,6 +1737,7 @@ pub fn validate_embedding_settings( source, model, revision, + pooling, api_key, dimensions, document_template, @@ -1776,6 +1779,7 @@ pub fn validate_embedding_settings( source, model, revision, + pooling, api_key, dimensions, document_template, @@ -1791,6 +1795,7 @@ pub fn validate_embedding_settings( match inferred_source { EmbedderSource::OpenAi => { check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; + check_unset(&pooling, EmbeddingSettings::POOLING, inferred_source, name)?; check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; @@ -1829,6 +1834,7 @@ pub fn validate_embedding_settings( EmbedderSource::Ollama => { check_set(&model, EmbeddingSettings::MODEL, inferred_source, name)?; check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; + check_unset(&pooling, EmbeddingSettings::POOLING, inferred_source, name)?; check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; @@ -1846,6 +1852,7 @@ pub fn validate_embedding_settings( EmbedderSource::UserProvided => { check_unset(&model, EmbeddingSettings::MODEL, inferred_source, name)?; check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; + check_unset(&pooling, EmbeddingSettings::POOLING, inferred_source, name)?; check_unset(&api_key, EmbeddingSettings::API_KEY, inferred_source, name)?; check_unset( &document_template, @@ -1869,6 +1876,7 @@ pub fn validate_embedding_settings( EmbedderSource::Rest => { check_unset(&model, EmbeddingSettings::MODEL, inferred_source, name)?; check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; + check_unset(&pooling, EmbeddingSettings::POOLING, inferred_source, name)?; check_set(&url, EmbeddingSettings::URL, inferred_source, name)?; check_set(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; check_set(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; @@ -1878,6 +1886,7 @@ pub fn validate_embedding_settings( source, model, revision, + pooling, api_key, dimensions, document_template, diff --git a/crates/milli/src/vector/hf.rs b/crates/milli/src/vector/hf.rs index 9ec34daef..b01a66255 100644 --- a/crates/milli/src/vector/hf.rs +++ b/crates/milli/src/vector/hf.rs @@ -34,6 +34,30 @@ pub struct EmbedderOptions { pub model: String, pub revision: Option, pub distribution: Option, + #[serde(default)] + pub pooling: OverridePooling, +} + +#[derive( + Debug, + Clone, + Copy, + Default, + Hash, + PartialEq, + Eq, + serde::Deserialize, + serde::Serialize, + utoipa::ToSchema, + deserr::Deserr, +)] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +pub enum OverridePooling { + UseModel, + ForceCls, + #[default] + ForceMean, } impl EmbedderOptions { @@ -42,6 +66,7 @@ impl EmbedderOptions { model: "BAAI/bge-base-en-v1.5".to_string(), revision: Some("617ca489d9e86b49b8167676d8220688b99db36e".into()), distribution: None, + pooling: OverridePooling::UseModel, } } } @@ -95,6 +120,15 @@ pub enum Pooling { MeanSqrtLen, LastToken, } +impl Pooling { + fn override_with(&mut self, pooling: OverridePooling) { + match pooling { + OverridePooling::UseModel => {} + OverridePooling::ForceCls => *self = Pooling::Cls, + OverridePooling::ForceMean => *self = Pooling::Mean, + } + } +} impl From for Pooling { fn from(value: PoolingConfig) -> Self { @@ -151,7 +185,7 @@ impl Embedder { } Err(error) => return Err(NewEmbedderError::api_get(error)), }; - let pooling: Pooling = match pooling { + let mut pooling: Pooling = match pooling { Some(pooling_filename) => { let pooling = std::fs::read_to_string(&pooling_filename).map_err(|inner| { NewEmbedderError::open_pooling_config(pooling_filename.clone(), inner) @@ -170,6 +204,8 @@ impl Embedder { None => Pooling::default(), }; + pooling.override_with(options.pooling); + (config, tokenizer, weights, source, pooling) }; diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 86028c1c4..f10407e42 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -6,6 +6,7 @@ use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; +use super::hf::OverridePooling; use super::{ollama, openai, DistributionShift}; use crate::prompt::{default_max_bytes, PromptData}; use crate::update::Setting; @@ -30,6 +31,10 @@ pub struct EmbeddingSettings { pub revision: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + pub pooling: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] #[schema(value_type = Option)] pub api_key: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] @@ -164,6 +169,7 @@ impl SettingsDiff { mut source, mut model, mut revision, + mut pooling, mut api_key, mut dimensions, mut document_template, @@ -180,6 +186,7 @@ impl SettingsDiff { source: new_source, model: new_model, revision: new_revision, + pooling: new_pooling, api_key: new_api_key, dimensions: new_dimensions, document_template: new_document_template, @@ -210,6 +217,7 @@ impl SettingsDiff { &source, &mut model, &mut revision, + &mut pooling, &mut dimensions, &mut url, &mut request, @@ -225,6 +233,9 @@ impl SettingsDiff { if revision.apply(new_revision) { ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); } + if pooling.apply(new_pooling) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } if dimensions.apply(new_dimensions) { match source { // regenerate on dimensions change in OpenAI since truncation is supported @@ -290,6 +301,7 @@ impl SettingsDiff { source, model, revision, + pooling, api_key, dimensions, document_template, @@ -338,6 +350,7 @@ fn apply_default_for_source( source: &Setting, model: &mut Setting, revision: &mut Setting, + pooling: &mut Setting, dimensions: &mut Setting, url: &mut Setting, request: &mut Setting, @@ -350,6 +363,7 @@ fn apply_default_for_source( Setting::Set(EmbedderSource::HuggingFace) => { *model = Setting::Reset; *revision = Setting::Reset; + *pooling = Setting::Reset; *dimensions = Setting::NotSet; *url = Setting::NotSet; *request = Setting::NotSet; @@ -359,6 +373,7 @@ fn apply_default_for_source( Setting::Set(EmbedderSource::Ollama) => { *model = Setting::Reset; *revision = Setting::NotSet; + *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; *request = Setting::NotSet; @@ -368,6 +383,7 @@ fn apply_default_for_source( Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => { *model = Setting::Reset; *revision = Setting::NotSet; + *pooling = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::Reset; *request = Setting::NotSet; @@ -377,6 +393,7 @@ fn apply_default_for_source( Setting::Set(EmbedderSource::Rest) => { *model = Setting::NotSet; *revision = Setting::NotSet; + *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::Reset; *request = Setting::Reset; @@ -386,6 +403,7 @@ fn apply_default_for_source( Setting::Set(EmbedderSource::UserProvided) => { *model = Setting::NotSet; *revision = Setting::NotSet; + *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; *request = Setting::NotSet; @@ -419,6 +437,7 @@ impl EmbeddingSettings { pub const SOURCE: &'static str = "source"; pub const MODEL: &'static str = "model"; pub const REVISION: &'static str = "revision"; + pub const POOLING: &'static str = "pooling"; pub const API_KEY: &'static str = "apiKey"; pub const DIMENSIONS: &'static str = "dimensions"; pub const DOCUMENT_TEMPLATE: &'static str = "documentTemplate"; @@ -446,6 +465,7 @@ impl EmbeddingSettings { &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::Ollama] } Self::REVISION => &[EmbedderSource::HuggingFace], + Self::POOLING => &[EmbedderSource::HuggingFace], Self::API_KEY => { &[EmbedderSource::OpenAi, EmbedderSource::Ollama, EmbedderSource::Rest] } @@ -500,6 +520,7 @@ impl EmbeddingSettings { Self::SOURCE, Self::MODEL, Self::REVISION, + Self::POOLING, Self::DOCUMENT_TEMPLATE, Self::DOCUMENT_TEMPLATE_MAX_BYTES, Self::DISTRIBUTION, @@ -592,10 +613,12 @@ impl From for EmbeddingSettings { model, revision, distribution, + pooling, }) => Self { source: Setting::Set(EmbedderSource::HuggingFace), model: Setting::Set(model), revision: Setting::some_or_not_set(revision), + pooling: Setting::Set(pooling), api_key: Setting::NotSet, dimensions: Setting::NotSet, document_template: Setting::Set(prompt.template), @@ -617,6 +640,7 @@ impl From for EmbeddingSettings { source: Setting::Set(EmbedderSource::OpenAi), model: Setting::Set(embedding_model.name().to_owned()), revision: Setting::NotSet, + pooling: Setting::NotSet, api_key: Setting::some_or_not_set(api_key), dimensions: Setting::some_or_not_set(dimensions), document_template: Setting::Set(prompt.template), @@ -638,6 +662,7 @@ impl From for EmbeddingSettings { source: Setting::Set(EmbedderSource::Ollama), model: Setting::Set(embedding_model), revision: Setting::NotSet, + pooling: Setting::NotSet, api_key: Setting::some_or_not_set(api_key), dimensions: Setting::some_or_not_set(dimensions), document_template: Setting::Set(prompt.template), @@ -656,6 +681,7 @@ impl From for EmbeddingSettings { source: Setting::Set(EmbedderSource::UserProvided), model: Setting::NotSet, revision: Setting::NotSet, + pooling: Setting::NotSet, api_key: Setting::NotSet, dimensions: Setting::Set(dimensions), document_template: Setting::NotSet, @@ -679,6 +705,7 @@ impl From for EmbeddingSettings { source: Setting::Set(EmbedderSource::Rest), model: Setting::NotSet, revision: Setting::NotSet, + pooling: Setting::NotSet, api_key: Setting::some_or_not_set(api_key), dimensions: Setting::some_or_not_set(dimensions), document_template: Setting::Set(prompt.template), @@ -701,6 +728,7 @@ impl From for EmbeddingConfig { source, model, revision, + pooling, api_key, dimensions, document_template, @@ -764,6 +792,9 @@ impl From for EmbeddingConfig { if let Some(revision) = revision.set() { options.revision = Some(revision); } + if let Some(pooling) = pooling.set() { + options.pooling = pooling; + } options.distribution = distribution.set(); this.embedder_options = super::EmbedderOptions::HuggingFace(options); } From cd0dfa3f1b7a848a27aca902036de11d2e05278c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 18 Feb 2025 17:21:52 +0100 Subject: [PATCH 3/6] Fix test cases --- ...ler__test_embedders__import_vectors-8.snap | 4 +- .../after_registering_settings_task.snap | 3 +- .../settings_update_processed.snap | 3 +- .../Intel to kefir succeeds.snap | 3 +- .../import_vectors/Intel to kefir.snap | 3 +- .../import_vectors/adding Intel succeeds.snap | 3 +- .../import_vectors/after adding Intel.snap | 3 +- ...ter_registering_settings_task_vectors.snap | 3 +- .../settings_update_processed_vectors.snap | 3 +- .../src/scheduler/test_embedders.rs | 43 ++++++++++--------- crates/meilisearch/tests/dumps/mod.rs | 1 + 11 files changed, 33 insertions(+), 39 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap index 76828ad7a..19b5cab92 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap @@ -1,12 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs expression: simple_hf_config.embedder_options -snapshot_kind: text --- { "HuggingFace": { "model": "sentence-transformers/all-MiniLM-L6-v2", "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", - "distribution": null + "distribution": null, + "pooling": "useModel" } } diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap index 92e37550a..fb2a9de43 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap index bdd654672..f503e2a56 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap index 7de6af6b7..fcbab1a07 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap index 68872a141..d6a677999 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap index 1732eee6b..2dc23b3b4 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap index 3777a7bc8..818cdd474 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap index 33bd5c0d2..172f80633 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap index e5baae150..1635614e8 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index 5ec58bc53..b1c619441 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -404,31 +404,32 @@ fn import_vectors_first_and_embedder_later() { // even though we specified the vector for the ID 3, it shouldn't be marked // as user provided since we explicitely marked it as NOT user provided. snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "my_doggo_embedder", - config: EmbeddingConfig { - embedder_options: HuggingFace( - EmbedderOptions { - model: "sentence-transformers/all-MiniLM-L6-v2", - revision: Some( - "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", - ), - distribution: None, - }, - ), - prompt: PromptData { - template: "{{doc.doggo}}", - max_bytes: Some( - 400, + [ + IndexEmbeddingConfig { + name: "my_doggo_embedder", + config: EmbeddingConfig { + embedder_options: HuggingFace( + EmbedderOptions { + model: "sentence-transformers/all-MiniLM-L6-v2", + revision: Some( + "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", ), + distribution: None, + pooling: UseModel, }, - quantized: None, + ), + prompt: PromptData { + template: "{{doc.doggo}}", + max_bytes: Some( + 400, + ), }, - user_provided: RoaringBitmap<[1, 2]>, + quantized: None, }, - ] - "###); + user_provided: RoaringBitmap<[1, 2]>, + }, + ] + "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); let embedding = &embeddings["my_doggo_embedder"]; diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index 1b07afdfd..6912a5b48 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2380,6 +2380,7 @@ async fn generate_and_import_dump_containing_vectors() { "source": "huggingFace", "model": "sentence-transformers/all-MiniLM-L6-v2", "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "pooling": "useModel", "documentTemplate": "{{doc.doggo}}", "documentTemplateMaxBytes": 400 } From 1005a60fb855ec18747601dd73fa3ffffde070ab Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 19 Feb 2025 11:03:48 +0100 Subject: [PATCH 4/6] Fixup dump settings --- crates/dump/src/reader/v6/mod.rs | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 9e0d07c78..3f469a38d 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -3,6 +3,7 @@ use std::io::{BufRead, BufReader, ErrorKind}; use std::path::Path; pub use meilisearch_types::milli; +use meilisearch_types::milli::vector::hf::OverridePooling; use tempfile::TempDir; use time::OffsetDateTime; use tracing::debug; @@ -252,7 +253,29 @@ impl V6IndexReader { } pub fn settings(&mut self) -> Result> { - let settings: Settings = serde_json::from_reader(&mut self.settings)?; + let mut settings: Settings = serde_json::from_reader(&mut self.settings)?; + patch_embedders(&mut settings); Ok(settings.check()) } } + +fn patch_embedders(settings: &mut Settings) { + if let Setting::Set(embedders) = &mut settings.embedders { + for (_, settings) in embedders { + let Setting::Set(settings) = &mut settings.inner else { + continue; + }; + if settings.source != Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace) + { + continue; + } + settings.pooling = match settings.pooling { + Setting::Set(pooling) => Setting::Set(pooling), + // if the pooling for a hugging face embedder is not set, force it to `forceMean` + // for backward compatibility with v1.13 + // dumps created in v1.14 and up will have the setting set for hugging face embedders + Setting::Reset | Setting::NotSet => Setting::Set(OverridePooling::ForceMean), + }; + } + } +} From b367c71ad27cdbc3f8bd54c6ffd97722657838d9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 19 Feb 2025 11:31:17 +0100 Subject: [PATCH 5/6] fixup test --- .../dump__reader__test__import_dump_v6_with_vectors-5.snap | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap b/crates/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap index 77694a629..3c770a6eb 100644 --- a/crates/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap +++ b/crates/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap @@ -1,5 +1,5 @@ --- -source: dump/src/reader/mod.rs +source: crates/dump/src/reader/mod.rs expression: vector_index.settings().unwrap() --- { @@ -49,6 +49,7 @@ expression: vector_index.settings().unwrap() "source": "huggingFace", "model": "BAAI/bge-base-en-v1.5", "revision": "617ca489d9e86b49b8167676d8220688b99db36e", + "pooling": "forceMean", "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}" } }, From 589bf30ec6165800ce17e164599a4c89313a4f77 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 19 Feb 2025 11:38:07 +0100 Subject: [PATCH 6/6] make clippy happy --- crates/dump/src/reader/v6/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 3f469a38d..d9ceec114 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -261,7 +261,7 @@ impl V6IndexReader { fn patch_embedders(settings: &mut Settings) { if let Setting::Set(embedders) = &mut settings.embedders { - for (_, settings) in embedders { + for settings in embedders.values_mut() { let Setting::Set(settings) = &mut settings.inner else { continue; };