4822: HuggingFace: Clearer error message when a model is not supported r=Kerollmops a=dureuill

# Pull Request

## Related issue
Context: <https://github.com/meilisearch/meilisearch/discussions/4820>

## What does this PR do?
- Improve error message when a model configuration cannot be loaded and its "architectures" field does not contain "BertModel"

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-07-23 14:09:47 +00:00 committed by GitHub
commit ecee0c922f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 53 additions and 8 deletions

View File

@ -217,16 +217,41 @@ impl NewEmbedderError {
}
/// Builds the error reported when a model's configuration could not be deserialized.
///
/// `config` is parsed a second time, as an untyped `serde_json::Value`:
/// - when that parse succeeds, the result is an `UnsupportedModel` error with
///   `FaultSource::User`, carrying `model_name`, the caller's `inner` error and every
///   string entry of the top-level `"architectures"` array (an empty list when the key
///   is missing or is not an array);
/// - when that parse fails, the result is a `DeserializeConfig` error with
///   `FaultSource::Runtime`, carrying the error of this second parse rather than `inner`.
pub fn deserialize_config(
model_name: String,
config: String,
config_filename: PathBuf,
inner: serde_json::Error,
) -> NewEmbedderError {
// NOTE(review): the binding on the next line is never read (the `Err` arm below declares
// its own `deserialize_config`), and it uses a `config` field; it looks like the removed
// side of this diff hunk rendered without a `-` marker. Confirm against the repository.
let deserialize_config = DeserializeConfig { config, filename: config_filename, inner };
match serde_json::from_str(&config) {
Ok(value) => {
// The annotation selects `serde_json::Value` as the target type of `from_str` above.
let value: serde_json::Value = value;
let architectures = match value.get("architectures") {
Some(serde_json::Value::Array(architectures)) => architectures
.iter()
// Non-string array entries are dropped silently.
.filter_map(|value| match value {
serde_json::Value::String(s) => Some(s.to_owned()),
_ => None,
})
.collect(),
_ => vec![],
};
let unsupported_model = UnsupportedModel { model_name, inner, architectures };
Self {
kind: NewEmbedderErrorKind::UnsupportedModel(unsupported_model),
fault: FaultSource::User,
}
}
Err(error) => {
let deserialize_config =
DeserializeConfig { model_name, filename: config_filename, inner: error };
Self {
kind: NewEmbedderErrorKind::DeserializeConfig(deserialize_config),
fault: FaultSource::Runtime,
}
}
}
}
pub fn open_tokenizer(
tokenizer_filename: PathBuf,
@ -252,7 +277,7 @@ impl NewEmbedderError {
}
/// Builds a `NewEmbedderError` with `FaultSource::Runtime` from a `candle_core::Error`.
pub fn safetensor_weight(inner: candle_core::Error) -> Self {
// NOTE(review): the two expressions below differ only in the variant used
// (`PytorchWeight` vs `SafetensorWeight`); they look like the removed and added lines
// of this diff hunk shown without `-`/`+` markers. Confirm which one is current.
Self { kind: NewEmbedderErrorKind::PytorchWeight(inner), fault: FaultSource::Runtime }
Self { kind: NewEmbedderErrorKind::SafetensorWeight(inner), fault: FaultSource::Runtime }
}
pub fn load_model(inner: candle_core::Error) -> Self {
@ -275,13 +300,26 @@ pub struct OpenConfig {
}
/// Error payload for a model configuration file that could not be deserialized.
// NOTE(review): two `#[error]` attributes and both a `config` and a `model_name` field are
// listed here; they look like the removed and added lines of this diff hunk shown without
// `-`/`+` markers. Confirm against the repository which set is current.
#[derive(Debug, thiserror::Error)]
#[error("could not deserialize config at {filename}: {inner}. Config follows:\n{config}")]
#[error("for model '{model_name}', could not deserialize config at {filename} as JSON: {inner}")]
pub struct DeserializeConfig {
// Interpolated as `{config}` by the first message above.
pub config: String,
// Interpolated as `{model_name}` by the second message above.
pub model_name: String,
// Interpolated as `{filename}` by both messages.
pub filename: PathBuf,
// The underlying `serde_json` error, interpolated as `{inner}`.
pub inner: serde_json::Error,
}
/// Error payload for a model that appears to be unsupported.
///
/// The message always names the model and the inner error. Its note depends on
/// `architectures`: when the list is empty it only states that `"BertModel"` is the
/// supported architecture; otherwise it also prints the declared architectures using
/// their `Debug` formatting.
#[derive(Debug, thiserror::Error)]
#[error("model `{model_name}` appears to be unsupported{}\n - inner error: {inner}",
if architectures.is_empty() {
"\n - Note: only models with architecture \"BertModel\" are supported.".to_string()
} else {
format!("\n - Note: model has declared architectures `{architectures:?}`, only models with architecture `\"BertModel\"` are supported.")
})]
pub struct UnsupportedModel {
// Interpolated as `{model_name}` in the message.
pub model_name: String,
// The underlying `serde_json` error, interpolated as `{inner}`.
pub inner: serde_json::Error,
// Architecture names printed in the note; an empty list selects the shorter note.
pub architectures: Vec<String>,
}
#[derive(Debug, thiserror::Error)]
#[error("could not open tokenizer at {filename}: {inner}")]
pub struct OpenTokenizer {
@ -298,6 +336,8 @@ pub enum NewEmbedderErrorKind {
#[error(transparent)]
DeserializeConfig(DeserializeConfig),
#[error(transparent)]
UnsupportedModel(UnsupportedModel),
#[error(transparent)]
OpenTokenizer(OpenTokenizer),
#[error("could not build weights from Pytorch weights: {0}")]
PytorchWeight(candle_core::Error),

View File

@ -103,7 +103,12 @@ impl Embedder {
let config = std::fs::read_to_string(&config_filename)
.map_err(|inner| NewEmbedderError::open_config(config_filename.clone(), inner))?;
let config: Config = serde_json::from_str(&config).map_err(|inner| {
NewEmbedderError::deserialize_config(config, config_filename, inner)
NewEmbedderError::deserialize_config(
options.model.clone(),
config,
config_filename,
inner,
)
})?;
let mut tokenizer = Tokenizer::from_file(&tokenizer_filename)
.map_err(|inner| NewEmbedderError::open_tokenizer(tokenizer_filename, inner))?;