mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Small commit to add hybrid search and autoembedding
This commit is contained in:
parent
21bcf32109
commit
13c2c6c16b
42 changed files with 4045 additions and 246 deletions
229	milli/src/vector/error.rs	Normal file
@@ -0,0 +1,229 @@
use std::path::PathBuf;

use hf_hub::api::sync::ApiError;

use crate::error::FaultSource;
use crate::vector::openai::OpenAiError;

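/// Error that can occur while generating embeddings.
///
/// The kind is boxed so that `Error` stays one pointer wide, keeping
/// `Result<T, Error>` cheap to move around.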
#[derive(Debug, thiserror::Error)]
#[error("Error while generating embeddings: {inner}")]
pub struct Error {
    pub inner: Box<ErrorKind>,
}

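// Blanket conversion: anything convertible into `ErrorKind` (here, the
// `#[from]` variants `NewEmbedderError` and `EmbedError`) can become an
// `Error` via `?` or `.into()`. A minimal usage sketch, assuming an
// `EmbedError` value `embed_err` at hand:
//
//     let err: Error = embed_err.into();
//     assert!(matches!(&*err.inner, ErrorKind::EmbedError(_)));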
impl<I: Into<ErrorKind>> From<I> for Error {
    fn from(value: I) -> Self {
        Self { inner: Box::new(value.into()) }
    }
}

impl Error {
    pub fn fault(&self) -> FaultSource {
        match &*self.inner {
            ErrorKind::NewEmbedderError(inner) => inner.fault,
            ErrorKind::EmbedError(inner) => inner.fault,
        }
    }
}

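/// The two phases that can fail: building an embedder, and running it.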
#[derive(Debug, thiserror::Error)]
pub enum ErrorKind {
    #[error(transparent)]
    NewEmbedderError(#[from] NewEmbedderError),
    #[error(transparent)]
    EmbedError(#[from] EmbedError),
}

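/// Error produced while embedding documents or queries with an
/// already-built embedder, together with the party at fault.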
#[derive(Debug, thiserror::Error)]
#[error("{fault}: {kind}")]
pub struct EmbedError {
    pub kind: EmbedErrorKind,
    pub fault: FaultSource,
}

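/// What went wrong during embedding. The `candle_core` variants arise when
/// running a local model; the `reqwest`/`OpenAiError` variants arise when
/// talking to the remote OpenAI API.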
#[derive(Debug, thiserror::Error)]
pub enum EmbedErrorKind {
    #[error("could not tokenize: {0}")]
    Tokenize(Box<dyn std::error::Error + Send + Sync>),
    #[error("unexpected tensor shape: {0}")]
    TensorShape(candle_core::Error),
    #[error("unexpected tensor value: {0}")]
    TensorValue(candle_core::Error),
    #[error("could not run model: {0}")]
    ModelForward(candle_core::Error),
    #[error("could not reach OpenAI: {0}")]
    OpenAiNetwork(reqwest::Error),
    #[error("unexpected response from OpenAI: {0}")]
    OpenAiUnexpected(reqwest::Error),
    #[error("could not authenticate against OpenAI: {0}")]
    OpenAiAuth(OpenAiError),
    #[error("sent too many requests to OpenAI: {0}")]
    OpenAiTooManyRequests(OpenAiError),
    #[error("received internal error from OpenAI: {0}")]
    OpenAiInternalServerError(OpenAiError),
    #[error("sent too many tokens in a request to OpenAI: {0}")]
    OpenAiTooManyTokens(OpenAiError),
    #[error("received unhandled HTTP status code {0} from OpenAI")]
    OpenAiUnhandledStatusCode(u16),
}

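// Each constructor pairs an error kind with the `FaultSource` considered
// responsible for it, so call sites never have to pick a fault by hand.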
impl EmbedError {
    pub fn tokenize(inner: Box<dyn std::error::Error + Send + Sync>) -> Self {
        Self { kind: EmbedErrorKind::Tokenize(inner), fault: FaultSource::Runtime }
    }

    pub fn tensor_shape(inner: candle_core::Error) -> Self {
        Self { kind: EmbedErrorKind::TensorShape(inner), fault: FaultSource::Bug }
    }

    pub fn tensor_value(inner: candle_core::Error) -> Self {
        Self { kind: EmbedErrorKind::TensorValue(inner), fault: FaultSource::Bug }
    }

    pub fn model_forward(inner: candle_core::Error) -> Self {
        Self { kind: EmbedErrorKind::ModelForward(inner), fault: FaultSource::Runtime }
    }

    pub fn openai_network(inner: reqwest::Error) -> Self {
        Self { kind: EmbedErrorKind::OpenAiNetwork(inner), fault: FaultSource::Runtime }
    }

    pub fn openai_unexpected(inner: reqwest::Error) -> Self {
        Self { kind: EmbedErrorKind::OpenAiUnexpected(inner), fault: FaultSource::Bug }
    }

    pub(crate) fn openai_auth_error(inner: OpenAiError) -> Self {
        Self { kind: EmbedErrorKind::OpenAiAuth(inner), fault: FaultSource::User }
    }

    pub(crate) fn openai_too_many_requests(inner: OpenAiError) -> Self {
        Self { kind: EmbedErrorKind::OpenAiTooManyRequests(inner), fault: FaultSource::Runtime }
    }

    pub(crate) fn openai_internal_server_error(inner: OpenAiError) -> Self {
        Self { kind: EmbedErrorKind::OpenAiInternalServerError(inner), fault: FaultSource::Runtime }
    }

    pub(crate) fn openai_too_many_tokens(inner: OpenAiError) -> Self {
        Self { kind: EmbedErrorKind::OpenAiTooManyTokens(inner), fault: FaultSource::Bug }
    }

    pub(crate) fn openai_unhandled_status_code(code: u16) -> Self {
        Self { kind: EmbedErrorKind::OpenAiUnhandledStatusCode(code), fault: FaultSource::Bug }
    }
}

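/// Error produced while building an embedder: loading configs, tokenizers,
/// and model weights, or initializing the OpenAI web client.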
#[derive(Debug, thiserror::Error)]
#[error("{fault}: {kind}")]
pub struct NewEmbedderError {
    pub kind: NewEmbedderErrorKind,
    pub fault: FaultSource,
}

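// Same convention as `EmbedError`: one constructor per kind, each choosing
// the appropriate `FaultSource`.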
impl NewEmbedderError {
    pub fn open_config(config_filename: PathBuf, inner: std::io::Error) -> Self {
        let open_config = OpenConfig { filename: config_filename, inner };

        Self { kind: NewEmbedderErrorKind::OpenConfig(open_config), fault: FaultSource::Runtime }
    }

    pub fn deserialize_config(
        config: String,
        config_filename: PathBuf,
        inner: serde_json::Error,
    ) -> Self {
        let deserialize_config = DeserializeConfig { config, filename: config_filename, inner };
        Self {
            kind: NewEmbedderErrorKind::DeserializeConfig(deserialize_config),
            fault: FaultSource::Runtime,
        }
    }

    pub fn open_tokenizer(
        tokenizer_filename: PathBuf,
        inner: Box<dyn std::error::Error + Send + Sync>,
    ) -> Self {
        let open_tokenizer = OpenTokenizer { filename: tokenizer_filename, inner };
        Self {
            kind: NewEmbedderErrorKind::OpenTokenizer(open_tokenizer),
            fault: FaultSource::Runtime,
        }
    }

    pub fn new_api_fail(inner: ApiError) -> Self {
        Self { kind: NewEmbedderErrorKind::NewApiFail(inner), fault: FaultSource::Bug }
    }

    pub fn api_get(inner: ApiError) -> Self {
        Self { kind: NewEmbedderErrorKind::ApiGet(inner), fault: FaultSource::Undecided }
    }

    pub fn pytorch_weight(inner: candle_core::Error) -> Self {
        Self { kind: NewEmbedderErrorKind::PytorchWeight(inner), fault: FaultSource::Runtime }
    }

    pub fn safetensor_weight(inner: candle_core::Error) -> Self {
        Self { kind: NewEmbedderErrorKind::SafetensorWeight(inner), fault: FaultSource::Runtime }
    }

    pub fn load_model(inner: candle_core::Error) -> Self {
        Self { kind: NewEmbedderErrorKind::LoadModel(inner), fault: FaultSource::Runtime }
    }

    pub fn openai_initialize_web_client(inner: reqwest::Error) -> Self {
        Self { kind: NewEmbedderErrorKind::InitWebClient(inner), fault: FaultSource::Runtime }
    }

    pub fn openai_invalid_api_key_format(inner: reqwest::header::InvalidHeaderValue) -> Self {
        Self { kind: NewEmbedderErrorKind::InvalidApiKeyFormat(inner), fault: FaultSource::User }
    }
}

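// Payload structs carrying the context of Hugging Face loading failures;
// they render the full message for the `#[error(transparent)]` variants of
// `NewEmbedderErrorKind` below.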
#[derive(Debug, thiserror::Error)]
#[error("could not open config at {filename:?}: {inner}")]
pub struct OpenConfig {
    pub filename: PathBuf,
    pub inner: std::io::Error,
}

#[derive(Debug, thiserror::Error)]
#[error("could not deserialize config at {filename:?}: {inner}. Config follows:\n{config}")]
pub struct DeserializeConfig {
    pub config: String,
    pub filename: PathBuf,
    pub inner: serde_json::Error,
}

#[derive(Debug, thiserror::Error)]
#[error("could not open tokenizer at {filename:?}: {inner}")]
pub struct OpenTokenizer {
    pub filename: PathBuf,
    #[source]
    pub inner: Box<dyn std::error::Error + Send + Sync>,
}

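/// What went wrong while building an embedder, split between the Hugging
/// Face (local model) path and the OpenAI path.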
#[derive(Debug, thiserror::Error)]
pub enum NewEmbedderErrorKind {
    // hf
    #[error(transparent)]
    OpenConfig(OpenConfig),
    #[error(transparent)]
    DeserializeConfig(DeserializeConfig),
    #[error(transparent)]
    OpenTokenizer(OpenTokenizer),
    #[error("could not build weights from Pytorch weights: {0}")]
    PytorchWeight(candle_core::Error),
    #[error("could not build weights from Safetensor weights: {0}")]
    SafetensorWeight(candle_core::Error),
    #[error("could not spawn hf_hub API client: {0}")]
    NewApiFail(ApiError),
    #[error("fetching file from hf_hub failed: {0}")]
    ApiGet(ApiError),
    #[error("loading model failed: {0}")]
    LoadModel(candle_core::Error),
    // openai
    #[error("initializing web client for sending embedding requests failed: {0}")]
    InitWebClient(reqwest::Error),
    #[error("the API key passed in the Authorization header was in an invalid format: {0}")]
    InvalidApiKeyFormat(reqwest::header::InvalidHeaderValue),
}