mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 05:54:30 +01:00
user_provided => regenerate
This commit is contained in:
parent
a89eea233b
commit
3bc8f81abc
@ -958,10 +958,10 @@ impl IndexScheduler {
|
|||||||
.is_some_and(|conf| conf.user_provided.contains(id));
|
.is_some_and(|conf| conf.user_provided.contains(id));
|
||||||
|
|
||||||
let embeddings = ExplicitVectors {
|
let embeddings = ExplicitVectors {
|
||||||
embeddings: VectorOrArrayOfVectors::from_array_of_vectors(
|
embeddings: Some(
|
||||||
embeddings,
|
VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
|
||||||
),
|
),
|
||||||
user_provided,
|
regenerate: !user_provided,
|
||||||
};
|
};
|
||||||
vectors.insert(
|
vectors.insert(
|
||||||
embedder_name,
|
embedder_name,
|
||||||
|
@ -625,7 +625,10 @@ fn some_documents<'a, 't: 'a>(
|
|||||||
.iter()
|
.iter()
|
||||||
.find(|conf| conf.name == name)
|
.find(|conf| conf.name == name)
|
||||||
.is_some_and(|conf| conf.user_provided.contains(key));
|
.is_some_and(|conf| conf.user_provided.contains(key));
|
||||||
let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided };
|
let embeddings = ExplicitVectors {
|
||||||
|
embeddings: Some(vector.into()),
|
||||||
|
regenerate: !user_provided,
|
||||||
|
};
|
||||||
vectors.insert(
|
vectors.insert(
|
||||||
name,
|
name,
|
||||||
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
|
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
|
||||||
|
@ -1072,7 +1072,8 @@ fn make_hits(
|
|||||||
.iter()
|
.iter()
|
||||||
.find(|conf| conf.name == name)
|
.find(|conf| conf.name == name)
|
||||||
.is_some_and(|conf| conf.user_provided.contains(id));
|
.is_some_and(|conf| conf.user_provided.contains(id));
|
||||||
let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided };
|
let embeddings =
|
||||||
|
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
|
||||||
vectors.insert(name, serde_json::to_value(embeddings)?);
|
vectors.insert(name, serde_json::to_value(embeddings)?);
|
||||||
}
|
}
|
||||||
document.insert("_vectors".into(), vectors.into());
|
document.insert("_vectors".into(), vectors.into());
|
||||||
|
@ -260,28 +260,33 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
|
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
|
||||||
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
|
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
|
||||||
VectorState::Inline(vectors) => {
|
VectorState::Inline(vectors) => {
|
||||||
if vectors.is_user_provided() {
|
if !vectors.must_regenerate() {
|
||||||
add_to_user_provided.insert(docid);
|
add_to_user_provided.insert(docid);
|
||||||
}
|
}
|
||||||
let add_vectors = vectors.into_array_of_vectors();
|
|
||||||
|
|
||||||
if add_vectors.len() > usize::from(u8::MAX) {
|
match vectors.into_array_of_vectors() {
|
||||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
Some(add_vectors) => {
|
||||||
document_id().to_string(),
|
if add_vectors.len() > usize::from(u8::MAX) {
|
||||||
add_vectors.len(),
|
return Err(crate::Error::UserError(
|
||||||
)));
|
crate::UserError::TooManyVectors(
|
||||||
|
document_id().to_string(),
|
||||||
|
add_vectors.len(),
|
||||||
|
),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
VectorStateDelta::NowManual(add_vectors)
|
||||||
|
}
|
||||||
|
None => VectorStateDelta::NoChange,
|
||||||
}
|
}
|
||||||
|
|
||||||
VectorStateDelta::NowManual(add_vectors)
|
|
||||||
}
|
}
|
||||||
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
||||||
VectorState::InDb => VectorStateDelta::NoChange,
|
VectorState::Manual => VectorStateDelta::NoChange,
|
||||||
// generated vectors must be regenerated
|
// generated vectors must be regenerated
|
||||||
VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?,
|
VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?,
|
||||||
},
|
},
|
||||||
// prompt regeneration is only triggered for existing embedders
|
// prompt regeneration is only triggered for existing embedders
|
||||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
||||||
if !old.is_user_provided() {
|
if old.must_regenerate() {
|
||||||
regenerate_if_prompt_changed(
|
regenerate_if_prompt_changed(
|
||||||
obkv,
|
obkv,
|
||||||
(old_prompt, prompt),
|
(old_prompt, prompt),
|
||||||
@ -362,31 +367,32 @@ fn extract_vector_document_diff(
|
|||||||
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
|
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
|
||||||
document_id: impl Fn() -> Value,
|
document_id: impl Fn() -> Value,
|
||||||
) -> Result<VectorStateDelta> {
|
) -> Result<VectorStateDelta> {
|
||||||
match (old.is_user_provided(), new.is_user_provided()) {
|
match (old.must_regenerate(), new.must_regenerate()) {
|
||||||
(true, true) | (false, false) => {}
|
(true, true) | (false, false) => {}
|
||||||
(true, false) => {
|
(true, false) => {
|
||||||
remove_from_user_provided.insert(docid);
|
add_to_user_provided.insert(docid);
|
||||||
}
|
}
|
||||||
(false, true) => {
|
(false, true) => {
|
||||||
add_to_user_provided.insert(docid);
|
remove_from_user_provided.insert(docid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let delta = match (old, new) {
|
let delta = match (old, new) {
|
||||||
// regardless of the previous state, if a document now contains inline _vectors, they must
|
// regardless of the previous state, if a document now contains inline _vectors, they must
|
||||||
// be extracted manually
|
// be extracted manually
|
||||||
(_old, VectorState::Inline(new)) => {
|
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
|
||||||
let add_vectors = new.into_array_of_vectors();
|
Some(add_vectors) => {
|
||||||
|
if add_vectors.len() > usize::from(u8::MAX) {
|
||||||
|
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||||
|
document_id().to_string(),
|
||||||
|
add_vectors.len(),
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
if add_vectors.len() > usize::from(u8::MAX) {
|
VectorStateDelta::NowManual(add_vectors)
|
||||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
|
||||||
document_id().to_string(),
|
|
||||||
add_vectors.len(),
|
|
||||||
)));
|
|
||||||
}
|
}
|
||||||
|
None => VectorStateDelta::NoChange,
|
||||||
VectorStateDelta::NowManual(add_vectors)
|
},
|
||||||
}
|
|
||||||
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
|
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
|
||||||
// document changed
|
// document changed
|
||||||
(VectorState::Generated, VectorState::Generated) => {
|
(VectorState::Generated, VectorState::Generated) => {
|
||||||
@ -437,7 +443,7 @@ fn extract_vector_document_diff(
|
|||||||
VectorStateDelta::NowRemoved
|
VectorStateDelta::NowRemoved
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
(_old, VectorState::InDb) => {
|
(_old, VectorState::Manual) => {
|
||||||
// Do we keep this document?
|
// Do we keep this document?
|
||||||
let document_is_kept = obkv
|
let document_is_kept = obkv
|
||||||
.iter()
|
.iter()
|
||||||
|
@ -1068,8 +1068,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
Some(Ok((
|
Some(Ok((
|
||||||
name.to_string(),
|
name.to_string(),
|
||||||
serde_json::to_value(ExplicitVectors {
|
serde_json::to_value(ExplicitVectors {
|
||||||
embeddings: VectorOrArrayOfVectors::from_array_of_vectors(vectors),
|
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
|
||||||
user_provided: true,
|
vectors,
|
||||||
|
)),
|
||||||
|
regenerate: false,
|
||||||
})
|
})
|
||||||
.unwrap(),
|
.unwrap(),
|
||||||
)))
|
)))
|
||||||
|
@ -18,18 +18,20 @@ pub enum Vectors {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Vectors {
|
impl Vectors {
|
||||||
pub fn is_user_provided(&self) -> bool {
|
pub fn must_regenerate(&self) -> bool {
|
||||||
match self {
|
match self {
|
||||||
Vectors::ImplicitlyUserProvided(_) => true,
|
Vectors::ImplicitlyUserProvided(_) => false,
|
||||||
Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided,
|
Vectors::Explicit(ExplicitVectors { regenerate, .. }) => *regenerate,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn into_array_of_vectors(self) -> Vec<Embedding> {
|
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
|
||||||
match self {
|
match self {
|
||||||
Vectors::ImplicitlyUserProvided(embeddings)
|
Vectors::ImplicitlyUserProvided(embeddings) => {
|
||||||
| Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => {
|
Some(embeddings.into_array_of_vectors().unwrap_or_default())
|
||||||
embeddings.into_array_of_vectors().unwrap_or_default()
|
}
|
||||||
|
Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => {
|
||||||
|
embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -38,22 +40,22 @@ impl Vectors {
|
|||||||
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
#[derive(serde::Serialize, serde::Deserialize, Debug)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct ExplicitVectors {
|
pub struct ExplicitVectors {
|
||||||
pub embeddings: VectorOrArrayOfVectors,
|
pub embeddings: Option<VectorOrArrayOfVectors>,
|
||||||
pub user_provided: bool,
|
pub regenerate: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub enum VectorState {
|
pub enum VectorState {
|
||||||
Inline(Vectors),
|
Inline(Vectors),
|
||||||
InDb,
|
Manual,
|
||||||
Generated,
|
Generated,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl VectorState {
|
impl VectorState {
|
||||||
pub fn is_user_provided(&self) -> bool {
|
pub fn must_regenerate(&self) -> bool {
|
||||||
match self {
|
match self {
|
||||||
VectorState::Inline(vectors) => vectors.is_user_provided(),
|
VectorState::Inline(vectors) => vectors.must_regenerate(),
|
||||||
VectorState::InDb => true,
|
VectorState::Manual => false,
|
||||||
VectorState::Generated => false,
|
VectorState::Generated => true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -96,7 +98,7 @@ impl ParsedVectorsDiff {
|
|||||||
.flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
|
.flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
|
||||||
for embedding_config in embedders_configs {
|
for embedding_config in embedders_configs {
|
||||||
if embedding_config.user_provided.contains(docid) {
|
if embedding_config.user_provided.contains(docid) {
|
||||||
old.entry(embedding_config.name.to_string()).or_insert(VectorState::InDb);
|
old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -121,7 +123,7 @@ impl ParsedVectorsDiff {
|
|||||||
let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated);
|
let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated);
|
||||||
let state_from_old = match old {
|
let state_from_old = match old {
|
||||||
// assume a userProvided is still userProvided
|
// assume a userProvided is still userProvided
|
||||||
VectorState::InDb => VectorState::InDb,
|
VectorState::Manual => VectorState::Manual,
|
||||||
// generated is still generated
|
// generated is still generated
|
||||||
VectorState::Generated => VectorState::Generated,
|
VectorState::Generated => VectorState::Generated,
|
||||||
// weird case that shouldn't happen were the previous docs version is inline,
|
// weird case that shouldn't happen were the previous docs version is inline,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user