Merge pull request #5668 from meilisearch/fix-must-regenerate

Various fixes to embedding regeneration
This commit is contained in:
Clément Renault 2025-06-12 14:48:38 +00:00 committed by GitHub
commit e2b549c5ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -111,6 +111,8 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
let prompt = chunks.prompt(); let prompt = chunks.prompt();
let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap(); let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap();
// case where we have a `_vectors` field in the updated document
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
new_vectors.vectors_for_key(embedder_name).transpose() new_vectors.vectors_for_key(embedder_name).transpose()
}) { }) {
@ -130,18 +132,9 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
error: error.to_string(), error: error.to_string(),
})?, })?,
)?; )?;
// regenerate if the new `_vectors` fields is set to.
} else if new_vectors.regenerate { } else if new_vectors.regenerate {
let new_rendered = prompt.render_document( let new_rendered = prompt.render_document(
update.external_document_id(),
update.current(
&context.rtxn,
context.index,
context.db_fields_ids_map,
)?,
context.new_fields_ids_map,
&context.doc_alloc,
)?;
let old_rendered = prompt.render_document(
update.external_document_id(), update.external_document_id(),
update.merged( update.merged(
&context.rtxn, &context.rtxn,
@ -151,7 +144,31 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
context.new_fields_ids_map, context.new_fields_ids_map,
&context.doc_alloc, &context.doc_alloc,
)?; )?;
if new_rendered != old_rendered { let must_regenerate = if !old_vectors.regenerate {
// we just enabled `regenerate`
true
} else {
let old_rendered = prompt.render_document(
update.external_document_id(),
update.current(
&context.rtxn,
context.index,
context.db_fields_ids_map,
)?,
context.new_fields_ids_map,
&context.doc_alloc,
);
if let Ok(old_rendered) = old_rendered {
// must regenerate if the rendered changed
new_rendered != old_rendered
} else {
// cannot check previous rendered, better regenerate
true
}
};
if must_regenerate {
chunks.set_autogenerated( chunks.set_autogenerated(
update.docid(), update.docid(),
update.external_document_id(), update.external_document_id(),
@ -160,17 +177,8 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
)?; )?;
} }
} }
// no `_vectors` field, so only regenerate if the document is already set to in the DB.
} else if old_vectors.regenerate { } else if old_vectors.regenerate {
let old_rendered = prompt.render_document(
update.external_document_id(),
update.current(
&context.rtxn,
context.index,
context.db_fields_ids_map,
)?,
context.new_fields_ids_map,
&context.doc_alloc,
)?;
let new_rendered = prompt.render_document( let new_rendered = prompt.render_document(
update.external_document_id(), update.external_document_id(),
update.merged( update.merged(
@ -181,7 +189,28 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> {
context.new_fields_ids_map, context.new_fields_ids_map,
&context.doc_alloc, &context.doc_alloc,
)?; )?;
if new_rendered != old_rendered {
let must_regenerate = {
let old_rendered = prompt.render_document(
update.external_document_id(),
update.current(
&context.rtxn,
context.index,
context.db_fields_ids_map,
)?,
context.new_fields_ids_map,
&context.doc_alloc,
);
if let Ok(old_rendered) = old_rendered {
// regenerate if the rendered version changed
new_rendered != old_rendered
} else {
// if we cannot render the previous version of the documents, let's regenerate
true
}
};
if must_regenerate {
chunks.set_autogenerated( chunks.set_autogenerated(
update.docid(), update.docid(),
update.external_document_id(), update.external_document_id(),