diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 47bd622ae..43647e786 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -111,6 +111,8 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { let prompt = chunks.prompt(); let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap(); + + // case where we have a `_vectors` field in the updated document if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { new_vectors.vectors_for_key(embedder_name).transpose() }) { @@ -130,18 +132,9 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { error: error.to_string(), })?, )?; + // regenerate if the new `_vectors` fields is set to. } else if new_vectors.regenerate { let new_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - let old_rendered = prompt.render_document( update.external_document_id(), update.merged( &context.rtxn, @@ -151,7 +144,31 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { context.new_fields_ids_map, &context.doc_alloc, )?; - if new_rendered != old_rendered { + let must_regenerate = if !old_vectors.regenerate { + // we just enabled `regenerate` + true + } else { + let old_rendered = prompt.render_document( + update.external_document_id(), + update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + ); + + if let Ok(old_rendered) = old_rendered { + // must regenerate if the rendered changed + new_rendered != old_rendered + } else { + // cannot check previous rendered, better regenerate + true + } + }; + + if must_regenerate { chunks.set_autogenerated( update.docid(), update.external_document_id(), @@ -160,17 +177,8 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { )?; } } + // no `_vectors` field, so only regenerate if the document is already set to in the DB. } else if old_vectors.regenerate { - let old_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; let new_rendered = prompt.render_document( update.external_document_id(), update.merged( @@ -181,7 +189,28 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { context.new_fields_ids_map, &context.doc_alloc, )?; - if new_rendered != old_rendered { + + let must_regenerate = { + let old_rendered = prompt.render_document( + update.external_document_id(), + update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?, + context.new_fields_ids_map, + &context.doc_alloc, + ); + if let Ok(old_rendered) = old_rendered { + // regenerate if the rendered version changed + new_rendered != old_rendered + } else { + // if we cannot render the previous version of the documents, let's regenerate + true + } + }; + + if must_regenerate { chunks.set_autogenerated( update.docid(), update.external_document_id(),