From a3254d7d7d9085aa8c83929cec1f64fb88c1e686 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 7 Jul 2025 11:57:08 +0200 Subject: [PATCH] Implement dumpless upgrade from v1.15 to v1.16 --- crates/milli/src/update/upgrade/mod.rs | 4 ++ crates/milli/src/update/upgrade/v1_15.rs | 13 +++++++ crates/milli/src/update/upgrade/v1_16.rs | 48 ++++++++++++++++++++++++ crates/milli/src/vector/db.rs | 7 ++++ 4 files changed, 72 insertions(+) create mode 100644 crates/milli/src/update/upgrade/v1_16.rs diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index c23e5c8b1..f53319a37 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -2,6 +2,7 @@ mod v1_12; mod v1_13; mod v1_14; mod v1_15; +mod v1_16; use heed::RwTxn; use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13}; @@ -10,6 +11,7 @@ use v1_15::Latest_V1_14_To_Latest_V1_15; use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; use crate::progress::{Progress, VariableNameStep}; +use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0; use crate::{Index, InternalError, Result}; trait UpgradeIndex { @@ -31,6 +33,7 @@ const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[ &V1_13_1_To_Latest_V1_13 {}, &Latest_V1_13_To_Latest_V1_14 {}, &Latest_V1_14_To_Latest_V1_15 {}, + &Latest_V1_15_To_V1_16_0 {}, // This is the last upgrade function, it will be called when the index is up to date. // any other upgrade function should be added before this one. &ToCurrentNoOp {}, @@ -58,6 +61,7 @@ const fn start(from: (u32, u32, u32)) -> Option { (1, 14, _) => function_index!(5), // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. (1, 15, _) => function_index!(6), + (1, 16, _) => function_index!(7), // We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually // considering dumpless upgrade. (_major, _minor, _patch) => return None, diff --git a/crates/milli/src/update/upgrade/v1_15.rs b/crates/milli/src/update/upgrade/v1_15.rs index cea4783a1..9ca25d06b 100644 --- a/crates/milli/src/update/upgrade/v1_15.rs +++ b/crates/milli/src/update/upgrade/v1_15.rs @@ -1,4 +1,6 @@ use heed::RwTxn; +use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; use super::UpgradeIndex; use crate::progress::Progress; @@ -26,3 +28,14 @@ impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 { (1, 15, 0) } } + +/// Parts of v1.15 `IndexingEmbeddingConfig` that are relevant for upgrade to v1.16 +/// +/// # Warning +/// +/// This object should not be rewritten to the DB, only read to get the name and `user_provided` roaring. +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub user_provided: RoaringBitmap, +} diff --git a/crates/milli/src/update/upgrade/v1_16.rs b/crates/milli/src/update/upgrade/v1_16.rs new file mode 100644 index 000000000..f43efd77d --- /dev/null +++ b/crates/milli/src/update/upgrade/v1_16.rs @@ -0,0 +1,48 @@ +use heed::types::{SerdeJson, Str}; +use heed::RwTxn; + +use super::UpgradeIndex; +use crate::progress::Progress; +use crate::vector::db::{EmbedderInfo, EmbeddingStatus}; +use crate::{Index, InternalError, Result}; + +#[allow(non_camel_case_types)] +pub(super) struct Latest_V1_15_To_V1_16_0(); + +impl UpgradeIndex for Latest_V1_15_To_V1_16_0 { + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + let v1_15_indexing_configs = index + .main + .remap_types::>>() + .get(wtxn, crate::index::main_key::EMBEDDING_CONFIGS)? + .unwrap_or_default(); + + let embedders = index.embedding_configs(); + for config in v1_15_indexing_configs { + let embedder_id = embedders.embedder_id(wtxn, &config.name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + let info = EmbedderInfo { + embedder_id, + // v1.15 used not to make a difference between `user_provided` and `! regenerate`. + embedding_status: EmbeddingStatus::from_user_provided(config.user_provided), + }; + embedders.put_embedder_info(wtxn, &config.name, &info)?; + } + + Ok(false) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 16, 0) + } +} diff --git a/crates/milli/src/vector/db.rs b/crates/milli/src/vector/db.rs index 0e890fac9..2fea75d68 100644 --- a/crates/milli/src/vector/db.rs +++ b/crates/milli/src/vector/db.rs @@ -117,6 +117,13 @@ impl EmbeddingStatus { Default::default() } + /// Create a new `EmbeddingStatus` that assumes that any `user_provided` docid is also skipping regenerate. + /// + /// Used for migration from v1.15 and earlier DBs. + pub(crate) fn from_user_provided(user_provided: RoaringBitmap) -> Self { + Self { user_provided, skip_regenerate_different_from_user_provided: Default::default() } + } + /// Whether the document contains user-provided vectors for that embedder. pub fn is_user_provided(&self, docid: DocumentId) -> bool { self.user_provided.contains(docid)