From 9eb6f522ea62e6dd06cedd8ee553b0d7101e1d1a Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 30 May 2024 11:50:30 +0200 Subject: [PATCH] wraps the index embedding config in a struct --- Cargo.lock | 4 +- index-scheduler/src/lib.rs | 77 +++++++++++-------- meilisearch-types/src/settings.rs | 3 +- milli/src/index.rs | 25 +++--- milli/src/update/index_documents/mod.rs | 4 +- .../src/update/index_documents/typed_chunk.rs | 18 +++-- milli/src/update/settings.rs | 56 ++++++++------ 7 files changed, 112 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b28a00e3..b00e94072 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5310,9 +5310,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa069bd1503dd526ee793bb3fce408895136c95fc86d2edb2acf1c646d7f0684" +checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19" dependencies = [ "actix-web", "mutually_exclusive_features", diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index c76a207f5..d007acd2c 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -53,6 +53,7 @@ use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; +use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; @@ -1459,33 +1460,39 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, - embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>, + embedding_configs: Vec, ) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| { - let prompt = - Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); - // optimistically return existing embedder - { - let embedders = self.embedders.read().unwrap(); - if let Some(embedder) = embedders.get(&embedder_options) { - return Ok((name, (embedder.clone(), prompt))); + .map( + |IndexEmbeddingConfig { + name, + config: milli::vector::EmbeddingConfig { embedder_options, prompt }, + .. + }| { + let prompt = + Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); + // optimistically return existing embedder + { + let embedders = self.embedders.read().unwrap(); + if let Some(embedder) = embedders.get(&embedder_options) { + return Ok((name, (embedder.clone(), prompt))); + } } - } - // add missing embedder - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from)?, - ); - { - let mut embedders = self.embedders.write().unwrap(); - embedders.insert(embedder_options, embedder.clone()); - } - Ok((name, (embedder, prompt))) - }) + // add missing embedder + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(meilisearch_types::milli::vector::Error::from) + .map_err(meilisearch_types::milli::Error::from)?, + ); + { + let mut embedders = self.embedders.write().unwrap(); + embedders.insert(embedder_options, embedder.clone()); + } + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } @@ -3055,10 +3062,10 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = index.embedding_configs(&rtxn).unwrap(); - let (name, embedding_config, user_provided) = configs.first().unwrap(); + let IndexEmbeddingConfig { name, config, user_defined } = configs.first().unwrap(); insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); - insta::assert_json_snapshot!(embedding_config.embedder_options); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_json_snapshot!(config.embedder_options); } #[test] @@ -5022,15 +5029,17 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, fakerest_config, user_provided) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: fakerest_config, user_defined } = + configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let (name, simple_hf_config, user_provided) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: simple_hf_config, user_defined } = + configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5102,11 +5111,11 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, _config, user_defined) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - let (name, _config, user_defined) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); @@ -5178,11 +5187,13 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, _config, user_defined) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = + configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - let (name, _config, user_defined) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = + configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index d1d82be68..8a9708d29 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -8,6 +8,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; +use milli::index::IndexEmbeddingConfig; use milli::proximity::ProximityPrecision; use milli::update::Setting; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; @@ -672,7 +673,7 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|(name, config, _)| (name, Setting::Set(config.into()))) + .map(|IndexEmbeddingConfig { name, config, .. }| (name, Setting::Set(config.into()))) .collect(); let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; diff --git a/milli/src/index.rs b/milli/src/index.rs index 569a9a692..a47c07e08 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -9,6 +9,7 @@ use heed::types::*; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; +use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use crate::documents::PrimaryKey; @@ -1579,24 +1580,23 @@ impl Index { pub(crate) fn put_embedding_configs( &self, wtxn: &mut RwTxn<'_>, - configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, + configs: Vec, ) -> heed::Result<()> { - self.main - .remap_types::>>() - .put(wtxn, main_key::EMBEDDING_CONFIGS, &configs) + self.main.remap_types::>>().put( + wtxn, + main_key::EMBEDDING_CONFIGS, + &configs, + ) } pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { self.main.remap_key_type::().delete(wtxn, main_key::EMBEDDING_CONFIGS) } - pub fn embedding_configs( - &self, - rtxn: &RoTxn<'_>, - ) -> Result> { + pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result> { Ok(self .main - .remap_types::>>() + .remap_types::>>() .get(rtxn, main_key::EMBEDDING_CONFIGS)? .unwrap_or_default()) } @@ -1668,6 +1668,13 @@ impl Index { } } +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, + pub user_defined: RoaringBitmap, +} + #[cfg(test)] pub(crate) mod tests { use std::collections::HashSet; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a03e4333e..2dc93f67a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -785,6 +785,7 @@ mod tests { use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::index::IndexEmbeddingConfig; use crate::search::TermsMatchingStrategy; use crate::update::Setting; use crate::{db_snap, Filter, Search}; @@ -2620,7 +2621,8 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap(); + let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_defined } = + embedding_configs.pop().unwrap(); insta::assert_snapshot!(embedder_name, @"manual"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>"); let embedder = diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2c4e17858..078010554 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -20,6 +20,7 @@ use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; +use crate::index::IndexEmbeddingConfig; use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; @@ -156,8 +157,11 @@ pub(crate) fn write_typed_chunk_into_index( let mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; - let embedders: BTreeSet<_> = - index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect(); + let embedders: BTreeSet<_> = index + .embedding_configs(wtxn)? + .into_iter() + .map(|IndexEmbeddingConfig { name, .. }| name) + .collect(); let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -653,10 +657,12 @@ pub(crate) fn write_typed_chunk_into_index( let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; let mut embedding_configs = index.embedding_configs(&wtxn)?; - let (_name, _conf, ud) = - embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap(); - *ud -= remove_from_user_defined; - *ud |= user_defined; + let index_embedder_config = embedding_configs + .iter_mut() + .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name) + .unwrap(); + index_embedder_config.user_defined -= remove_from_user_defined; + index_embedder_config.user_defined |= user_defined; index.put_embedding_configs(wtxn, embedding_configs)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 64998bcc3..6b07e614e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -15,7 +15,9 @@ use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; -use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; +use crate::index::{ + IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, +}; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; @@ -930,8 +932,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let old_configs: BTreeMap, RoaringBitmap)> = old_configs .into_iter() - .map(|(name, setting, user_defined)| { - (name, (Setting::Set(setting.into()), user_defined)) + .map(|IndexEmbeddingConfig { name, config, user_defined }| { + (name, (Setting::Set(config.into()), user_defined)) }) .collect(); @@ -975,23 +977,27 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } } - let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs + let new_configs: Vec = new_configs .into_iter() - .filter_map(|(name, (setting, user_defined))| match setting { - Setting::Set(settings) => Some((name, settings.into(), user_defined)), - Setting::Reset => None, - Setting::NotSet => { - Some((name, EmbeddingSettings::default().into(), user_defined)) + .filter_map(|(name, (config, user_defined))| match config { + Setting::Set(config) => { + Some(IndexEmbeddingConfig { name, config: config.into(), user_defined }) } + Setting::Reset => None, + Setting::NotSet => Some(IndexEmbeddingConfig { + name, + config: EmbeddingSettings::default().into(), + user_defined, + }), }) .collect(); self.index.embedder_category_id.clear(self.wtxn)?; - for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() { + for (index, index_embedding_config) in new_configs.iter().enumerate() { self.index.embedder_category_id.put_with_flags( self.wtxn, heed::PutFlags::APPEND, - embedder_name, + &index_embedding_config.name, &index .try_into() .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?, @@ -1371,21 +1377,25 @@ impl InnerIndexSettings { } } -fn embedders( - embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, -) -> Result { +fn embedders(embedding_configs: Vec) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + .map( + |IndexEmbeddingConfig { + name, + config: EmbeddingConfig { embedder_options, prompt }, + .. + }| { + let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt))) - }) + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(crate::vector::Error::from) + .map_err(crate::Error::from)?, + ); + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) }