Wraps the index embedding config in a struct

This commit is contained in:
Tamo 2024-05-30 11:50:30 +02:00
parent 04f6523f3c
commit 9eb6f522ea
7 changed files with 112 additions and 75 deletions

View file

@ -785,6 +785,7 @@ mod tests {
use super::*;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::index::IndexEmbeddingConfig;
use crate::search::TermsMatchingStrategy;
use crate::update::Setting;
use crate::{db_snap, Filter, Search};
@ -2620,7 +2621,8 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap();
let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_defined } =
embedding_configs.pop().unwrap();
insta::assert_snapshot!(embedder_name, @"manual");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>");
let embedder =

View file

@ -20,6 +20,7 @@ use super::MergeFn;
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType;
use crate::index::db_name::DOCUMENTS;
use crate::index::IndexEmbeddingConfig;
use crate::proximity::MAX_DISTANCE;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
use crate::update::facet::FacetsUpdate;
@ -156,8 +157,11 @@ pub(crate) fn write_typed_chunk_into_index(
let mut docids = index.documents_ids(wtxn)?;
let mut iter = merger.into_stream_merger_iter()?;
let embedders: BTreeSet<_> =
index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect();
let embedders: BTreeSet<_> = index
.embedding_configs(wtxn)?
.into_iter()
.map(|IndexEmbeddingConfig { name, .. }| name)
.collect();
let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -653,10 +657,12 @@ pub(crate) fn write_typed_chunk_into_index(
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
let mut embedding_configs = index.embedding_configs(&wtxn)?;
let (_name, _conf, ud) =
embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap();
*ud -= remove_from_user_defined;
*ud |= user_defined;
let index_embedder_config = embedding_configs
.iter_mut()
.find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
.unwrap();
index_embedder_config.user_defined -= remove_from_user_defined;
index_embedder_config.user_defined |= user_defined;
index.put_embedding_configs(wtxn, embedding_configs)?;

View file

@ -15,7 +15,9 @@ use super::index_documents::{IndexDocumentsConfig, Transform};
use super::IndexerConfig;
use crate::criterion::Criterion;
use crate::error::UserError;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::index::{
IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod;
@ -930,8 +932,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
let old_configs: BTreeMap<String, (Setting<EmbeddingSettings>, RoaringBitmap)> =
old_configs
.into_iter()
.map(|(name, setting, user_defined)| {
(name, (Setting::Set(setting.into()), user_defined))
.map(|IndexEmbeddingConfig { name, config, user_defined }| {
(name, (Setting::Set(config.into()), user_defined))
})
.collect();
@ -975,23 +977,27 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
}
}
}
let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs
let new_configs: Vec<IndexEmbeddingConfig> = new_configs
.into_iter()
.filter_map(|(name, (setting, user_defined))| match setting {
Setting::Set(settings) => Some((name, settings.into(), user_defined)),
Setting::Reset => None,
Setting::NotSet => {
Some((name, EmbeddingSettings::default().into(), user_defined))
.filter_map(|(name, (config, user_defined))| match config {
Setting::Set(config) => {
Some(IndexEmbeddingConfig { name, config: config.into(), user_defined })
}
Setting::Reset => None,
Setting::NotSet => Some(IndexEmbeddingConfig {
name,
config: EmbeddingSettings::default().into(),
user_defined,
}),
})
.collect();
self.index.embedder_category_id.clear(self.wtxn)?;
for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() {
for (index, index_embedding_config) in new_configs.iter().enumerate() {
self.index.embedder_category_id.put_with_flags(
self.wtxn,
heed::PutFlags::APPEND,
embedder_name,
&index_embedding_config.name,
&index
.try_into()
.map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?,
@ -1371,21 +1377,25 @@ impl InnerIndexSettings {
}
}
fn embedders(
embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
) -> Result<EmbeddingConfigs> {
fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
.into_iter()
.map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
.map(
|IndexEmbeddingConfig {
name,
config: EmbeddingConfig { embedder_options, prompt },
..
}| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?,
);
Ok((name, (embedder, prompt)))
})
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?,
);
Ok((name, (embedder, prompt)))
},
)
.collect();
res.map(EmbeddingConfigs::new)
}