mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 12:27:13 +02:00
Implement in old settings indexer and old dump import indexer
This commit is contained in:
parent
f8232976ed
commit
cab5e35ff7
6 changed files with 824 additions and 246 deletions
File diff suppressed because it is too large
Load diff
|
@ -23,16 +23,17 @@ use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, Extra
|
||||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||||
use self::extract_geo_points::extract_geo_points;
|
use self::extract_geo_points::extract_geo_points;
|
||||||
use self::extract_vector_points::{
|
use self::extract_vector_points::{
|
||||||
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
|
extract_embeddings_from_prompts, extract_vector_points, ExtractedVectorPoints,
|
||||||
};
|
};
|
||||||
use self::extract_word_docids::extract_word_docids;
|
use self::extract_word_docids::extract_word_docids;
|
||||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||||
use self::extract_word_position_docids::extract_word_position_docids;
|
use self::extract_word_position_docids::extract_word_position_docids;
|
||||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||||
use super::{helpers, TypedChunk};
|
use super::{helpers, TypedChunk};
|
||||||
use crate::index::IndexEmbeddingConfig;
|
|
||||||
use crate::progress::EmbedderStats;
|
use crate::progress::EmbedderStats;
|
||||||
|
use crate::update::index_documents::extract::extract_vector_points::extract_embeddings_from_fragments;
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
|
use crate::vector::db::EmbedderInfo;
|
||||||
use crate::vector::error::PossibleEmbeddingMistakes;
|
use crate::vector::error::PossibleEmbeddingMistakes;
|
||||||
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||||
|
|
||||||
|
@ -46,9 +47,9 @@ pub(crate) fn data_from_obkv_documents(
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
|
||||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
|
embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
|
||||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||||
embedder_stats: &Arc<EmbedderStats>,
|
embedder_stats: &Arc<EmbedderStats>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
@ -61,8 +62,8 @@ pub(crate) fn data_from_obkv_documents(
|
||||||
original_documents_chunk,
|
original_documents_chunk,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
embedders_configs.clone(),
|
|
||||||
settings_diff.clone(),
|
settings_diff.clone(),
|
||||||
|
embedder_info.clone(),
|
||||||
possible_embedding_mistakes.clone(),
|
possible_embedding_mistakes.clone(),
|
||||||
embedder_stats.clone(),
|
embedder_stats.clone(),
|
||||||
)
|
)
|
||||||
|
@ -231,8 +232,8 @@ fn send_original_documents_data(
|
||||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
|
||||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||||
|
embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
|
||||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||||
embedder_stats: Arc<EmbedderStats>,
|
embedder_stats: Arc<EmbedderStats>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
@ -245,7 +246,6 @@ fn send_original_documents_data(
|
||||||
|
|
||||||
if index_vectors {
|
if index_vectors {
|
||||||
let settings_diff = settings_diff.clone();
|
let settings_diff = settings_diff.clone();
|
||||||
let embedders_configs = embedders_configs.clone();
|
|
||||||
|
|
||||||
let original_documents_chunk = original_documents_chunk.clone();
|
let original_documents_chunk = original_documents_chunk.clone();
|
||||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||||
|
@ -253,8 +253,8 @@ fn send_original_documents_data(
|
||||||
match extract_vector_points(
|
match extract_vector_points(
|
||||||
original_documents_chunk.clone(),
|
original_documents_chunk.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
&embedders_configs,
|
|
||||||
&settings_diff,
|
&settings_diff,
|
||||||
|
embedder_info.as_slice(),
|
||||||
&possible_embedding_mistakes,
|
&possible_embedding_mistakes,
|
||||||
) {
|
) {
|
||||||
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
||||||
|
@ -262,16 +262,16 @@ fn send_original_documents_data(
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
remove_vectors,
|
remove_vectors,
|
||||||
prompts,
|
prompts,
|
||||||
|
inputs,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
embedder,
|
runtime,
|
||||||
add_to_user_provided,
|
embedding_status_delta,
|
||||||
remove_from_user_provided,
|
|
||||||
} in extracted_vectors
|
} in extracted_vectors
|
||||||
{
|
{
|
||||||
let embeddings = match extract_embeddings(
|
let embeddings_from_prompts = match extract_embeddings_from_prompts(
|
||||||
prompts,
|
prompts,
|
||||||
indexer,
|
indexer,
|
||||||
embedder.clone(),
|
runtime.clone(),
|
||||||
&embedder_name,
|
&embedder_name,
|
||||||
&possible_embedding_mistakes,
|
&possible_embedding_mistakes,
|
||||||
&embedder_stats,
|
&embedder_stats,
|
||||||
|
@ -284,18 +284,37 @@ fn send_original_documents_data(
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let embeddings_from_fragments = match extract_embeddings_from_fragments(
|
||||||
|
inputs,
|
||||||
|
indexer,
|
||||||
|
runtime.clone(),
|
||||||
|
&embedder_name,
|
||||||
|
&possible_embedding_mistakes,
|
||||||
|
&embedder_stats,
|
||||||
|
&unused_vectors_distribution,
|
||||||
|
request_threads(),
|
||||||
|
) {
|
||||||
|
Ok(results) => Some(results),
|
||||||
|
Err(error) => {
|
||||||
|
let _ = lmdb_writer_sx.send(Err(error));
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
if !(remove_vectors.is_empty()
|
if !(remove_vectors.is_empty()
|
||||||
&& manual_vectors.is_empty()
|
&& manual_vectors.is_empty()
|
||||||
&& embeddings.as_ref().is_none_or(|e| e.is_empty()))
|
&& embeddings_from_prompts.as_ref().is_none_or(|e| e.is_empty())
|
||||||
|
&& embeddings_from_fragments.as_ref().is_none_or(|e| e.is_empty()))
|
||||||
{
|
{
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
|
||||||
remove_vectors,
|
remove_vectors,
|
||||||
embeddings,
|
embeddings_from_prompts,
|
||||||
expected_dimension: embedder.dimensions(),
|
embeddings_from_fragments,
|
||||||
|
expected_dimension: runtime.embedder.dimensions(),
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
add_to_user_provided,
|
embedding_status_delta,
|
||||||
remove_from_user_provided,
|
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,8 @@ pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||||
use crate::update::{
|
use crate::update::{
|
||||||
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||||
};
|
};
|
||||||
use crate::vector::{ArroyWrapper, EmbeddingConfigs};
|
use crate::vector::db::EmbedderInfo;
|
||||||
|
use crate::vector::{ArroyWrapper, RuntimeEmbedders};
|
||||||
use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
|
use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
|
||||||
|
|
||||||
static MERGED_DATABASE_COUNT: usize = 7;
|
static MERGED_DATABASE_COUNT: usize = 7;
|
||||||
|
@ -81,7 +82,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
|
||||||
should_abort: FA,
|
should_abort: FA,
|
||||||
added_documents: u64,
|
added_documents: u64,
|
||||||
deleted_documents: u64,
|
deleted_documents: u64,
|
||||||
embedders: EmbeddingConfigs,
|
embedders: RuntimeEmbedders,
|
||||||
embedder_stats: &'t Arc<EmbedderStats>,
|
embedder_stats: &'t Arc<EmbedderStats>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -172,7 +173,7 @@ where
|
||||||
Ok((self, Ok(indexed_documents)))
|
Ok((self, Ok(indexed_documents)))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {
|
pub fn with_embedders(mut self, embedders: RuntimeEmbedders) -> Self {
|
||||||
self.embedders = embedders;
|
self.embedders = embedders;
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
@ -226,7 +227,13 @@ where
|
||||||
settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
|
settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
|
||||||
|
|
||||||
let settings_diff = Arc::new(settings_diff);
|
let settings_diff = Arc::new(settings_diff);
|
||||||
let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
|
let embedder_infos: heed::Result<Vec<(String, EmbedderInfo)>> = self
|
||||||
|
.index
|
||||||
|
.embedding_configs()
|
||||||
|
.iter_embedder_info(self.wtxn)?
|
||||||
|
.map(|res| res.map(|(name, info)| (name.to_owned(), info)))
|
||||||
|
.collect();
|
||||||
|
let embedder_infos = Arc::new(embedder_infos?);
|
||||||
|
|
||||||
let possible_embedding_mistakes =
|
let possible_embedding_mistakes =
|
||||||
crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
|
crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
|
||||||
|
@ -328,9 +335,9 @@ where
|
||||||
pool_params,
|
pool_params,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
embedders_configs.clone(),
|
|
||||||
settings_diff_cloned,
|
settings_diff_cloned,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
|
embedder_infos,
|
||||||
Arc::new(possible_embedding_mistakes),
|
Arc::new(possible_embedding_mistakes),
|
||||||
&embedder_stats
|
&embedder_stats
|
||||||
)
|
)
|
||||||
|
@ -430,21 +437,21 @@ where
|
||||||
TypedChunk::VectorPoints {
|
TypedChunk::VectorPoints {
|
||||||
expected_dimension,
|
expected_dimension,
|
||||||
remove_vectors,
|
remove_vectors,
|
||||||
embeddings,
|
embeddings_from_prompts,
|
||||||
|
embeddings_from_fragments,
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
add_to_user_provided,
|
embedding_status_delta,
|
||||||
remove_from_user_provided,
|
|
||||||
} => {
|
} => {
|
||||||
dimension.insert(embedder_name.clone(), expected_dimension);
|
dimension.insert(embedder_name.clone(), expected_dimension);
|
||||||
TypedChunk::VectorPoints {
|
TypedChunk::VectorPoints {
|
||||||
remove_vectors,
|
remove_vectors,
|
||||||
embeddings,
|
embeddings_from_prompts,
|
||||||
|
embeddings_from_fragments,
|
||||||
expected_dimension,
|
expected_dimension,
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
add_to_user_provided,
|
embedding_status_delta,
|
||||||
remove_from_user_provided,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
otherwise => otherwise,
|
otherwise => otherwise,
|
||||||
|
@ -480,7 +487,7 @@ where
|
||||||
// we should insert it in `dimension`
|
// we should insert it in `dimension`
|
||||||
for (name, action) in settings_diff.embedding_config_updates.iter() {
|
for (name, action) in settings_diff.embedding_config_updates.iter() {
|
||||||
if action.is_being_quantized && !dimension.contains_key(name.as_str()) {
|
if action.is_being_quantized && !dimension.contains_key(name.as_str()) {
|
||||||
let index = self.index.embedder_category_id.get(self.wtxn, name)?.ok_or(
|
let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or(
|
||||||
InternalError::DatabaseMissingEntry {
|
InternalError::DatabaseMissingEntry {
|
||||||
db_name: "embedder_category_id",
|
db_name: "embedder_category_id",
|
||||||
key: None,
|
key: None,
|
||||||
|
@ -488,7 +495,9 @@ where
|
||||||
)?;
|
)?;
|
||||||
let reader =
|
let reader =
|
||||||
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
|
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
|
||||||
let dim = reader.dimensions(self.wtxn)?;
|
let Some(dim) = reader.dimensions(self.wtxn)? else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
dimension.insert(name.to_string(), dim);
|
dimension.insert(name.to_string(), dim);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -498,12 +507,19 @@ where
|
||||||
let vector_arroy = self.index.vector_arroy;
|
let vector_arroy = self.index.vector_arroy;
|
||||||
let cancel = &self.should_abort;
|
let cancel = &self.should_abort;
|
||||||
|
|
||||||
let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
let embedder_index =
|
||||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
self.index.embedding_configs().embedder_id(wtxn, &embedder_name)?.ok_or(
|
||||||
|
InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: "embedder_category_id",
|
||||||
|
key: None,
|
||||||
|
},
|
||||||
)?;
|
)?;
|
||||||
let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name);
|
let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name);
|
||||||
let was_quantized =
|
let was_quantized = settings_diff
|
||||||
settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2);
|
.old
|
||||||
|
.embedding_configs
|
||||||
|
.get(&embedder_name)
|
||||||
|
.is_some_and(|conf| conf.is_quantized);
|
||||||
let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
|
let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
|
||||||
|
|
||||||
pool.install(|| {
|
pool.install(|| {
|
||||||
|
@ -773,11 +789,11 @@ mod tests {
|
||||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
||||||
use crate::documents::mmap_from_objects;
|
use crate::documents::mmap_from_objects;
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::index::IndexEmbeddingConfig;
|
|
||||||
use crate::progress::Progress;
|
use crate::progress::Progress;
|
||||||
use crate::search::TermsMatchingStrategy;
|
use crate::search::TermsMatchingStrategy;
|
||||||
use crate::update::new::indexer;
|
use crate::update::new::indexer;
|
||||||
use crate::update::Setting;
|
use crate::update::Setting;
|
||||||
|
use crate::vector::db::IndexEmbeddingConfig;
|
||||||
use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError};
|
use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -2028,7 +2044,7 @@ mod tests {
|
||||||
new_fields_ids_map,
|
new_fields_ids_map,
|
||||||
primary_key,
|
primary_key,
|
||||||
&document_changes,
|
&document_changes,
|
||||||
EmbeddingConfigs::default(),
|
RuntimeEmbedders::default(),
|
||||||
&|| false,
|
&|| false,
|
||||||
&Progress::default(),
|
&Progress::default(),
|
||||||
&Default::default(),
|
&Default::default(),
|
||||||
|
@ -2116,7 +2132,7 @@ mod tests {
|
||||||
new_fields_ids_map,
|
new_fields_ids_map,
|
||||||
primary_key,
|
primary_key,
|
||||||
&document_changes,
|
&document_changes,
|
||||||
EmbeddingConfigs::default(),
|
RuntimeEmbedders::default(),
|
||||||
&|| false,
|
&|| false,
|
||||||
&Progress::default(),
|
&Progress::default(),
|
||||||
&Default::default(),
|
&Default::default(),
|
||||||
|
@ -2277,7 +2293,7 @@ mod tests {
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
indexer.replace_documents(&documents).unwrap();
|
indexer.replace_documents(&documents).unwrap();
|
||||||
indexer.delete_documents(&["2"]);
|
indexer.delete_documents(&["2"]);
|
||||||
|
@ -2343,7 +2359,7 @@ mod tests {
|
||||||
indexer.delete_documents(&["1", "2"]);
|
indexer.delete_documents(&["1", "2"]);
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let (document_changes, _operation_stats, primary_key) = indexer
|
let (document_changes, _operation_stats, primary_key) = indexer
|
||||||
.into_changes(
|
.into_changes(
|
||||||
&indexer_alloc,
|
&indexer_alloc,
|
||||||
|
@ -2394,7 +2410,7 @@ mod tests {
|
||||||
{ "id": 3, "name": "jean", "age": 25 },
|
{ "id": 3, "name": "jean", "age": 25 },
|
||||||
]);
|
]);
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
indexer.update_documents(&documents).unwrap();
|
indexer.update_documents(&documents).unwrap();
|
||||||
|
|
||||||
|
@ -2446,7 +2462,7 @@ mod tests {
|
||||||
{ "id": 3, "legs": 4 },
|
{ "id": 3, "legs": 4 },
|
||||||
]);
|
]);
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
indexer.update_documents(&documents).unwrap();
|
indexer.update_documents(&documents).unwrap();
|
||||||
indexer.delete_documents(&["1", "2"]);
|
indexer.delete_documents(&["1", "2"]);
|
||||||
|
@ -2496,7 +2512,7 @@ mod tests {
|
||||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
indexer.delete_documents(&["1", "2"]);
|
indexer.delete_documents(&["1", "2"]);
|
||||||
|
|
||||||
|
@ -2552,7 +2568,7 @@ mod tests {
|
||||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
|
|
||||||
indexer.delete_documents(&["1", "2", "1", "2"]);
|
indexer.delete_documents(&["1", "2", "1", "2"]);
|
||||||
|
@ -2611,7 +2627,7 @@ mod tests {
|
||||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
|
|
||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
|
@ -2661,7 +2677,7 @@ mod tests {
|
||||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
|
|
||||||
indexer.delete_documents(&["1"]);
|
indexer.delete_documents(&["1"]);
|
||||||
|
@ -2775,6 +2791,8 @@ mod tests {
|
||||||
document_template: Setting::NotSet,
|
document_template: Setting::NotSet,
|
||||||
document_template_max_bytes: Setting::NotSet,
|
document_template_max_bytes: Setting::NotSet,
|
||||||
url: Setting::NotSet,
|
url: Setting::NotSet,
|
||||||
|
indexing_fragments: Setting::NotSet,
|
||||||
|
search_fragments: Setting::NotSet,
|
||||||
request: Setting::NotSet,
|
request: Setting::NotSet,
|
||||||
response: Setting::NotSet,
|
response: Setting::NotSet,
|
||||||
distribution: Setting::NotSet,
|
distribution: Setting::NotSet,
|
||||||
|
@ -2801,17 +2819,27 @@ mod tests {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
|
let embedders = index.embedding_configs();
|
||||||
let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } =
|
let mut embedding_configs = embedders.embedding_configs(&rtxn).unwrap();
|
||||||
|
let IndexEmbeddingConfig { name: embedder_name, config: embedder, fragments } =
|
||||||
embedding_configs.pop().unwrap();
|
embedding_configs.pop().unwrap();
|
||||||
|
let info = embedders.embedder_info(&rtxn, &embedder_name).unwrap().unwrap();
|
||||||
|
insta::assert_snapshot!(info.embedder_id, @"0");
|
||||||
|
insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0, 1, 2]>");
|
||||||
|
insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0, 1, 2]>");
|
||||||
insta::assert_snapshot!(embedder_name, @"manual");
|
insta::assert_snapshot!(embedder_name, @"manual");
|
||||||
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>");
|
insta::assert_debug_snapshot!(fragments, @r###"
|
||||||
|
FragmentConfigs(
|
||||||
|
[],
|
||||||
|
)
|
||||||
|
"###);
|
||||||
|
|
||||||
let embedder = std::sync::Arc::new(
|
let embedder = std::sync::Arc::new(
|
||||||
crate::vector::Embedder::new(embedder.embedder_options, 0).unwrap(),
|
crate::vector::Embedder::new(embedder.embedder_options, 0).unwrap(),
|
||||||
);
|
);
|
||||||
let res = index
|
let res = index
|
||||||
.search(&rtxn)
|
.search(&rtxn)
|
||||||
.semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec()))
|
.semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec()), None)
|
||||||
.execute()
|
.execute()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(res.documents_ids.len(), 3);
|
assert_eq!(res.documents_ids.len(), 3);
|
||||||
|
@ -2860,7 +2888,7 @@ mod tests {
|
||||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
|
|
||||||
// OP
|
// OP
|
||||||
|
@ -2921,7 +2949,7 @@ mod tests {
|
||||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
|
|
||||||
indexer.delete_documents(&["1"]);
|
indexer.delete_documents(&["1"]);
|
||||||
|
@ -2980,7 +3008,7 @@ mod tests {
|
||||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let indexer_alloc = Bump::new();
|
let indexer_alloc = Bump::new();
|
||||||
let embedders = EmbeddingConfigs::default();
|
let embedders = RuntimeEmbedders::default();
|
||||||
let mut indexer = indexer::DocumentOperation::new();
|
let mut indexer = indexer::DocumentOperation::new();
|
||||||
|
|
||||||
let documents = documents!([
|
let documents = documents!([
|
||||||
|
|
|
@ -31,7 +31,7 @@ use crate::update::index_documents::GrenadParameters;
|
||||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||||
use crate::update::{AvailableIds, UpdateIndexingStep};
|
use crate::update::{AvailableIds, UpdateIndexingStep};
|
||||||
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
||||||
use crate::vector::settings::WriteBackToDocuments;
|
use crate::vector::settings::{RemoveFragments, WriteBackToDocuments};
|
||||||
use crate::vector::ArroyWrapper;
|
use crate::vector::ArroyWrapper;
|
||||||
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result};
|
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result};
|
||||||
|
|
||||||
|
@ -933,10 +933,47 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||||
|
|
||||||
// delete all vectors from the embedders that need removal
|
// delete all vectors from the embedders that need removal
|
||||||
for (_, (reader, _)) in readers {
|
for (_, (reader, _)) in readers {
|
||||||
let dimensions = reader.dimensions(wtxn)?;
|
let Some(dimensions) = reader.dimensions(wtxn)? else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
reader.clear(wtxn, dimensions)?;
|
reader.clear(wtxn, dimensions)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// remove all vectors for the specified fragments
|
||||||
|
for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in
|
||||||
|
settings_diff.embedding_config_updates.iter().filter_map(|(name, action)| {
|
||||||
|
action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized))
|
||||||
|
})
|
||||||
|
{
|
||||||
|
let Some(infos) = self.index.embedding_configs().embedder_info(wtxn, embedder_name)?
|
||||||
|
else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
let arroy =
|
||||||
|
ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized);
|
||||||
|
let Some(dimensions) = arroy.dimensions(wtxn)? else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
for fragment_id in fragment_ids {
|
||||||
|
// we must keep the user provided embeddings that ended up in this store
|
||||||
|
|
||||||
|
if infos.embedding_status.user_provided_docids().is_empty() {
|
||||||
|
// no user provided: clear store
|
||||||
|
arroy.clear_store(wtxn, *fragment_id, dimensions)?;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// some user provided, remove only the ids that are not user provided
|
||||||
|
let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| {
|
||||||
|
items - infos.embedding_status.user_provided_docids()
|
||||||
|
})?;
|
||||||
|
|
||||||
|
for to_delete in to_delete {
|
||||||
|
arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let grenad_params = GrenadParameters {
|
let grenad_params = GrenadParameters {
|
||||||
chunk_compression_type: self.indexer_settings.chunk_compression_type,
|
chunk_compression_type: self.indexer_settings.chunk_compression_type,
|
||||||
chunk_compression_level: self.indexer_settings.chunk_compression_level,
|
chunk_compression_level: self.indexer_settings.chunk_compression_level,
|
||||||
|
|
|
@ -4,6 +4,7 @@ use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use bytemuck::allocation::pod_collect_to_vec;
|
use bytemuck::allocation::pod_collect_to_vec;
|
||||||
|
use byteorder::{BigEndian, ReadBytesExt as _};
|
||||||
use grenad::{MergeFunction, Merger, MergerBuilder};
|
use grenad::{MergeFunction, Merger, MergerBuilder};
|
||||||
use heed::types::Bytes;
|
use heed::types::Bytes;
|
||||||
use heed::{BytesDecode, RwTxn};
|
use heed::{BytesDecode, RwTxn};
|
||||||
|
@ -18,7 +19,6 @@ use super::helpers::{
|
||||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::index::db_name::DOCUMENTS;
|
use crate::index::db_name::DOCUMENTS;
|
||||||
use crate::index::IndexEmbeddingConfig;
|
|
||||||
use crate::proximity::MAX_DISTANCE;
|
use crate::proximity::MAX_DISTANCE;
|
||||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||||
use crate::update::facet::FacetsUpdate;
|
use crate::update::facet::FacetsUpdate;
|
||||||
|
@ -26,6 +26,7 @@ use crate::update::index_documents::helpers::{
|
||||||
as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
|
as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
|
||||||
};
|
};
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
|
use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
|
||||||
use crate::vector::ArroyWrapper;
|
use crate::vector::ArroyWrapper;
|
||||||
use crate::{
|
use crate::{
|
||||||
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
||||||
|
@ -86,12 +87,14 @@ pub(crate) enum TypedChunk {
|
||||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||||
VectorPoints {
|
VectorPoints {
|
||||||
remove_vectors: grenad::Reader<BufReader<File>>,
|
remove_vectors: grenad::Reader<BufReader<File>>,
|
||||||
embeddings: Option<grenad::Reader<BufReader<File>>>,
|
// docid -> vector
|
||||||
|
embeddings_from_prompts: Option<grenad::Reader<BufReader<File>>>,
|
||||||
|
// docid, extractor_id -> Option<vector>,
|
||||||
|
embeddings_from_fragments: Option<grenad::Reader<BufReader<File>>>,
|
||||||
expected_dimension: usize,
|
expected_dimension: usize,
|
||||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||||
embedder_name: String,
|
embedder_name: String,
|
||||||
add_to_user_provided: RoaringBitmap,
|
embedding_status_delta: EmbeddingStatusDelta,
|
||||||
remove_from_user_provided: RoaringBitmap,
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -155,6 +158,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||||
let mut iter = merger.into_stream_merger_iter()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
|
|
||||||
let embedders: BTreeSet<_> = index
|
let embedders: BTreeSet<_> = index
|
||||||
|
.embedding_configs()
|
||||||
.embedding_configs(wtxn)?
|
.embedding_configs(wtxn)?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|IndexEmbeddingConfig { name, .. }| name)
|
.map(|IndexEmbeddingConfig { name, .. }| name)
|
||||||
|
@ -614,57 +618,66 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||||
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
|
||||||
|
let embedders = index.embedding_configs();
|
||||||
|
|
||||||
let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
|
let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||||
let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
|
let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||||
let mut embeddings_builder = MergerBuilder::new(KeepFirst);
|
let mut embeddings_from_prompts_builder = MergerBuilder::new(KeepFirst);
|
||||||
let mut add_to_user_provided = RoaringBitmap::new();
|
let mut embeddings_from_fragments_builder = MergerBuilder::new(KeepFirst);
|
||||||
let mut remove_from_user_provided = RoaringBitmap::new();
|
|
||||||
let mut params = None;
|
let mut params = None;
|
||||||
|
let mut infos = None;
|
||||||
for typed_chunk in typed_chunks {
|
for typed_chunk in typed_chunks {
|
||||||
let TypedChunk::VectorPoints {
|
let TypedChunk::VectorPoints {
|
||||||
remove_vectors,
|
remove_vectors,
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
embeddings,
|
embeddings_from_prompts,
|
||||||
|
embeddings_from_fragments,
|
||||||
expected_dimension,
|
expected_dimension,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
add_to_user_provided: aud,
|
embedding_status_delta,
|
||||||
remove_from_user_provided: rud,
|
|
||||||
} = typed_chunk
|
} = typed_chunk
|
||||||
else {
|
else {
|
||||||
unreachable!();
|
unreachable!();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if infos.is_none() {
|
||||||
|
infos = Some(embedders.embedder_info(wtxn, &embedder_name)?.ok_or(
|
||||||
|
InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: "embedder_category_id",
|
||||||
|
key: None,
|
||||||
|
},
|
||||||
|
)?);
|
||||||
|
}
|
||||||
|
|
||||||
params = Some((expected_dimension, embedder_name));
|
params = Some((expected_dimension, embedder_name));
|
||||||
|
|
||||||
remove_vectors_builder.push(remove_vectors.into_cursor()?);
|
remove_vectors_builder.push(remove_vectors.into_cursor()?);
|
||||||
manual_vectors_builder.push(manual_vectors.into_cursor()?);
|
manual_vectors_builder.push(manual_vectors.into_cursor()?);
|
||||||
if let Some(embeddings) = embeddings {
|
if let Some(embeddings) = embeddings_from_prompts {
|
||||||
embeddings_builder.push(embeddings.into_cursor()?);
|
embeddings_from_prompts_builder.push(embeddings.into_cursor()?);
|
||||||
|
}
|
||||||
|
if let Some(embeddings) = embeddings_from_fragments {
|
||||||
|
embeddings_from_fragments_builder.push(embeddings.into_cursor()?);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(infos) = &mut infos {
|
||||||
|
embedding_status_delta.apply_to(&mut infos.embedding_status);
|
||||||
}
|
}
|
||||||
add_to_user_provided |= aud;
|
|
||||||
remove_from_user_provided |= rud;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// `typed_chunks` always contains at least one chunk, so `params` was set by the loop above.
|
// `typed_chunks` always contains at least one chunk, so `params` was set by the loop above.
|
||||||
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
||||||
|
let Some(infos) = infos else { unreachable!() };
|
||||||
|
|
||||||
let mut embedding_configs = index.embedding_configs(wtxn)?;
|
embedders.put_embedder_info(wtxn, &embedder_name, &infos)?;
|
||||||
let index_embedder_config = embedding_configs
|
|
||||||
.iter_mut()
|
|
||||||
.find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
|
|
||||||
.unwrap();
|
|
||||||
index_embedder_config.user_provided -= remove_from_user_provided;
|
|
||||||
index_embedder_config.user_provided |= add_to_user_provided;
|
|
||||||
|
|
||||||
index.put_embedding_configs(wtxn, embedding_configs)?;
|
let binary_quantized = settings_diff
|
||||||
|
.old
|
||||||
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
.embedding_configs
|
||||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
.get(&embedder_name)
|
||||||
)?;
|
.is_some_and(|conf| conf.is_quantized);
|
||||||
let binary_quantized =
|
|
||||||
settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2);
|
|
||||||
// FIXME: allow customizing distance
|
// FIXME: allow customizing distance
|
||||||
let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized);
|
let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized);
|
||||||
|
|
||||||
// remove the vectors of the docids we want removed
|
// remove the vectors of the docids we want removed
|
||||||
let merger = remove_vectors_builder.build();
|
let merger = remove_vectors_builder.build();
|
||||||
|
@ -674,8 +687,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||||
writer.del_items(wtxn, expected_dimension, docid)?;
|
writer.del_items(wtxn, expected_dimension, docid)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// add generated embeddings
|
// add generated embeddings -- from prompts
|
||||||
let merger = embeddings_builder.build();
|
let merger = embeddings_from_prompts_builder.build();
|
||||||
let mut iter = merger.into_stream_merger_iter()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
while let Some((key, value)) = iter.next()? {
|
while let Some((key, value)) = iter.next()? {
|
||||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||||
|
@ -702,6 +715,24 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||||
writer.add_items(wtxn, docid, &embeddings)?;
|
writer.add_items(wtxn, docid, &embeddings)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// add generated embeddings -- from fragments
|
||||||
|
let merger = embeddings_from_fragments_builder.build();
|
||||||
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
|
while let Some((mut key, value)) = iter.next()? {
|
||||||
|
let docid = key.read_u32::<BigEndian>().unwrap();
|
||||||
|
let extractor_id = key.read_u8().unwrap();
|
||||||
|
if value.is_empty() {
|
||||||
|
writer.del_item_in_store(wtxn, docid, extractor_id, expected_dimension)?;
|
||||||
|
} else {
|
||||||
|
let data = pod_collect_to_vec(value);
|
||||||
|
// it is a programming error for an embedding's length to differ from `expected_dimension`
|
||||||
|
if data.len() != expected_dimension {
|
||||||
|
panic!("wrong dimensions")
|
||||||
|
}
|
||||||
|
writer.add_item_in_store(wtxn, docid, extractor_id, &data)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// perform the manual diff
|
// perform the manual diff
|
||||||
let merger = manual_vectors_builder.build();
|
let merger = manual_vectors_builder.build();
|
||||||
let mut iter = merger.into_stream_merger_iter()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
|
|
|
@ -6,9 +6,8 @@ use serde_json::value::RawValue;
|
||||||
use serde_json::{from_slice, Value};
|
use serde_json::{from_slice, Value};
|
||||||
|
|
||||||
use super::Embedding;
|
use super::Embedding;
|
||||||
use crate::index::IndexEmbeddingConfig;
|
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||||
use crate::{DocumentId, FieldId, InternalError, UserError};
|
use crate::{FieldId, InternalError, UserError};
|
||||||
|
|
||||||
#[derive(serde::Serialize, Debug)]
|
#[derive(serde::Serialize, Debug)]
|
||||||
#[serde(untagged)]
|
#[serde(untagged)]
|
||||||
|
@ -374,8 +373,7 @@ pub struct ParsedVectorsDiff {
|
||||||
|
|
||||||
impl ParsedVectorsDiff {
|
impl ParsedVectorsDiff {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
docid: DocumentId,
|
regenerate_for_embedders: impl Iterator<Item = String>,
|
||||||
embedders_configs: &[IndexEmbeddingConfig],
|
|
||||||
documents_diff: &KvReader<FieldId>,
|
documents_diff: &KvReader<FieldId>,
|
||||||
old_vectors_fid: Option<FieldId>,
|
old_vectors_fid: Option<FieldId>,
|
||||||
new_vectors_fid: Option<FieldId>,
|
new_vectors_fid: Option<FieldId>,
|
||||||
|
@ -396,10 +394,8 @@ impl ParsedVectorsDiff {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
|
.flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
|
||||||
for embedding_config in embedders_configs {
|
for name in regenerate_for_embedders {
|
||||||
if embedding_config.user_provided.contains(docid) {
|
old.entry(name).or_insert(VectorState::Generated);
|
||||||
old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let new = 'new: {
|
let new = 'new: {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue