implements more review comments

This commit is contained in:
Tamo 2024-06-05 15:19:22 +02:00
parent 49fa41ce65
commit b7349910d9
6 changed files with 31 additions and 35 deletions

View File

@ -5012,7 +5012,7 @@ mod tests {
insta::assert_json_snapshot!(task.details); insta::assert_json_snapshot!(task.details);
} }
handle.advance_n_successful_batches(1); handle.advance_one_successful_batch();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors");
{ {

View File

@ -35,8 +35,8 @@ pub struct ExtractedVectorPoints {
// embedder // embedder
pub embedder_name: String, pub embedder_name: String,
pub embedder: Arc<Embedder>, pub embedder: Arc<Embedder>,
pub user_defined: RoaringBitmap, pub user_provided: RoaringBitmap,
pub remove_from_user_defined: RoaringBitmap, pub remove_from_user_provided: RoaringBitmap,
} }
enum VectorStateDelta { enum VectorStateDelta {
@ -82,9 +82,9 @@ struct EmbedderVectorExtractor {
remove_vectors_writer: Writer<BufWriter<File>>, remove_vectors_writer: Writer<BufWriter<File>>,
// The docids of the documents that contains a user defined embedding // The docids of the documents that contains a user defined embedding
user_defined: RoaringBitmap, user_provided: RoaringBitmap,
// The docids of the documents that contains an auto-generated embedding // The docids of the documents that contains an auto-generated embedding
remove_from_user_defined: RoaringBitmap, remove_from_user_provided: RoaringBitmap,
} }
/// Extracts the embedding vector contained in each document under the `_vectors` field. /// Extracts the embedding vector contained in each document under the `_vectors` field.
@ -140,8 +140,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer, manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
user_defined: RoaringBitmap::new(), user_provided: RoaringBitmap::new(),
remove_from_user_defined: RoaringBitmap::new(), remove_from_user_provided: RoaringBitmap::new(),
}); });
} }
@ -179,8 +179,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer, manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
user_defined, user_provided,
remove_from_user_defined, remove_from_user_provided,
} in extractors.iter_mut() } in extractors.iter_mut()
{ {
let delta = match parsed_vectors.remove(embedder_name) { let delta = match parsed_vectors.remove(embedder_name) {
@ -188,10 +188,10 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) {
(true, true) | (false, false) => (), (true, true) | (false, false) => (),
(true, false) => { (true, false) => {
remove_from_user_defined.insert(docid); remove_from_user_provided.insert(docid);
} }
(false, true) => { (false, true) => {
user_defined.insert(docid); user_provided.insert(docid);
} }
} }
@ -214,7 +214,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
.map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some()); .any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept && old.is_some() { if document_is_kept && old.is_some() {
remove_from_user_defined.insert(docid); remove_from_user_provided.insert(docid);
// becomes autogenerated // becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render( VectorStateDelta::NowGenerated(prompt.render(
obkv, obkv,
@ -229,9 +229,9 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
} }
(None, Some(new)) => { (None, Some(new)) => {
if new.is_user_provided() { if new.is_user_provided() {
user_defined.insert(docid); user_provided.insert(docid);
} else { } else {
remove_from_user_defined.insert(docid); remove_from_user_provided.insert(docid);
} }
// was possibly autogenerated, remove all vectors for that document // was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors();
@ -274,7 +274,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
VectorStateDelta::NoChange VectorStateDelta::NoChange
} }
} else { } else {
remove_from_user_defined.remove(docid); remove_from_user_provided.remove(docid);
VectorStateDelta::NowRemoved VectorStateDelta::NowRemoved
} }
} }
@ -301,8 +301,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer, manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
user_defined, user_provided,
remove_from_user_defined, remove_from_user_provided,
} in extractors } in extractors
{ {
results.push(ExtractedVectorPoints { results.push(ExtractedVectorPoints {
@ -311,8 +311,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
prompts: writer_into_reader(prompts_writer)?, prompts: writer_into_reader(prompts_writer)?,
embedder, embedder,
embedder_name, embedder_name,
user_defined, user_provided,
remove_from_user_defined, remove_from_user_provided,
}) })
} }
@ -347,9 +347,6 @@ fn push_vectors_diff(
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
// let merged_vectors_iter =
// itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
// insert vectors into the writer // insert vectors into the writer
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
// Generate the key by extending the unique index to it. // Generate the key by extending the unique index to it.

View File

@ -248,8 +248,8 @@ fn send_original_documents_data(
prompts, prompts,
embedder_name, embedder_name,
embedder, embedder,
user_defined, user_provided,
remove_from_user_defined: auto_generated, remove_from_user_provided,
} in extracted_vectors } in extracted_vectors
{ {
let embeddings = match extract_embeddings( let embeddings = match extract_embeddings(
@ -274,8 +274,8 @@ fn send_original_documents_data(
expected_dimension: embedder.dimensions(), expected_dimension: embedder.dimensions(),
manual_vectors, manual_vectors,
embedder_name, embedder_name,
user_defined, user_provided,
remove_from_user_defined: auto_generated, remove_from_user_provided,
})); }));
} }
} }

View File

@ -503,8 +503,8 @@ where
embeddings, embeddings,
manual_vectors, manual_vectors,
embedder_name, embedder_name,
user_defined, user_provided,
remove_from_user_defined, remove_from_user_provided,
} => { } => {
dimension.insert(embedder_name.clone(), expected_dimension); dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints { TypedChunk::VectorPoints {
@ -513,8 +513,8 @@ where
expected_dimension, expected_dimension,
manual_vectors, manual_vectors,
embedder_name, embedder_name,
user_defined, user_provided,
remove_from_user_defined, remove_from_user_provided,
} }
} }
otherwise => otherwise, otherwise => otherwise,

View File

@ -91,8 +91,8 @@ pub(crate) enum TypedChunk {
expected_dimension: usize, expected_dimension: usize,
manual_vectors: grenad::Reader<BufReader<File>>, manual_vectors: grenad::Reader<BufReader<File>>,
embedder_name: String, embedder_name: String,
user_defined: RoaringBitmap, user_provided: RoaringBitmap,
remove_from_user_defined: RoaringBitmap, remove_from_user_provided: RoaringBitmap,
}, },
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
} }
@ -635,8 +635,8 @@ pub(crate) fn write_typed_chunk_into_index(
embeddings, embeddings,
expected_dimension, expected_dimension,
embedder_name, embedder_name,
user_defined: ud, user_provided: ud,
remove_from_user_defined: rud, remove_from_user_provided: rud,
} = typed_chunk } = typed_chunk
else { else {
unreachable!(); unreachable!();

View File

@ -230,7 +230,6 @@ where
input_value input_value
} }
[input] => { [input] => {
dbg!(&options);
let mut body = options.query.clone(); let mut body = options.query.clone();
body.as_object_mut() body.as_object_mut()