From b7349910d9285ec485bb59f9712f7d211604b10f Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 15:19:22 +0200 Subject: [PATCH] implements mor review comments --- index-scheduler/src/lib.rs | 2 +- .../extract/extract_vector_points.rs | 39 +++++++++---------- .../src/update/index_documents/extract/mod.rs | 8 ++-- milli/src/update/index_documents/mod.rs | 8 ++-- .../src/update/index_documents/typed_chunk.rs | 8 ++-- milli/src/vector/rest.rs | 1 - 6 files changed, 31 insertions(+), 35 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index f98e419a1..1f5a1fdcd 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5012,7 +5012,7 @@ mod tests { insta::assert_json_snapshot!(task.details); } - handle.advance_n_successful_batches(1); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); { diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 1e56bec83..88c42864e 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -35,8 +35,8 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, - pub user_defined: RoaringBitmap, - pub remove_from_user_defined: RoaringBitmap, + pub user_provided: RoaringBitmap, + pub remove_from_user_provided: RoaringBitmap, } enum VectorStateDelta { @@ -82,9 +82,9 @@ struct EmbedderVectorExtractor { remove_vectors_writer: Writer>, // The docids of the documents that contains a user defined embedding - user_defined: RoaringBitmap, + user_provided: RoaringBitmap, // The docids of the documents that contains an auto-generated embedding - remove_from_user_defined: RoaringBitmap, + remove_from_user_provided: RoaringBitmap, } /// Extracts the embedding vector contained in each document under the `_vectors` field. @@ -140,8 +140,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined: RoaringBitmap::new(), - remove_from_user_defined: RoaringBitmap::new(), + user_provided: RoaringBitmap::new(), + remove_from_user_provided: RoaringBitmap::new(), }); } @@ -179,8 +179,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } in extractors.iter_mut() { let delta = match parsed_vectors.remove(embedder_name) { @@ -188,10 +188,10 @@ pub fn extract_vector_points( match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { (true, true) | (false, false) => (), (true, false) => { - remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); } (false, true) => { - user_defined.insert(docid); + user_provided.insert(docid); } } @@ -214,7 +214,7 @@ pub fn extract_vector_points( .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept && old.is_some() { - remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( obkv, @@ -229,9 +229,9 @@ pub fn extract_vector_points( } (None, Some(new)) => { if new.is_user_provided() { - user_defined.insert(docid); + user_provided.insert(docid); } else { - remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); } // was possibly autogenerated, remove all vectors for that document let add_vectors = new.into_array_of_vectors(); @@ -274,7 +274,7 @@ pub fn extract_vector_points( VectorStateDelta::NoChange } } else { - remove_from_user_defined.remove(docid); + remove_from_user_provided.remove(docid); VectorStateDelta::NowRemoved } } @@ -301,8 +301,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } in extractors { results.push(ExtractedVectorPoints { @@ -311,8 +311,8 @@ pub fn extract_vector_points( prompts: writer_into_reader(prompts_writer)?, embedder, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, }) } @@ -347,9 +347,6 @@ fn push_vectors_diff( add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); - // let merged_vectors_iter = - // itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); - // insert vectors into the writer for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { // Generate the key by extending the unique index to it. diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 6399b40f8..2babe330f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -248,8 +248,8 @@ fn send_original_documents_data( prompts, embedder_name, embedder, - user_defined, - remove_from_user_defined: auto_generated, + user_provided, + remove_from_user_provided, } in extracted_vectors { let embeddings = match extract_embeddings( @@ -274,8 +274,8 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, - user_defined, - remove_from_user_defined: auto_generated, + user_provided, + remove_from_user_provided, })); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 907554753..07c10bb45 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -503,8 +503,8 @@ where embeddings, manual_vectors, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { @@ -513,8 +513,8 @@ where expected_dimension, manual_vectors, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } } otherwise => otherwise, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index ab9ef0525..05c849809 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -91,8 +91,8 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, - user_defined: RoaringBitmap, - remove_from_user_defined: RoaringBitmap, + user_provided: RoaringBitmap, + remove_from_user_provided: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } @@ -635,8 +635,8 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, - user_defined: ud, - remove_from_user_defined: rud, + user_provided: ud, + remove_from_user_provided: rud, } = typed_chunk else { unreachable!(); diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index fd771a228..60f54782e 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -230,7 +230,6 @@ where input_value } [input] => { - dbg!(&options); let mut body = options.query.clone(); body.as_object_mut()