implements mor review comments

This commit is contained in:
Tamo 2024-06-05 15:19:22 +02:00
parent 49fa41ce65
commit b7349910d9
6 changed files with 31 additions and 35 deletions

View File

@ -5012,7 +5012,7 @@ mod tests {
insta::assert_json_snapshot!(task.details);
}
handle.advance_n_successful_batches(1);
handle.advance_one_successful_batch();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors");
{

View File

@ -35,8 +35,8 @@ pub struct ExtractedVectorPoints {
// embedder
pub embedder_name: String,
pub embedder: Arc<Embedder>,
pub user_defined: RoaringBitmap,
pub remove_from_user_defined: RoaringBitmap,
pub user_provided: RoaringBitmap,
pub remove_from_user_provided: RoaringBitmap,
}
enum VectorStateDelta {
@ -82,9 +82,9 @@ struct EmbedderVectorExtractor {
remove_vectors_writer: Writer<BufWriter<File>>,
// The docids of the documents that contains a user defined embedding
user_defined: RoaringBitmap,
user_provided: RoaringBitmap,
// The docids of the documents that contains an auto-generated embedding
remove_from_user_defined: RoaringBitmap,
remove_from_user_provided: RoaringBitmap,
}
/// Extracts the embedding vector contained in each document under the `_vectors` field.
@ -140,8 +140,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
user_defined: RoaringBitmap::new(),
remove_from_user_defined: RoaringBitmap::new(),
user_provided: RoaringBitmap::new(),
remove_from_user_provided: RoaringBitmap::new(),
});
}
@ -179,8 +179,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
user_defined,
remove_from_user_defined,
user_provided,
remove_from_user_provided,
} in extractors.iter_mut()
{
let delta = match parsed_vectors.remove(embedder_name) {
@ -188,10 +188,10 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) {
(true, true) | (false, false) => (),
(true, false) => {
remove_from_user_defined.insert(docid);
remove_from_user_provided.insert(docid);
}
(false, true) => {
user_defined.insert(docid);
user_provided.insert(docid);
}
}
@ -214,7 +214,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept && old.is_some() {
remove_from_user_defined.insert(docid);
remove_from_user_provided.insert(docid);
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
@ -229,9 +229,9 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
}
(None, Some(new)) => {
if new.is_user_provided() {
user_defined.insert(docid);
user_provided.insert(docid);
} else {
remove_from_user_defined.insert(docid);
remove_from_user_provided.insert(docid);
}
// was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors();
@ -274,7 +274,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
VectorStateDelta::NoChange
}
} else {
remove_from_user_defined.remove(docid);
remove_from_user_provided.remove(docid);
VectorStateDelta::NowRemoved
}
}
@ -301,8 +301,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
user_defined,
remove_from_user_defined,
user_provided,
remove_from_user_provided,
} in extractors
{
results.push(ExtractedVectorPoints {
@ -311,8 +311,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
prompts: writer_into_reader(prompts_writer)?,
embedder,
embedder_name,
user_defined,
remove_from_user_defined,
user_provided,
remove_from_user_provided,
})
}
@ -347,9 +347,6 @@ fn push_vectors_diff(
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
// let merged_vectors_iter =
// itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
// insert vectors into the writer
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
// Generate the key by extending the unique index to it.

View File

@ -248,8 +248,8 @@ fn send_original_documents_data(
prompts,
embedder_name,
embedder,
user_defined,
remove_from_user_defined: auto_generated,
user_provided,
remove_from_user_provided,
} in extracted_vectors
{
let embeddings = match extract_embeddings(
@ -274,8 +274,8 @@ fn send_original_documents_data(
expected_dimension: embedder.dimensions(),
manual_vectors,
embedder_name,
user_defined,
remove_from_user_defined: auto_generated,
user_provided,
remove_from_user_provided,
}));
}
}

View File

@ -503,8 +503,8 @@ where
embeddings,
manual_vectors,
embedder_name,
user_defined,
remove_from_user_defined,
user_provided,
remove_from_user_provided,
} => {
dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints {
@ -513,8 +513,8 @@ where
expected_dimension,
manual_vectors,
embedder_name,
user_defined,
remove_from_user_defined,
user_provided,
remove_from_user_provided,
}
}
otherwise => otherwise,

View File

@ -91,8 +91,8 @@ pub(crate) enum TypedChunk {
expected_dimension: usize,
manual_vectors: grenad::Reader<BufReader<File>>,
embedder_name: String,
user_defined: RoaringBitmap,
remove_from_user_defined: RoaringBitmap,
user_provided: RoaringBitmap,
remove_from_user_provided: RoaringBitmap,
},
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
}
@ -635,8 +635,8 @@ pub(crate) fn write_typed_chunk_into_index(
embeddings,
expected_dimension,
embedder_name,
user_defined: ud,
remove_from_user_defined: rud,
user_provided: ud,
remove_from_user_provided: rud,
} = typed_chunk
else {
unreachable!();

View File

@ -230,7 +230,6 @@ where
input_value
}
[input] => {
dbg!(&options);
let mut body = options.query.clone();
body.as_object_mut()