From 2f7a8a4efb9248855a272aa7ec9e8d46a290a8f8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 14 May 2024 11:46:04 +0200 Subject: [PATCH] Don't write vectors that were autogenerated in document DB --- .../src/update/index_documents/typed_chunk.rs | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e0de2d5a1..8eb9ead28 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -193,6 +193,10 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "documents"); let _entered = span.enter(); + let fields_ids_map = index.fields_ids_map(wtxn)?; + let vectors_fid = + fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn); for typed_chunk in typed_chunks { let TypedChunk::Documents(chunk) = typed_chunk else { @@ -206,6 +210,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; + + let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? 
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader = KvReader::new(reader); @@ -219,6 +225,24 @@ pub(crate) fn write_typed_chunk_into_index( let del_add_reader = KvReaderDelAdd::new(value); if let Some(addition) = del_add_reader.get(DelAdd::Addition) { + let addition = match vectors_fid { + // for the "_vectors" field, only keep vectors that are marked as userProvided + Some(vectors_fid) if vectors_fid == field_id => 'vectors: { + vectors_buffer.clear(); + let Ok(mut vectors) = + crate::vector::parsed_vectors::ParsedVectors::from_bytes( + addition, + ) + else { + break 'vectors addition; + }; + vectors.retain_user_provided_vectors(); + serde_json::to_writer(&mut vectors_buffer, &vectors.0) + .map_err(InternalError::SerdeJson)?; + &vectors_buffer + } + _ => addition, + }; writer.insert(field_id, addition)?; } }