From ec690e890d954dabedd99813d7b77bb97b3ac936 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 19 May 2025 11:38:36 +0200 Subject: [PATCH] Adapt Document change --- crates/milli/src/update/new/document.rs | 41 +++++++++++++------ .../milli/src/update/new/extract/documents.rs | 4 ++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index 1ef44fc8d..720b83780 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -9,6 +9,7 @@ use super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::FieldIdMapper; +use crate::vector::settings::EmbedderAction; use crate::{DocumentId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError}; /// A view into a document that can represent either the current version from the DB, @@ -309,6 +310,7 @@ where pub fn write_to_obkv<'s, 'a, 'map, 'buffer>( document: &'s impl Document<'s>, vector_document: Option<&'s impl VectorDocument<'s>>, + embedder_actions: &'a BTreeMap, fields_ids_map: &'a mut GlobalFieldsIdsMap<'map>, mut document_buffer: &'a mut bumpalo::collections::Vec<'buffer, u8>, ) -> Result<&'a KvReaderFieldId> @@ -338,20 +340,35 @@ where for res in vector_document.iter_vectors() { let (name, entry) = res?; if entry.has_configured_embedder { - continue; // we don't write vectors with configured embedder in documents - } - vectors.insert( - name, - if entry.implicit { - serde_json::json!(entry.embeddings) - } else { - serde_json::json!({ + if let Some(action) = embedder_actions.get(name) { + if action.write_back().is_some() && !entry.regenerate { + vectors.insert( + name, + serde_json::json!({ + "regenerate": entry.regenerate, + // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object + "embeddings": entry.embeddings, + }), + ); + } + } + } else { + if embedder_actions.contains_key(name) { + continue; + } + vectors.insert( + name, + if entry.implicit { + serde_json::json!(entry.embeddings) + } else { + serde_json::json!({ "regenerate": entry.regenerate, // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object - "embeddings": entry.embeddings, - }) - }, - ); + "embeddings": entry.embeddings, + }) + }, + ); + } } if vectors.is_empty() { diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index d1c92919b..18a1d28e3 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -45,6 +45,7 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> { ) -> Result<()> { let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc); let mut document_extractor_data = context.data.0.borrow_mut_or_yield(); + let embedder_actions = &Default::default(); for change in changes { let change = change?; @@ -121,9 +122,11 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> { let content = write_to_obkv( &content, vector_content.as_ref(), + embedder_actions, &mut new_fields_ids_map, &mut document_buffer, )?; + self.document_sender.uncompressed(docid, external_docid, content).unwrap(); } DocumentChange::Insertion(insertion) => { @@ -146,6 +149,7 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> { let content = write_to_obkv( &content, inserted_vectors.as_ref(), + embedder_actions, &mut new_fields_ids_map, &mut document_buffer, )?;