From a1aa99902607bae6df318d71ecc885cf2cadb6bb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 22 Apr 2024 14:18:35 +0200 Subject: [PATCH 1/2] Add conditions reducing work --- .../extract/extract_vector_points.rs | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 23f945c7a..8a7bcb1f9 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -198,11 +198,16 @@ pub fn extract_vector_points( if document_is_kept { // Don't give up if the old prompt was failing - let old_prompt = prompt - .render(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default(); + let old_prompt = Some(prompt) + // TODO: this filter works because we erase the vec database when an embedding setting changes. + // When the vector pipeline is optimized, this should be removed. 
+ .filter(|_| !settings_diff.reindex_vectors()) + .map(|p| { + p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() + }); let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt != new_prompt { + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); tracing::trace!( "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" ); @@ -224,6 +229,7 @@ pub fn extract_vector_points( &mut manual_vectors_writer, &mut key_buffer, delta, + settings_diff, )?; } @@ -264,10 +270,15 @@ fn push_vectors_diff( manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, + settings_diff: &InnerIndexSettingsDiff, ) -> Result<()> { puffin::profile_function!(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); - if must_remove { + if must_remove + // TODO: the condition below works because we erase the vec database when an embedding setting changes. + // When the vector pipeline is optimized, this should be removed. + && !settings_diff.reindex_vectors() + { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; } @@ -295,12 +306,16 @@ fn push_vectors_diff( match eob { EitherOrBoth::Both(_, _) => (), // no need to touch anything EitherOrBoth::Left(vector) => { - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those vectors. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; + // TODO: the condition below works because we erase the vec database when an embedding setting changes. + // When the vector pipeline is optimized, this should be removed. + if !settings_diff.reindex_vectors() { + // We insert only the Del part of the Obkv to inform + // that we only want to remove all those vectors. 
+ let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; + } } EitherOrBoth::Right(vector) => { // We insert only the Add part of the Obkv to inform From 6247e95dc31ca83b74857b620b3376f930ba9e0e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 23 Apr 2024 17:42:20 +0200 Subject: [PATCH 2/2] Add benchmark for embeddings --- workloads/movies-subset-hf-embeddings.json | 68 ++++++++++++++++++++ workloads/settings-add-embeddings.json | 72 ++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 workloads/movies-subset-hf-embeddings.json create mode 100644 workloads/settings-add-embeddings.json diff --git a/workloads/movies-subset-hf-embeddings.json b/workloads/movies-subset-hf-embeddings.json new file mode 100644 index 000000000..d24bc752c --- /dev/null +++ b/workloads/movies-subset-hf-embeddings.json @@ -0,0 +1,68 @@ +{ + "name": "movies-subset-hf-embeddings", + "run_count": 5, + "extra_cli_args": [ + "--max-indexing-threads=4" + ], + "assets": { + "movies-100.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json", + "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" + } + }, + "commands": [ + { + "route": "experimental-features", + "method": "PATCH", + "body": { + "inline": { + "vectorStore": true + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "title", + "overview" + ], + "filterableAttributes": [ + "genres", + "release_date" + ], + "sortableAttributes": [ + "release_date" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "embedders": { + "default": { + "source": "huggingFace" + } + } + } + }, + 
"synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "movies-100.json" + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file diff --git a/workloads/settings-add-embeddings.json b/workloads/settings-add-embeddings.json new file mode 100644 index 000000000..f87286943 --- /dev/null +++ b/workloads/settings-add-embeddings.json @@ -0,0 +1,72 @@ +{ + "name": "settings-add-embeddings-hf", + "run_count": 5, + "extra_cli_args": [ + "--max-indexing-threads=4" + ], + "assets": { + "movies-100.json": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json", + "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" + } + }, + "commands": [ + { + "route": "experimental-features", + "method": "PATCH", + "body": { + "inline": { + "vectorStore": true + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "searchableAttributes": [ + "title", + "overview" + ], + "filterableAttributes": [ + "genres", + "release_date" + ], + "sortableAttributes": [ + "release_date" + ] + } + }, + "synchronous": "DontWait" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "movies-100.json" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "embedders": { + "default": { + "source": "huggingFace", + "model": null, + "revision": null, + "documentTemplate": null, + "distribution": null + } + } + } + }, + "synchronous": "WaitForTask" + } + ] +} \ No newline at end of file