From 363a5cc59099c42e14b7b0e9eb12e2598ff79d14 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 15 Jul 2024 11:56:18 +0200 Subject: [PATCH 1/4] Retrieve function from v1.9 to get embeddings in documents --- milli/src/index.rs | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 27b273393..634630f35 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -22,7 +22,7 @@ use crate::heed_codec::{ }; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; -use crate::vector::EmbeddingConfig; +use crate::vector::{Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, @@ -1516,6 +1516,42 @@ impl Index { .unwrap_or_default()) } + pub fn embeddings( + &self, + rtxn: &RoTxn<'_>, + docid: DocumentId, + ) -> Result>> { + let mut res = BTreeMap::new(); + for row in self.embedder_category_id.iter(rtxn)? { + let (embedder_name, embedder_id) = row?; + let embedder_id = (embedder_id as u16) << 8; + let mut embeddings = Vec::new(); + 'vectors: for i in 0..=u8::MAX { + let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy) + .map(Some) + .or_else(|e| match e { + arroy::Error::MissingMetadata => Ok(None), + e => Err(e), + }) + .transpose(); + + let Some(reader) = reader else { + break 'vectors; + }; + + let embedding = reader?.item_vector(rtxn, docid)?; + if let Some(embedding) = embedding { + embeddings.push(embedding) + } else { + break 'vectors; + } + } + + res.insert(embedder_name.to_owned(), embeddings); + } + Ok(res) + } + pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> { self.main.remap_types::().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff) } From 9375b7bba58980e517f7c09d680586314a3ade65 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 15 Jul 2024 11:56:39 +0200 Subject: [PATCH 2/4] Inject generated vectors in dumps --- index-scheduler/src/batch.rs | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 3161dc499..4f80cc23b 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -914,8 +914,34 @@ impl IndexScheduler { if self.must_stop_processing.get() { return Err(Error::AbortedTask); } - let (_id, doc) = ret?; - let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; + let (id, doc) = ret?; + let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; + + 'inject_vectors: { + let embeddings = index.embeddings(&rtxn, id)?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry("_vectors".to_owned()) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + break 'inject_vectors; + }; + + for (embedder_name, embeddings) in embeddings { + vectors.entry(embedder_name).or_insert_with(|| { + serde_json::json!({ + "embeddings": embeddings, + "regenerate": true + }) + }); + } + } + index_dumper.push_document(&document)?; } From 9ec209bbf4a94edf70843602e6b9f3a7c06711ce Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 15 Jul 2024 11:57:11 +0200 Subject: [PATCH 3/4] When importing dumps, remove regenerate: true vectors items --- meilisearch/src/lib.rs | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index bb7562c85..0879701cf 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -419,7 +419,41 @@ fn import_dump( let file = tempfile::tempfile()?; let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file)); for document in index_reader.documents()? { - builder.append_json_object(&document?)?; + let mut document = document?; + + 'remove_injected_vectors: { + let Some(vectors) = document.get_mut("_vectors") else { + break 'remove_injected_vectors; + }; + + let Some(vectors) = vectors.as_object_mut() else { break 'remove_injected_vectors }; + + vectors.retain(|_embedder, embedding_object| { + // don't touch values that aren't objects + let Some(embedding_object) = embedding_object.as_object() else { + return true; + }; + + let mut has_regenerate_true = false; + for (field, value) in embedding_object { + match (field.as_str(), value) { + // detected regenerate : true + // if we don't have any superfluous field, we'll remove the entire entry + ("regenerate", serde_json::Value::Bool(true)) => { + has_regenerate_true = true; + } + // ignore embeddings + ("embeddings", _) => continue, + // any other field: immediately retain the entry + _ => return true, + } + } + // retain the entry unless it has regenerate: true + !has_regenerate_true + }) + } + + builder.append_json_object(&document)?; } // This flush the content of the batch builder. From 8fe6d31e0136fa79a4b4909a290d97e9f9139b95 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 8 Jul 2024 11:04:11 +0200 Subject: [PATCH 4/4] CI: Add ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION workaround to keep using Ubuntu 18.04 --- .github/workflows/flaky-tests.yml | 2 ++ .github/workflows/fuzzer-indexing.yml | 3 ++- .github/workflows/publish-apt-brew-pkg.yml | 2 ++ .github/workflows/publish-binaries.yml | 4 ++++ .github/workflows/test-suite.yml | 8 ++++++++ 5 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.github/workflows/flaky-tests.yml b/.github/workflows/flaky-tests.yml index c7e81aacc..dda1a86dc 100644 --- a/.github/workflows/flaky-tests.yml +++ b/.github/workflows/flaky-tests.yml @@ -1,4 +1,6 @@ name: Look for flaky tests +env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true on: workflow_dispatch: schedule: diff --git a/.github/workflows/fuzzer-indexing.yml b/.github/workflows/fuzzer-indexing.yml index 1d01a6ea5..f3cc5af37 100644 --- a/.github/workflows/fuzzer-indexing.yml +++ b/.github/workflows/fuzzer-indexing.yml @@ -1,5 +1,6 @@ name: Run the indexing fuzzer - +env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true on: push: branches: diff --git a/.github/workflows/publish-apt-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml index 11893bae0..e99e196f2 100644 --- a/.github/workflows/publish-apt-brew-pkg.yml +++ b/.github/workflows/publish-apt-brew-pkg.yml @@ -15,6 +15,8 @@ jobs: debian: name: Publish debian packagge + env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true runs-on: ubuntu-latest needs: check-version container: diff --git a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index 2372ce497..4480d4918 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -35,6 +35,8 @@ jobs: publish-linux: name: Publish binary for Linux runs-on: ubuntu-latest + env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true needs: check-version container: # Use ubuntu-18.04 to compile with glibc 2.27 @@ -132,6 +134,8 @@ jobs: name: Publish binary for aarch64 runs-on: ubuntu-latest needs: check-version + env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true container: # Use ubuntu-18.04 to compile with glibc 2.27 image: ubuntu:18.04 diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 5dbde4301..77b444b64 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -21,6 +21,8 @@ jobs: test-linux: name: Tests on ubuntu-18.04 runs-on: ubuntu-latest + env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true container: # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations image: ubuntu:18.04 @@ -77,6 +79,8 @@ jobs: test-all-features: name: Tests almost all features runs-on: ubuntu-latest + env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true container: # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations image: ubuntu:18.04 @@ -100,6 +104,8 @@ jobs: test-disabled-tokenization: name: Test disabled tokenization + env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true runs-on: ubuntu-latest container: image: ubuntu:18.04 @@ -127,6 +133,8 @@ jobs: # We run tests in debug also, to make sure that the debug_assertions are hit test-debug: name: Run tests in debug + env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true runs-on: ubuntu-latest container: # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations