mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Merge #4796
4796: Generate vectors in dumps r=dureuill a=dureuill # Pull Request ## What does this PR do? 1. Add an Index::embeddings method to compute the embeddings of a document 2. Write generated vectors in dumps 3. Remove generated vectors when importing dumps 4. Cherry pick the `ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION` workaround so that the older CI can still build ## Manual Tests (TODO) - [ ] Import a dump from a v1.8.3 into a v1.8.4 successfully - [x] Import a dump from a v1.8.4 into a v1.8.4 successfully - [x] Import a dump from a v1.8.4 into a v1.9.0 successfully - [x] generated vectors are not regenerated - [x] user provided vectors are still available - [x] generated vectors still have the correct value - [x] updating a document with generated vectors attempts to regenerate Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
commit
f0f02e6412
2
.github/workflows/flaky-tests.yml
vendored
2
.github/workflows/flaky-tests.yml
vendored
@ -1,4 +1,6 @@
|
||||
name: Look for flaky tests
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
|
3
.github/workflows/fuzzer-indexing.yml
vendored
3
.github/workflows/fuzzer-indexing.yml
vendored
@ -1,5 +1,6 @@
|
||||
name: Run the indexing fuzzer
|
||||
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
|
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
@ -15,6 +15,8 @@ jobs:
|
||||
|
||||
debian:
|
||||
name: Publish debian packagge
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
runs-on: ubuntu-latest
|
||||
needs: check-version
|
||||
container:
|
||||
|
4
.github/workflows/publish-binaries.yml
vendored
4
.github/workflows/publish-binaries.yml
vendored
@ -35,6 +35,8 @@ jobs:
|
||||
publish-linux:
|
||||
name: Publish binary for Linux
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
needs: check-version
|
||||
container:
|
||||
# Use ubuntu-18.04 to compile with glibc 2.27
|
||||
@ -132,6 +134,8 @@ jobs:
|
||||
name: Publish binary for aarch64
|
||||
runs-on: ubuntu-latest
|
||||
needs: check-version
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
container:
|
||||
# Use ubuntu-18.04 to compile with glibc 2.27
|
||||
image: ubuntu:18.04
|
||||
|
8
.github/workflows/test-suite.yml
vendored
8
.github/workflows/test-suite.yml
vendored
@ -21,6 +21,8 @@ jobs:
|
||||
test-linux:
|
||||
name: Tests on ubuntu-18.04
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
container:
|
||||
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
||||
image: ubuntu:18.04
|
||||
@ -77,6 +79,8 @@ jobs:
|
||||
test-all-features:
|
||||
name: Tests almost all features
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
container:
|
||||
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
||||
image: ubuntu:18.04
|
||||
@ -100,6 +104,8 @@ jobs:
|
||||
|
||||
test-disabled-tokenization:
|
||||
name: Test disabled tokenization
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: ubuntu:18.04
|
||||
@ -127,6 +133,8 @@ jobs:
|
||||
# We run tests in debug also, to make sure that the debug_assertions are hit
|
||||
test-debug:
|
||||
name: Run tests in debug
|
||||
env:
|
||||
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
||||
|
@ -914,8 +914,34 @@ impl IndexScheduler {
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
let (_id, doc) = ret?;
|
||||
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||
let (id, doc) = ret?;
|
||||
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||
|
||||
'inject_vectors: {
|
||||
let embeddings = index.embeddings(&rtxn, id)?;
|
||||
|
||||
if embeddings.is_empty() {
|
||||
break 'inject_vectors;
|
||||
}
|
||||
|
||||
let vectors = document
|
||||
.entry("_vectors".to_owned())
|
||||
.or_insert(serde_json::Value::Object(Default::default()));
|
||||
|
||||
let serde_json::Value::Object(vectors) = vectors else {
|
||||
break 'inject_vectors;
|
||||
};
|
||||
|
||||
for (embedder_name, embeddings) in embeddings {
|
||||
vectors.entry(embedder_name).or_insert_with(|| {
|
||||
serde_json::json!({
|
||||
"embeddings": embeddings,
|
||||
"regenerate": true
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
index_dumper.push_document(&document)?;
|
||||
}
|
||||
|
||||
|
@ -419,7 +419,41 @@ fn import_dump(
|
||||
let file = tempfile::tempfile()?;
|
||||
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
|
||||
for document in index_reader.documents()? {
|
||||
builder.append_json_object(&document?)?;
|
||||
let mut document = document?;
|
||||
|
||||
'remove_injected_vectors: {
|
||||
let Some(vectors) = document.get_mut("_vectors") else {
|
||||
break 'remove_injected_vectors;
|
||||
};
|
||||
|
||||
let Some(vectors) = vectors.as_object_mut() else { break 'remove_injected_vectors };
|
||||
|
||||
vectors.retain(|_embedder, embedding_object| {
|
||||
// don't touch values that aren't objects
|
||||
let Some(embedding_object) = embedding_object.as_object() else {
|
||||
return true;
|
||||
};
|
||||
|
||||
let mut has_regenerate_true = false;
|
||||
for (field, value) in embedding_object {
|
||||
match (field.as_str(), value) {
|
||||
// detected regenerate : true
|
||||
// if we don't have any superfluous field, we'll remove the entire entry
|
||||
("regenerate", serde_json::Value::Bool(true)) => {
|
||||
has_regenerate_true = true;
|
||||
}
|
||||
// ignore embeddings
|
||||
("embeddings", _) => continue,
|
||||
// any other field: immediately retain the entry
|
||||
_ => return true,
|
||||
}
|
||||
}
|
||||
// retain the entry unless it has regenerate: true
|
||||
!has_regenerate_true
|
||||
})
|
||||
}
|
||||
|
||||
builder.append_json_object(&document)?;
|
||||
}
|
||||
|
||||
// This flush the content of the batch builder.
|
||||
|
@ -22,7 +22,7 @@ use crate::heed_codec::{
|
||||
};
|
||||
use crate::order_by_map::OrderByMap;
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::vector::EmbeddingConfig;
|
||||
use crate::vector::{Embedding, EmbeddingConfig};
|
||||
use crate::{
|
||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
|
||||
@ -1516,6 +1516,42 @@ impl Index {
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
pub fn embeddings(
|
||||
&self,
|
||||
rtxn: &RoTxn<'_>,
|
||||
docid: DocumentId,
|
||||
) -> Result<BTreeMap<String, Vec<Embedding>>> {
|
||||
let mut res = BTreeMap::new();
|
||||
for row in self.embedder_category_id.iter(rtxn)? {
|
||||
let (embedder_name, embedder_id) = row?;
|
||||
let embedder_id = (embedder_id as u16) << 8;
|
||||
let mut embeddings = Vec::new();
|
||||
'vectors: for i in 0..=u8::MAX {
|
||||
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
|
||||
.map(Some)
|
||||
.or_else(|e| match e {
|
||||
arroy::Error::MissingMetadata => Ok(None),
|
||||
e => Err(e),
|
||||
})
|
||||
.transpose();
|
||||
|
||||
let Some(reader) = reader else {
|
||||
break 'vectors;
|
||||
};
|
||||
|
||||
let embedding = reader?.item_vector(rtxn, docid)?;
|
||||
if let Some(embedding) = embedding {
|
||||
embeddings.push(embedding)
|
||||
} else {
|
||||
break 'vectors;
|
||||
}
|
||||
}
|
||||
|
||||
res.insert(embedder_name.to_owned(), embeddings);
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
|
||||
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user