Mirror of https://github.com/meilisearch/MeiliSearch
Merge #4796

4796: Generate vectors in dumps r=dureuill a=dureuill

# Pull Request

## What does this PR do?

1. Add an `Index::embeddings` method to compute the embeddings of a document
2. Write generated vectors in dumps
3. Remove generated vectors when importing dumps
4. Cherry-pick the `ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION` workaround so that the older CI can still build

## Manual Tests (TODO)

- [ ] Import a dump from a v1.8.3 into a v1.8.4 successfully
- [x] Import a dump from a v1.8.4 into a v1.8.4 successfully
- [x] Import a dump from a v1.8.4 into a v1.9.0 successfully
  - [x] generated vectors are not regenerated
  - [x] user-provided vectors are still available
  - [x] generated vectors still have the correct value
  - [x] updating a document with generated vectors attempts to regenerate them

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
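To make the behavior concrete, here is a sketch of what a document written to a dump looks like after this change (not code from the PR; the document fields and the embedder name are invented): generated vectors land under `_vectors.<embedder>` together with `regenerate: true`, which is the marker the importer later uses to strip them.

```rust
use serde_json::json;

fn main() {
    // Hypothetical document as it would appear in a v1.8.4 dump:
    // the "default" embedder entry was injected at dump time with the
    // generated embeddings and the `regenerate: true` marker.
    let dumped = json!({
        "id": 1,
        "title": "Moana",
        "_vectors": {
            "default": { "embeddings": [[0.1, 0.2, 0.3]], "regenerate": true }
        }
    });
    println!("{dumped:#}");
}
```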
Commit: f0f02e6412
### .github/workflows/flaky-tests.yml (+2)

```diff
@@ -1,4 +1,6 @@
 name: Look for flaky tests
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
 on:
   workflow_dispatch:
   schedule:
```
### .github/workflows/fuzzer-indexing.yml (+2 −1)

```diff
@@ -1,5 +1,6 @@
 name: Run the indexing fuzzer
-
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
 on:
   push:
     branches:
```
### .github/workflows/publish-apt-brew-pkg.yml (+2)

```diff
@@ -15,6 +15,8 @@ jobs:
 
   debian:
     name: Publish debian packagge
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     runs-on: ubuntu-latest
     needs: check-version
     container:
```
### .github/workflows/publish-binaries.yml (+4)

```diff
@@ -35,6 +35,8 @@ jobs:
   publish-linux:
     name: Publish binary for Linux
     runs-on: ubuntu-latest
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     needs: check-version
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27
@@ -132,6 +134,8 @@ jobs:
     name: Publish binary for aarch64
     runs-on: ubuntu-latest
     needs: check-version
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27
       image: ubuntu:18.04
```
### .github/workflows/test-suite.yml (+8)

```diff
@@ -21,6 +21,8 @@ jobs:
   test-linux:
     name: Tests on ubuntu-18.04
     runs-on: ubuntu-latest
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
       image: ubuntu:18.04
@@ -77,6 +79,8 @@ jobs:
   test-all-features:
     name: Tests almost all features
     runs-on: ubuntu-latest
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
       image: ubuntu:18.04
@@ -100,6 +104,8 @@ jobs:
 
   test-disabled-tokenization:
     name: Test disabled tokenization
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     runs-on: ubuntu-latest
     container:
       image: ubuntu:18.04
@@ -127,6 +133,8 @@ jobs:
   # We run tests in debug also, to make sure that the debug_assertions are hit
   test-debug:
     name: Run tests in debug
+    env:
+      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     runs-on: ubuntu-latest
     container:
       # Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
```
### Dump creation (`impl IndexScheduler`)

```diff
@@ -914,8 +914,34 @@ impl IndexScheduler {
                 if self.must_stop_processing.get() {
                     return Err(Error::AbortedTask);
                 }
-                let (_id, doc) = ret?;
-                let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
+                let (id, doc) = ret?;
+                let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
+
+                'inject_vectors: {
+                    let embeddings = index.embeddings(&rtxn, id)?;
+
+                    if embeddings.is_empty() {
+                        break 'inject_vectors;
+                    }
+
+                    let vectors = document
+                        .entry("_vectors".to_owned())
+                        .or_insert(serde_json::Value::Object(Default::default()));
+
+                    let serde_json::Value::Object(vectors) = vectors else {
+                        break 'inject_vectors;
+                    };
+
+                    for (embedder_name, embeddings) in embeddings {
+                        vectors.entry(embedder_name).or_insert_with(|| {
+                            serde_json::json!({
+                                "embeddings": embeddings,
+                                "regenerate": true
+                            })
+                        });
+                    }
+                }
+
                 index_dumper.push_document(&document)?;
             }
 
```
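Note the `entry(...).or_insert_with(...)` above: it only fills in embedder entries that are absent from `_vectors`, so user-provided vectors in the document are never overwritten by generated ones. A minimal sketch of that precedence, assuming `serde_json` (embedder names invented):

```rust
use serde_json::{json, Map, Value};

fn main() {
    let mut vectors = Map::new();
    // User-provided entry, already present in the document.
    vectors.insert("manual".into(), json!({ "embeddings": [[1.0, 0.0]], "regenerate": false }));

    // Generated embeddings for two embedders, as the dump code would see them.
    for (name, generated) in [("manual", vec![vec![9.9_f32]]), ("default", vec![vec![0.1, 0.2]])] {
        vectors.entry(name).or_insert_with(|| {
            json!({ "embeddings": generated, "regenerate": true })
        });
    }

    // "manual" keeps its user-provided value; only "default" was injected.
    println!("{}", Value::Object(vectors));
}
```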
### Dump import (`fn import_dump`)

```diff
@@ -419,7 +419,41 @@ fn import_dump(
         let file = tempfile::tempfile()?;
         let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
         for document in index_reader.documents()? {
-            builder.append_json_object(&document?)?;
+            let mut document = document?;
+
+            'remove_injected_vectors: {
+                let Some(vectors) = document.get_mut("_vectors") else {
+                    break 'remove_injected_vectors;
+                };
+
+                let Some(vectors) = vectors.as_object_mut() else { break 'remove_injected_vectors };
+
+                vectors.retain(|_embedder, embedding_object| {
+                    // don't touch values that aren't objects
+                    let Some(embedding_object) = embedding_object.as_object() else {
+                        return true;
+                    };
+
+                    let mut has_regenerate_true = false;
+                    for (field, value) in embedding_object {
+                        match (field.as_str(), value) {
+                            // detected regenerate : true
+                            // if we don't have any superfluous field, we'll remove the entire entry
+                            ("regenerate", serde_json::Value::Bool(true)) => {
+                                has_regenerate_true = true;
+                            }
+                            // ignore embeddings
+                            ("embeddings", _) => continue,
+                            // any other field: immediately retain the entry
+                            _ => return true,
+                        }
+                    }
+                    // retain the entry unless it has regenerate: true
+                    !has_regenerate_true
+                })
+            }
+
+            builder.append_json_object(&document)?;
         }
 
         // This flush the content of the batch builder.
```
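A sketch of the `retain` rule above, under the same `serde_json` assumption: an embedder entry is dropped only when it has `regenerate: true` and no fields beyond `embeddings` and `regenerate`; anything else, including non-object values, survives the import untouched.

```rust
use serde_json::{json, Value};

// Same predicate as the `retain` closure in the diff, lifted out for testing.
fn keep_entry(entry: &Value) -> bool {
    let Some(obj) = entry.as_object() else { return true };
    let mut has_regenerate_true = false;
    for (field, value) in obj {
        match (field.as_str(), value) {
            ("regenerate", Value::Bool(true)) => has_regenerate_true = true,
            ("embeddings", _) => continue,
            _ => return true, // any superfluous field: keep the entry
        }
    }
    !has_regenerate_true
}

fn main() {
    // Injected at dump time -> removed, so it is regenerated on import.
    assert!(!keep_entry(&json!({ "embeddings": [[0.1]], "regenerate": true })));
    // User said "don't regenerate" -> kept.
    assert!(keep_entry(&json!({ "embeddings": [[0.1]], "regenerate": false })));
    // An extra field means it wasn't injected by the dump -> kept.
    assert!(keep_entry(&json!({ "embeddings": [[0.1]], "regenerate": true, "note": "mine" })));
    // Non-object values are left untouched.
    assert!(keep_entry(&json!([[0.1, 0.2]])));
    println!("all cases behave as expected");
}
```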
### `Index::embeddings` (`impl Index`)

```diff
@@ -22,7 +22,7 @@ use crate::heed_codec::{
 };
 use crate::order_by_map::OrderByMap;
 use crate::proximity::ProximityPrecision;
-use crate::vector::EmbeddingConfig;
+use crate::vector::{Embedding, EmbeddingConfig};
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
     FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
@@ -1516,6 +1516,42 @@ impl Index {
             .unwrap_or_default())
     }
+
+    pub fn embeddings(
+        &self,
+        rtxn: &RoTxn<'_>,
+        docid: DocumentId,
+    ) -> Result<BTreeMap<String, Vec<Embedding>>> {
+        let mut res = BTreeMap::new();
+        for row in self.embedder_category_id.iter(rtxn)? {
+            let (embedder_name, embedder_id) = row?;
+            let embedder_id = (embedder_id as u16) << 8;
+            let mut embeddings = Vec::new();
+            'vectors: for i in 0..=u8::MAX {
+                let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
+                    .map(Some)
+                    .or_else(|e| match e {
+                        arroy::Error::MissingMetadata => Ok(None),
+                        e => Err(e),
+                    })
+                    .transpose();
+
+                let Some(reader) = reader else {
+                    break 'vectors;
+                };
+
+                let embedding = reader?.item_vector(rtxn, docid)?;
+                if let Some(embedding) = embedding {
+                    embeddings.push(embedding)
+                } else {
+                    break 'vectors;
+                }
+            }
+
+            res.insert(embedder_name.to_owned(), embeddings);
+        }
+        Ok(res)
+    }
+
     pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
         self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
     }
```
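As an aside on the lookup above: each embedder owns a contiguous range of arroy index ids, with the embedder id in the high byte and a per-embedder store index in the low byte, which is why the loop probes `0..=u8::MAX` and stops at the first missing store. A tiny sketch of that layout (the helper name is mine, not from the PR):

```rust
// Arroy index-id layout implied by `embeddings` above: embedder id in the
// high byte, one of up to 256 vector stores in the low byte.
fn arroy_index_id(embedder_id: u8, store: u8) -> u16 {
    ((embedder_id as u16) << 8) | store as u16
}

fn main() {
    assert_eq!(arroy_index_id(0, 0), 0x0000);
    assert_eq!(arroy_index_id(1, 0), 0x0100);
    assert_eq!(arroy_index_id(1, 255), 0x01FF);
    println!("ok");
}
```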