4649: Don't store the vectors in the documents database r=dureuill a=irevoire

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/4607

## What does this PR do?
- Ensure that anything falling under `_vectors` is NOT searchable, filterable or sortable
- [x] per embedder, add a roaring bitmap of documents that provide "userProvided" embeddings
- [x] in the indexing process in extract_vector_points, set the bit corresponding to the document depending on the "userProvided" subfield in the _vectors field.
- [x] in the document DB in typed chunks, when writing the _vectors field, remove all keys corresponding to an embedder

Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-06-17 12:32:03 +00:00 committed by GitHub
commit e9bf4c43a4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
60 changed files with 3920 additions and 1126 deletions

6
Cargo.lock generated
View File

@ -2455,6 +2455,7 @@ name = "index-scheduler"
version = "1.9.0" version = "1.9.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"arroy",
"big_s", "big_s",
"bincode", "bincode",
"crossbeam", "crossbeam",
@ -2465,6 +2466,7 @@ dependencies = [
"file-store", "file-store",
"flate2", "flate2",
"insta", "insta",
"maplit",
"meili-snap", "meili-snap",
"meilisearch-auth", "meilisearch-auth",
"meilisearch-types", "meilisearch-types",
@ -5301,9 +5303,9 @@ dependencies = [
[[package]] [[package]]
name = "tracing-actix-web" name = "tracing-actix-web"
version = "0.7.10" version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa069bd1503dd526ee793bb3fce408895136c95fc86d2edb2acf1c646d7f0684" checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19"
dependencies = [ dependencies = [
"actix-web", "actix-web",
"mutually_exclusive_features", "mutually_exclusive_features",

View File

@ -780,7 +780,7 @@ expression: document
1.3484878540039063 1.3484878540039063
] ]
], ],
"userProvided": false "regenerate": true
} }
} }
} }

View File

@ -779,7 +779,7 @@ expression: document
1.04031240940094 1.04031240940094
] ]
], ],
"userProvided": false "regenerate": true
} }
} }
} }

View File

@ -152,6 +152,7 @@ impl Settings<Unchecked> {
} }
#[derive(Debug, Clone, Deserialize)] #[derive(Debug, Clone, Deserialize)]
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[cfg_attr(test, derive(serde::Serialize))] #[cfg_attr(test, derive(serde::Serialize))]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]

View File

@ -182,6 +182,7 @@ impl Settings<Unchecked> {
} }
} }
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[derive(Debug, Clone, Deserialize)] #[derive(Debug, Clone, Deserialize)]
#[cfg_attr(test, derive(serde::Serialize))] #[cfg_attr(test, derive(serde::Serialize))]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]

View File

@ -200,6 +200,7 @@ impl std::ops::Deref for IndexUid {
} }
} }
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[derive(Debug)] #[derive(Debug)]
#[cfg_attr(test, derive(serde::Serialize))] #[cfg_attr(test, derive(serde::Serialize))]
#[cfg_attr(test, serde(rename_all = "camelCase"))] #[cfg_attr(test, serde(rename_all = "camelCase"))]

View File

@ -40,7 +40,9 @@ ureq = "2.9.7"
uuid = { version = "1.6.1", features = ["serde", "v4"] } uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies] [dev-dependencies]
arroy = "0.3.1"
big_s = "1.0.2" big_s = "1.0.2"
crossbeam = "0.8.4" crossbeam = "0.8.4"
insta = { version = "1.34.0", features = ["json", "redactions"] } insta = { version = "1.34.0", features = ["json", "redactions"] }
maplit = "1.0.2"
meili-snap = { path = "../meili-snap" } meili-snap = { path = "../meili-snap" }

View File

@ -909,6 +909,7 @@ impl IndexScheduler {
let fields_ids_map = index.fields_ids_map(&rtxn)?; let fields_ids_map = index.fields_ids_map(&rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(&rtxn)?;
// 3.1. Dump the documents // 3.1. Dump the documents
for ret in index.all_documents(&rtxn)? { for ret in index.all_documents(&rtxn)? {
@ -951,16 +952,21 @@ impl IndexScheduler {
}; };
for (embedder_name, embeddings) in embeddings { for (embedder_name, embeddings) in embeddings {
// don't change the entry if it already exists, because it was user-provided let user_provided = embedding_configs
vectors.entry(embedder_name).or_insert_with(|| { .iter()
let embeddings = ExplicitVectors { .find(|conf| conf.name == embedder_name)
embeddings: VectorOrArrayOfVectors::from_array_of_vectors( .is_some_and(|conf| conf.user_provided.contains(id));
embeddings,
), let embeddings = ExplicitVectors {
user_provided: false, embeddings: Some(
}; VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
serde_json::to_value(embeddings).unwrap() ),
}); regenerate: !user_provided,
};
vectors.insert(
embedder_name,
serde_json::to_value(embeddings).unwrap(),
);
} }
} }

View File

@ -53,6 +53,7 @@ use meilisearch_types::heed::byteorder::BE;
use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128};
use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn};
use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::documents::DocumentsBatchBuilder;
use meilisearch_types::milli::index::IndexEmbeddingConfig;
use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs};
use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
@ -1459,33 +1460,39 @@ impl IndexScheduler {
// TODO: consider using a type alias or a struct embedder/template // TODO: consider using a type alias or a struct embedder/template
pub fn embedders( pub fn embedders(
&self, &self,
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>, embedding_configs: Vec<IndexEmbeddingConfig>,
) -> Result<EmbeddingConfigs> { ) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs let res: Result<_> = embedding_configs
.into_iter() .into_iter()
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| { .map(
let prompt = |IndexEmbeddingConfig {
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); name,
// optimistically return existing embedder config: milli::vector::EmbeddingConfig { embedder_options, prompt },
{ ..
let embedders = self.embedders.read().unwrap(); }| {
if let Some(embedder) = embedders.get(&embedder_options) { let prompt =
return Ok((name, (embedder.clone(), prompt))); Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
// optimistically return existing embedder
{
let embedders = self.embedders.read().unwrap();
if let Some(embedder) = embedders.get(&embedder_options) {
return Ok((name, (embedder.clone(), prompt)));
}
} }
}
// add missing embedder // add missing embedder
let embedder = Arc::new( let embedder = Arc::new(
Embedder::new(embedder_options.clone()) Embedder::new(embedder_options.clone())
.map_err(meilisearch_types::milli::vector::Error::from) .map_err(meilisearch_types::milli::vector::Error::from)
.map_err(meilisearch_types::milli::Error::from)?, .map_err(meilisearch_types::milli::Error::from)?,
); );
{ {
let mut embedders = self.embedders.write().unwrap(); let mut embedders = self.embedders.write().unwrap();
embedders.insert(embedder_options, embedder.clone()); embedders.insert(embedder_options, embedder.clone());
} }
Ok((name, (embedder, prompt))) Ok((name, (embedder, prompt)))
}) },
)
.collect(); .collect();
res.map(EmbeddingConfigs::new) res.map(EmbeddingConfigs::new)
} }
@ -1748,6 +1755,9 @@ mod tests {
use meilisearch_types::milli::update::IndexDocumentsMethod::{ use meilisearch_types::milli::update::IndexDocumentsMethod::{
ReplaceDocuments, UpdateDocuments, ReplaceDocuments, UpdateDocuments,
}; };
use meilisearch_types::milli::update::Setting;
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
use meilisearch_types::settings::Unchecked;
use meilisearch_types::tasks::IndexSwap; use meilisearch_types::tasks::IndexSwap;
use meilisearch_types::VERSION_FILE_NAME; use meilisearch_types::VERSION_FILE_NAME;
use tempfile::{NamedTempFile, TempDir}; use tempfile::{NamedTempFile, TempDir};
@ -1826,6 +1836,7 @@ mod tests {
assert_eq!(breakpoint, (Init, false)); assert_eq!(breakpoint, (Init, false));
let index_scheduler_handle = IndexSchedulerHandle { let index_scheduler_handle = IndexSchedulerHandle {
_tempdir: tempdir, _tempdir: tempdir,
index_scheduler: index_scheduler.private_clone(),
test_breakpoint_rcv: receiver, test_breakpoint_rcv: receiver,
last_breakpoint: breakpoint.0, last_breakpoint: breakpoint.0,
}; };
@ -1914,6 +1925,7 @@ mod tests {
pub struct IndexSchedulerHandle { pub struct IndexSchedulerHandle {
_tempdir: TempDir, _tempdir: TempDir,
index_scheduler: IndexScheduler,
test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>,
last_breakpoint: Breakpoint, last_breakpoint: Breakpoint,
} }
@ -1931,9 +1943,13 @@ mod tests {
{ {
Ok(b) => b, Ok(b) => b,
Err(RecvTimeoutError::Timeout) => { Err(RecvTimeoutError::Timeout) => {
panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") let state = snapshot_index_scheduler(&self.index_scheduler);
panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}")
}
Err(RecvTimeoutError::Disconnected) => {
let state = snapshot_index_scheduler(&self.index_scheduler);
panic!("The scheduler crashed.\n{state}")
} }
Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."),
}; };
// if we've already encountered a breakpoint we're supposed to be stuck on the false // if we've already encountered a breakpoint we're supposed to be stuck on the false
// and we expect the same variant with the true to come now. // and we expect the same variant with the true to come now.
@ -1952,9 +1968,13 @@ mod tests {
{ {
Ok(b) => b, Ok(b) => b,
Err(RecvTimeoutError::Timeout) => { Err(RecvTimeoutError::Timeout) => {
panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") let state = snapshot_index_scheduler(&self.index_scheduler);
panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}")
}
Err(RecvTimeoutError::Disconnected) => {
let state = snapshot_index_scheduler(&self.index_scheduler);
panic!("The scheduler crashed.\n{state}")
} }
Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."),
}; };
assert!(!b, "Found the breakpoint handle in a bad state. Check your test suite"); assert!(!b, "Found the breakpoint handle in a bad state. Check your test suite");
@ -1968,9 +1988,10 @@ mod tests {
fn advance_till(&mut self, breakpoints: impl IntoIterator<Item = Breakpoint>) { fn advance_till(&mut self, breakpoints: impl IntoIterator<Item = Breakpoint>) {
for breakpoint in breakpoints { for breakpoint in breakpoints {
let b = self.advance(); let b = self.advance();
let state = snapshot_index_scheduler(&self.index_scheduler);
assert_eq!( assert_eq!(
b, breakpoint, b, breakpoint,
"Was expecting the breakpoint `{:?}` but instead got `{:?}`.", "Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{state}",
breakpoint, b breakpoint, b
); );
} }
@ -1995,6 +2016,7 @@ mod tests {
// Wait for one successful batch. // Wait for one successful batch.
#[track_caller] #[track_caller]
fn advance_one_successful_batch(&mut self) { fn advance_one_successful_batch(&mut self) {
self.index_scheduler.assert_internally_consistent();
self.advance_till([Start, BatchCreated]); self.advance_till([Start, BatchCreated]);
loop { loop {
match self.advance() { match self.advance() {
@ -2003,13 +2025,17 @@ mod tests {
InsideProcessBatch => (), InsideProcessBatch => (),
// the batch went successfully, we can stop the loop and go on with the next states. // the batch went successfully, we can stop the loop and go on with the next states.
ProcessBatchSucceeded => break, ProcessBatchSucceeded => break,
AbortedIndexation => panic!("The batch was aborted."), AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)),
ProcessBatchFailed => panic!("The batch failed."), ProcessBatchFailed => {
while self.advance() != Start {}
panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler))
},
breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint),
} }
} }
self.advance_till([AfterProcessing]); self.advance_till([AfterProcessing]);
self.index_scheduler.assert_internally_consistent();
} }
// Wait for one failed batch. // Wait for one failed batch.
@ -2023,8 +2049,8 @@ mod tests {
InsideProcessBatch => (), InsideProcessBatch => (),
// the batch failed, we can stop the loop and go on with the next states. // the batch failed, we can stop the loop and go on with the next states.
ProcessBatchFailed => break, ProcessBatchFailed => break,
ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)"), ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)),
AbortedIndexation => panic!("The batch was aborted."), AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)),
breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint),
} }
} }
@ -3052,8 +3078,10 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let configs = index.embedding_configs(&rtxn).unwrap(); let configs = index.embedding_configs(&rtxn).unwrap();
let (_, embedding_config) = configs.first().unwrap(); let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap();
insta::assert_json_snapshot!(embedding_config.embedder_options); insta::assert_snapshot!(name, @"default");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(config.embedder_options);
} }
#[test] #[test]
@ -4989,7 +5017,6 @@ mod tests {
false, false,
) )
.unwrap(); .unwrap();
index_scheduler.assert_internally_consistent();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors");
@ -5000,7 +5027,7 @@ mod tests {
insta::assert_json_snapshot!(task.details); insta::assert_json_snapshot!(task.details);
} }
handle.advance_n_successful_batches(1); handle.advance_one_successful_batch();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors");
{ {
@ -5017,13 +5044,17 @@ mod tests {
let configs = index.embedding_configs(&rtxn).unwrap(); let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below // for consistency with the below
#[allow(clippy::get_first)] #[allow(clippy::get_first)]
let (name, fakerest_config) = configs.get(0).unwrap(); let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } =
insta::assert_json_snapshot!(name, @r###""A_fakerest""###); configs.get(0).unwrap();
insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(fakerest_config.embedder_options); insta::assert_json_snapshot!(fakerest_config.embedder_options);
let fakerest_name = name.clone(); let fakerest_name = name.clone();
let (name, simple_hf_config) = configs.get(1).unwrap(); let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } =
insta::assert_json_snapshot!(name, @r###""B_small_hf""###); configs.get(1).unwrap();
insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(simple_hf_config.embedder_options); insta::assert_json_snapshot!(simple_hf_config.embedder_options);
let simple_hf_name = name.clone(); let simple_hf_name = name.clone();
@ -5038,25 +5069,25 @@ mod tests {
// add one doc, specifying vectors // add one doc, specifying vectors
let doc = serde_json::json!( let doc = serde_json::json!(
{ {
"id": 0, "id": 0,
"doggo": "Intel", "doggo": "Intel",
"breed": "beagle", "breed": "beagle",
"_vectors": { "_vectors": {
&fakerest_name: { &fakerest_name: {
// this will never trigger regeneration, which is good because we can't actually generate with // this will never trigger regeneration, which is good because we can't actually generate with
// this embedder // this embedder
"userProvided": true, "regenerate": false,
"embeddings": beagle_embed, "embeddings": beagle_embed,
}, },
&simple_hf_name: { &simple_hf_name: {
// this will be regenerated on updates // this will be regenerated on updates
"userProvided": false, "regenerate": true,
"embeddings": lab_embed, "embeddings": lab_embed,
}, },
"noise": [0.1, 0.2, 0.3] "noise": [0.1, 0.2, 0.3]
} }
} }
); );
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap(); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap();
@ -5078,7 +5109,6 @@ mod tests {
false, false,
) )
.unwrap(); .unwrap();
index_scheduler.assert_internally_consistent();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel"); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel");
@ -5091,6 +5121,19 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap(); let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
// Ensure the document has been inserted into the relevant bitmap
let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below
#[allow(clippy::get_first)]
let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } =
configs.get(0).unwrap();
insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap();
insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
let embeddings = index.embeddings(&rtxn, 0).unwrap(); let embeddings = index.embeddings(&rtxn, 0).unwrap();
assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
@ -5140,7 +5183,6 @@ mod tests {
false, false,
) )
.unwrap(); .unwrap();
index_scheduler.assert_internally_consistent();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir");
@ -5153,11 +5195,25 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap(); let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
// Ensure the document has been inserted into the relevant bitmap
let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below
#[allow(clippy::get_first)]
let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } =
configs.get(0).unwrap();
insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
let IndexEmbeddingConfig { name, config: _, user_provided } =
configs.get(1).unwrap();
insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
let embeddings = index.embeddings(&rtxn, 0).unwrap(); let embeddings = index.embeddings(&rtxn, 0).unwrap();
// automatically changed to patou // automatically changed to patou because set to regenerate
assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true");
// remained beagle because set to userProvided // remained beagle
assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true");
let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
@ -5176,4 +5232,578 @@ mod tests {
} }
} }
} }
// Scenario covered by this test:
// 1. Push five documents, four of which carry a `_vectors` field, before any embedder is configured
// 2. Register settings declaring the `my_doggo_embedder` embedder
// 3. The vectors of that embedder must have left the documents DB, and only the documents
//    that did not ask for regeneration must be listed in the `user_provided` bitmap
// 4. Update documents 3 and 4 and check that they end up with an embedding after the update
#[test]
fn import_vectors_first_and_embedder_later() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
// ids 1 and 2 provide embeddings without asking for regeneration (bare array, or
// `regenerate: false`), ids 3 and 4 set `regenerate: true`, id 0 has no `_vectors` at all
let content = serde_json::json!(
[
{
"id": 0,
"doggo": "kefir",
},
{
"id": 1,
"doggo": "intel",
"_vectors": {
"my_doggo_embedder": vec![1; 384],
"unknown embedder": vec![1, 2, 3],
}
},
{
"id": 2,
"doggo": "max",
"_vectors": {
"my_doggo_embedder": {
"regenerate": false,
"embeddings": vec![2; 384],
},
"unknown embedder": vec![4, 5],
},
},
{
"id": 3,
"doggo": "marcel",
"_vectors": {
"my_doggo_embedder": {
"regenerate": true,
"embeddings": vec![3; 384],
},
},
},
{
"id": 4,
"doggo": "sora",
"_vectors": {
"my_doggo_embedder": {
"regenerate": true,
},
},
},
]
);
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
let documents_count =
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
.unwrap();
snapshot!(documents_count, @"5");
file.persist().unwrap();
index_scheduler
.register(
KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: None,
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push");
let setting = meilisearch_types::settings::Settings::<Unchecked> {
embedders: Setting::Set(maplit::btreemap! {
S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
document_template: Setting::Set(S("{{doc.doggo}}")),
..Default::default()
})
}),
..Default::default()
};
index_scheduler
.register(
KindWithContent::SettingsUpdate {
index_uid: S("doggos"),
new_settings: Box::new(setting),
is_deletion: false,
allow_index_creation: false,
},
None,
false,
)
.unwrap();
// no explicit `assert_internally_consistent` around this call: the handle already
// runs that check when entering and when leaving `advance_one_successful_batch`
handle.advance_one_successful_batch();
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
// all the vectors linked to the newly specified embedder have been removed,
// only the vectors of the unknown embedder stay in the document DB
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###);
let conf = index.embedding_configs(&rtxn).unwrap();
// even though we specified the vector for the ID 3, it shouldn't be marked
// as user provided since we explicitly marked it as NOT user provided.
snapshot!(format!("{conf:#?}"), @r###"
[
IndexEmbeddingConfig {
name: "my_doggo_embedder",
config: EmbeddingConfig {
embedder_options: HuggingFace(
EmbedderOptions {
model: "sentence-transformers/all-MiniLM-L6-v2",
revision: Some(
"e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
),
distribution: None,
},
),
prompt: PromptData {
template: "{{doc.doggo}}",
},
},
user_provided: RoaringBitmap<[1, 2]>,
},
]
"###);
// the document with the id 0 had no `_vectors` field but must still have an embedding
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
let embeddings = index.embeddings(&rtxn, docid).unwrap();
let embedding = &embeddings["my_doggo_embedder"];
assert!(!embedding.is_empty(), "{embedding:?}");
// the document with the id 3 should keep its original embedding
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
let mut embeddings = Vec::new();
// walk the arroy readers one index at a time, stopping at the first index that has
// no metadata or that holds no vector for this document
'vectors: for i in 0..=u8::MAX {
let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata => Ok(None),
e => Err(e),
})
.transpose();
let Some(reader) = reader else {
break 'vectors;
};
let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap();
if let Some(embedding) = embedding {
embeddings.push(embedding)
} else {
break 'vectors;
}
}
snapshot!(embeddings.len(), @"1");
assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);
// If we update marcel it should regenerate its embedding automatically
let content = serde_json::json!(
[
{
"id": 3,
"doggo": "marvel",
},
{
"id": 4,
"doggo": "sorry",
},
]
);
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap();
let documents_count =
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
.unwrap();
snapshot!(documents_count, @"2");
file.persist().unwrap();
index_scheduler
.register(
KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: None,
method: UpdateDocuments,
content_file: uuid,
documents_count,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
// the document with the id 3 should have its original embedding updated
let rtxn = index.read_txn().unwrap();
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
let doc = index.documents(&rtxn, Some(docid)).unwrap()[0];
let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap();
snapshot!(json_string!(doc), @r###"
{
"id": 3,
"doggo": "marvel"
}
"###);
let embeddings = index.embeddings(&rtxn, docid).unwrap();
let embedding = &embeddings["my_doggo_embedder"];
assert!(!embedding.is_empty());
assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);
// the document with the id 4 should generate an embedding
let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap();
let embeddings = index.embeddings(&rtxn, docid).unwrap();
let embedding = &embeddings["my_doggo_embedder"];
assert!(!embedding.is_empty());
}
#[test]
fn delete_document_containing_vector() {
// 1. Add an embedder
// 2. Push two documents containing a simple vector
// 3. Delete the second document (external id `1`)
// 4. The user defined roaring bitmap must no longer contain the id of the deleted document
// 5. Clear the index
// 6. The user defined roaring bitmap must be empty
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
// declare a single user-provided embedder named `manual` with 3 dimensions
let setting = meilisearch_types::settings::Settings::<Unchecked> {
embedders: Setting::Set(maplit::btreemap! {
S("manual") => Setting::Set(EmbeddingSettings {
source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided),
dimensions: Setting::Set(3),
..Default::default()
})
}),
..Default::default()
};
index_scheduler
.register(
KindWithContent::SettingsUpdate {
index_uid: S("doggos"),
new_settings: Box::new(setting),
is_deletion: false,
allow_index_creation: true,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
// both documents provide a vector for the `manual` embedder
let content = serde_json::json!(
[
{
"id": 0,
"doggo": "kefir",
"_vectors": {
"manual": vec![0, 0, 0],
}
},
{
"id": 1,
"doggo": "intel",
"_vectors": {
"manual": vec![1, 1, 1],
}
},
]
);
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
let documents_count =
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
.unwrap();
snapshot!(documents_count, @"2");
file.persist().unwrap();
index_scheduler
.register(
KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: None,
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: false,
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
// delete the document whose external id is `1`
index_scheduler
.register(
KindWithContent::DocumentDeletion {
index_uid: S("doggos"),
documents_ids: vec![S("1")],
},
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
// only the document `0` remains, and its vector is not part of the stored document
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###);
let conf = index.embedding_configs(&rtxn).unwrap();
// the `user_provided` bitmap only lists the remaining document
snapshot!(format!("{conf:#?}"), @r###"
[
IndexEmbeddingConfig {
name: "manual",
config: EmbeddingConfig {
embedder_options: UserProvided(
EmbedderOptions {
dimensions: 3,
distribution: None,
},
),
prompt: PromptData {
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
},
},
user_provided: RoaringBitmap<[0]>,
},
]
"###);
// the remaining document must still have its embedding
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
let embeddings = index.embeddings(&rtxn, docid).unwrap();
let embedding = &embeddings["manual"];
assert!(!embedding.is_empty(), "{embedding:?}");
// clear the whole index
index_scheduler
.register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false)
.unwrap();
handle.advance_one_successful_batch();
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string(&documents).unwrap(), @"[]");
let conf = index.embedding_configs(&rtxn).unwrap();
// the embedder is still configured but its `user_provided` bitmap is now empty
snapshot!(format!("{conf:#?}"), @r###"
[
IndexEmbeddingConfig {
name: "manual",
config: EmbeddingConfig {
embedder_options: UserProvided(
EmbedderOptions {
dimensions: 3,
distribution: None,
},
),
prompt: PromptData {
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
},
},
user_provided: RoaringBitmap<[]>,
},
]
"###);
}
#[test]
fn delete_embedder_with_user_provided_vectors() {
    // Scenario exercised by this test:
    // 1. Add two embedders (`manual`, user provided, and `my_doggo_embedder`)
    // 2. Push two documents containing a simple vector
    // 3. The documents must not contain the vectors after the update as they are in the vectors db
    // 4. Delete the `manual` embedder: its vectors show up in the documents again
    // 5. Delete all the embedders: the documents contain every vector again
    let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);

    // 1. Register both embedders through a settings update.
    let setting = meilisearch_types::settings::Settings::<Unchecked> {
        embedders: Setting::Set(maplit::btreemap! {
            S("manual") => Setting::Set(EmbeddingSettings {
                source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided),
                dimensions: Setting::Set(3),
                ..Default::default()
            }),
            S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
                source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
                model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
                revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
                document_template: Setting::Set(S("{{doc.doggo}}")),
                ..Default::default()
            }),
        }),
        ..Default::default()
    };
    index_scheduler
        .register(
            KindWithContent::SettingsUpdate {
                index_uid: S("doggos"),
                new_settings: Box::new(setting),
                is_deletion: false,
                allow_index_creation: true,
            },
            None,
            false,
        )
        .unwrap();
    handle.advance_one_successful_batch();

    // 2. Send two documents; only the first one provides a vector for `my_doggo_embedder`.
    let content = serde_json::json!(
        [
            {
                "id": 0,
                "doggo": "kefir",
                "_vectors": {
                    "manual": vec![0, 0, 0],
                    "my_doggo_embedder": vec![1; 384],
                }
            },
            {
                "id": 1,
                "doggo": "intel",
                "_vectors": {
                    "manual": vec![1, 1, 1],
                }
            },
        ]
    );
    let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
    let documents_count =
        read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
            .unwrap();
    snapshot!(documents_count, @"2");
    file.persist().unwrap();
    index_scheduler
        .register(
            KindWithContent::DocumentAdditionOrUpdate {
                index_uid: S("doggos"),
                primary_key: None,
                method: ReplaceDocuments,
                content_file: uuid,
                documents_count,
                allow_index_creation: false,
            },
            None,
            false,
        )
        .unwrap();
    handle.advance_one_successful_batch();

    // 3. The stored documents no longer carry any `_vectors` field.
    {
        let index = index_scheduler.index("doggos").unwrap();
        let rtxn = index.read_txn().unwrap();
        let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let field_ids = field_ids_map.ids().collect::<Vec<_>>();
        let documents = index
            .all_documents(&rtxn)
            .unwrap()
            .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
            .collect::<Vec<_>>();
        snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###);
    }

    // 4. Remove the `manual` embedder only.
    {
        let setting = meilisearch_types::settings::Settings::<Unchecked> {
            embedders: Setting::Set(maplit::btreemap! {
                S("manual") => Setting::Reset,
            }),
            ..Default::default()
        };
        index_scheduler
            .register(
                KindWithContent::SettingsUpdate {
                    index_uid: S("doggos"),
                    new_settings: Box::new(setting),
                    is_deletion: false,
                    allow_index_creation: true,
                },
                None,
                false,
            )
            .unwrap();
        handle.advance_one_successful_batch();
    }

    // The `manual` vectors are back in the documents; `my_doggo_embedder` ones are not.
    {
        let index = index_scheduler.index("doggos").unwrap();
        let rtxn = index.read_txn().unwrap();
        let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let field_ids = field_ids_map.ids().collect::<Vec<_>>();
        let documents = index
            .all_documents(&rtxn)
            .unwrap()
            .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
            .collect::<Vec<_>>();
        snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###);
    }

    // 5. Remove every remaining embedder.
    {
        let setting = meilisearch_types::settings::Settings::<Unchecked> {
            embedders: Setting::Reset,
            ..Default::default()
        };
        index_scheduler
            .register(
                KindWithContent::SettingsUpdate {
                    index_uid: S("doggos"),
                    new_settings: Box::new(setting),
                    is_deletion: false,
                    allow_index_creation: true,
                },
                None,
                false,
            )
            .unwrap();
        handle.advance_one_successful_batch();
    }

    // All the vectors that were provided with the documents are back in the documents.
    {
        let index = index_scheduler.index("doggos").unwrap();
        let rtxn = index.read_txn().unwrap();
        let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let field_ids = field_ids_map.ids().collect::<Vec<_>>();
        let documents = index
            .all_documents(&rtxn)
            .unwrap()
            .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
            .collect::<Vec<_>>();
        // The redaction must be applied to the structured documents (not to an
        // already-serialized string) and must name the actual embedder, otherwise
        // the 384-dimension vector ends up verbatim in the snapshot.
        snapshot!(json_string!(documents, { "[]._vectors.my_doggo_embedder.embeddings" => "[vector]" }), @r###"
        [
          {
            "id": 0,
            "doggo": "kefir",
            "_vectors": {
              "manual": {
                "embeddings": [
                  [
                    0.0,
                    0.0,
                    0.0
                  ]
                ],
                "regenerate": false
              },
              "my_doggo_embedder": {
                "embeddings": "[vector]",
                "regenerate": false
              }
            }
          },
          {
            "id": 1,
            "doggo": "intel",
            "_vectors": {
              "manual": {
                "embeddings": [
                  [
                    1.0,
                    1.0,
                    1.0
                  ]
                ],
                "regenerate": false
              }
            }
          }
        ]
        "###);
    }
}
} }

View File

@ -6,10 +6,6 @@ expression: doc
"doggo": "Intel", "doggo": "Intel",
"breed": "beagle", "breed": "beagle",
"_vectors": { "_vectors": {
"A_fakerest": {
"embeddings": "[vector]",
"userProvided": true
},
"noise": [ "noise": [
0.1, 0.1,
0.2, 0.2,

View File

@ -6,10 +6,6 @@ expression: doc
"doggo": "kefir", "doggo": "kefir",
"breed": "patou", "breed": "patou",
"_vectors": { "_vectors": {
"A_fakerest": {
"embeddings": "[vector]",
"userProvided": true
},
"noise": [ "noise": [
0.1, 0.1,
0.2, 0.2,

View File

@ -222,6 +222,7 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ;
InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ; InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ; InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFields , InvalidRequest , BAD_REQUEST ; InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ;
MissingDocumentFilter , InvalidRequest , BAD_REQUEST ; MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
@ -240,9 +241,11 @@ InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ;
InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ;
InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ;
InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
InvalidSimilarRetrieveVectors , InvalidRequest , BAD_REQUEST ;
InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ;
InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ;
InvalidSearchRetrieveVectors , InvalidRequest , BAD_REQUEST ;
InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ; InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ;
InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ; InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ;
InvalidSearchFacets , InvalidRequest , BAD_REQUEST ; InvalidSearchFacets , InvalidRequest , BAD_REQUEST ;

View File

@ -8,6 +8,7 @@ use std::str::FromStr;
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
use fst::IntoStreamer; use fst::IntoStreamer;
use milli::index::IndexEmbeddingConfig;
use milli::proximity::ProximityPrecision; use milli::proximity::ProximityPrecision;
use milli::update::Setting; use milli::update::Setting;
use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET};
@ -672,7 +673,7 @@ pub fn settings(
let embedders: BTreeMap<_, _> = index let embedders: BTreeMap<_, _> = index
.embedding_configs(rtxn)? .embedding_configs(rtxn)?
.into_iter() .into_iter()
.map(|(name, config)| (name, Setting::Set(config.into()))) .map(|IndexEmbeddingConfig { name, config, .. }| (name, Setting::Set(config.into())))
.collect(); .collect();
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };

View File

@ -74,8 +74,8 @@ pub enum DocumentDeletionKind {
#[derive(Copy, Clone, Debug, PartialEq, Eq)] #[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum DocumentFetchKind { pub enum DocumentFetchKind {
PerDocumentId, PerDocumentId { retrieve_vectors: bool },
Normal { with_filter: bool, limit: usize, offset: usize }, Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
} }
pub trait Analytics: Sync + Send { pub trait Analytics: Sync + Send {

View File

@ -622,6 +622,7 @@ pub struct SearchAggregator {
// Whether a non-default embedder was specified // Whether a non-default embedder was specified
embedder: bool, embedder: bool,
hybrid: bool, hybrid: bool,
retrieve_vectors: bool,
// every time a search is done, we increment the counter linked to the used settings // every time a search is done, we increment the counter linked to the used settings
matching_strategy: HashMap<String, usize>, matching_strategy: HashMap<String, usize>,
@ -662,6 +663,7 @@ impl SearchAggregator {
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve: _, attributes_to_retrieve: _,
retrieve_vectors,
attributes_to_crop: _, attributes_to_crop: _,
crop_length, crop_length,
attributes_to_highlight: _, attributes_to_highlight: _,
@ -728,6 +730,7 @@ impl SearchAggregator {
if let Some(ref vector) = vector { if let Some(ref vector) = vector {
ret.max_vector_size = vector.len(); ret.max_vector_size = vector.len();
} }
ret.retrieve_vectors |= retrieve_vectors;
if query.is_finite_pagination() { if query.is_finite_pagination() {
let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT);
@ -803,6 +806,7 @@ impl SearchAggregator {
attributes_to_search_on_total_number_of_uses, attributes_to_search_on_total_number_of_uses,
max_terms_number, max_terms_number,
max_vector_size, max_vector_size,
retrieve_vectors,
matching_strategy, matching_strategy,
max_limit, max_limit,
max_offset, max_offset,
@ -873,6 +877,7 @@ impl SearchAggregator {
// vector // vector
self.max_vector_size = self.max_vector_size.max(max_vector_size); self.max_vector_size = self.max_vector_size.max(max_vector_size);
self.retrieve_vectors |= retrieve_vectors;
self.semantic_ratio |= semantic_ratio; self.semantic_ratio |= semantic_ratio;
self.hybrid |= hybrid; self.hybrid |= hybrid;
self.embedder |= embedder; self.embedder |= embedder;
@ -929,6 +934,7 @@ impl SearchAggregator {
attributes_to_search_on_total_number_of_uses, attributes_to_search_on_total_number_of_uses,
max_terms_number, max_terms_number,
max_vector_size, max_vector_size,
retrieve_vectors,
matching_strategy, matching_strategy,
max_limit, max_limit,
max_offset, max_offset,
@ -991,6 +997,7 @@ impl SearchAggregator {
}, },
"vector": { "vector": {
"max_vector_size": max_vector_size, "max_vector_size": max_vector_size,
"retrieve_vectors": retrieve_vectors,
}, },
"hybrid": { "hybrid": {
"enabled": hybrid, "enabled": hybrid,
@ -1079,6 +1086,7 @@ impl MultiSearchAggregator {
page: _, page: _,
hits_per_page: _, hits_per_page: _,
attributes_to_retrieve: _, attributes_to_retrieve: _,
retrieve_vectors: _,
attributes_to_crop: _, attributes_to_crop: _,
crop_length: _, crop_length: _,
attributes_to_highlight: _, attributes_to_highlight: _,
@ -1534,6 +1542,9 @@ pub struct DocumentsFetchAggregator {
// if a filter was used // if a filter was used
per_filter: bool, per_filter: bool,
#[serde(rename = "vector.retrieve_vectors")]
retrieve_vectors: bool,
// pagination // pagination
#[serde(rename = "pagination.max_limit")] #[serde(rename = "pagination.max_limit")]
max_limit: usize, max_limit: usize,
@ -1543,18 +1554,21 @@ pub struct DocumentsFetchAggregator {
impl DocumentsFetchAggregator { impl DocumentsFetchAggregator {
pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self {
let (limit, offset) = match query { let (limit, offset, retrieve_vectors) = match query {
DocumentFetchKind::PerDocumentId => (1, 0), DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors),
DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset), DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => {
(*limit, *offset, *retrieve_vectors)
}
}; };
Self { Self {
timestamp: Some(OffsetDateTime::now_utc()), timestamp: Some(OffsetDateTime::now_utc()),
user_agents: extract_user_agents(request).into_iter().collect(), user_agents: extract_user_agents(request).into_iter().collect(),
total_received: 1, total_received: 1,
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId), per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }),
per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
max_limit: limit, max_limit: limit,
max_offset: offset, max_offset: offset,
retrieve_vectors,
} }
} }
@ -1568,6 +1582,7 @@ impl DocumentsFetchAggregator {
per_filter, per_filter,
max_limit, max_limit,
max_offset, max_offset,
retrieve_vectors,
} = other; } = other;
if self.timestamp.is_none() { if self.timestamp.is_none() {
@ -1583,6 +1598,8 @@ impl DocumentsFetchAggregator {
self.max_limit = self.max_limit.max(max_limit); self.max_limit = self.max_limit.max(max_limit);
self.max_offset = self.max_offset.max(max_offset); self.max_offset = self.max_offset.max(max_offset);
self.retrieve_vectors |= retrieve_vectors;
} }
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
@ -1623,6 +1640,7 @@ pub struct SimilarAggregator {
// Whether a non-default embedder was specified // Whether a non-default embedder was specified
embedder: bool, embedder: bool,
retrieve_vectors: bool,
// pagination // pagination
max_limit: usize, max_limit: usize,
@ -1646,6 +1664,7 @@ impl SimilarAggregator {
offset, offset,
limit, limit,
attributes_to_retrieve: _, attributes_to_retrieve: _,
retrieve_vectors,
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
filter, filter,
@ -1690,6 +1709,7 @@ impl SimilarAggregator {
ret.ranking_score_threshold = ranking_score_threshold.is_some(); ret.ranking_score_threshold = ranking_score_threshold.is_some();
ret.embedder = embedder.is_some(); ret.embedder = embedder.is_some();
ret.retrieve_vectors = *retrieve_vectors;
ret ret
} }
@ -1722,6 +1742,7 @@ impl SimilarAggregator {
show_ranking_score_details, show_ranking_score_details,
embedder, embedder,
ranking_score_threshold, ranking_score_threshold,
retrieve_vectors,
} = other; } = other;
if self.timestamp.is_none() { if self.timestamp.is_none() {
@ -1751,6 +1772,7 @@ impl SimilarAggregator {
} }
self.embedder |= embedder; self.embedder |= embedder;
self.retrieve_vectors |= retrieve_vectors;
// pagination // pagination
self.max_limit = self.max_limit.max(max_limit); self.max_limit = self.max_limit.max(max_limit);
@ -1785,6 +1807,7 @@ impl SimilarAggregator {
show_ranking_score_details, show_ranking_score_details,
embedder, embedder,
ranking_score_threshold, ranking_score_threshold,
retrieve_vectors,
} = self; } = self;
if total_received == 0 { if total_received == 0 {
@ -1811,6 +1834,9 @@ impl SimilarAggregator {
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
}, },
"vector": {
"retrieve_vectors": retrieve_vectors,
},
"hybrid": { "hybrid": {
"embedder": embedder, "embedder": embedder,
}, },

View File

@ -16,6 +16,7 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::heed::RoTxn; use meilisearch_types::heed::RoTxn;
use meilisearch_types::index_uid::IndexUid; use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::update::IndexDocumentsMethod;
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
use meilisearch_types::milli::DocumentId; use meilisearch_types::milli::DocumentId;
use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::star_or::OptionStarOrList;
use meilisearch_types::tasks::KindWithContent; use meilisearch_types::tasks::KindWithContent;
@ -39,7 +40,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::{ use crate::routes::{
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
}; };
use crate::search::parse_filter; use crate::search::{parse_filter, RetrieveVectors};
use crate::Opt; use crate::Opt;
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| { static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
@ -94,6 +95,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
pub struct GetDocument { pub struct GetDocument {
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)] #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
fields: OptionStarOrList<String>, fields: OptionStarOrList<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: Param<bool>,
} }
pub async fn get_document( pub async fn get_document(
@ -107,13 +110,20 @@ pub async fn get_document(
debug!(parameters = ?params, "Get document"); debug!(parameters = ?params, "Get document");
let index_uid = IndexUid::try_from(index_uid)?; let index_uid = IndexUid::try_from(index_uid)?;
analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req); let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner();
let GetDocument { fields } = params.into_inner();
let attributes_to_retrieve = fields.merge_star_and_none(); let attributes_to_retrieve = fields.merge_star_and_none();
let features = index_scheduler.features();
let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
analytics.get_fetch_documents(
&DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 },
&req,
);
let index = index_scheduler.index(&index_uid)?; let index = index_scheduler.index(&index_uid)?;
let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?; let document =
retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?;
debug!(returns = ?document, "Get document"); debug!(returns = ?document, "Get document");
Ok(HttpResponse::Ok().json(document)) Ok(HttpResponse::Ok().json(document))
} }
@ -153,6 +163,8 @@ pub struct BrowseQueryGet {
limit: Param<usize>, limit: Param<usize>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)] #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
fields: OptionStarOrList<String>, fields: OptionStarOrList<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)] #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)]
filter: Option<String>, filter: Option<String>,
} }
@ -166,6 +178,8 @@ pub struct BrowseQuery {
limit: usize, limit: usize,
#[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)] #[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)]
fields: Option<Vec<String>>, fields: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)] #[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
filter: Option<Value>, filter: Option<Value>,
} }
@ -185,6 +199,7 @@ pub async fn documents_by_query_post(
with_filter: body.filter.is_some(), with_filter: body.filter.is_some(),
limit: body.limit, limit: body.limit,
offset: body.offset, offset: body.offset,
retrieve_vectors: body.retrieve_vectors,
}, },
&req, &req,
); );
@ -201,7 +216,7 @@ pub async fn get_documents(
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?params, "Get documents GET"); debug!(parameters = ?params, "Get documents GET");
let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner(); let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner();
let filter = match filter { let filter = match filter {
Some(f) => match serde_json::from_str(&f) { Some(f) => match serde_json::from_str(&f) {
@ -215,6 +230,7 @@ pub async fn get_documents(
offset: offset.0, offset: offset.0,
limit: limit.0, limit: limit.0,
fields: fields.merge_star_and_none(), fields: fields.merge_star_and_none(),
retrieve_vectors: retrieve_vectors.0,
filter, filter,
}; };
@ -223,6 +239,7 @@ pub async fn get_documents(
with_filter: query.filter.is_some(), with_filter: query.filter.is_some(),
limit: query.limit, limit: query.limit,
offset: query.offset, offset: query.offset,
retrieve_vectors: query.retrieve_vectors,
}, },
&req, &req,
); );
@ -236,10 +253,14 @@ fn documents_by_query(
query: BrowseQuery, query: BrowseQuery,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let BrowseQuery { offset, limit, fields, filter } = query; let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query;
let features = index_scheduler.features();
let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?;
let index = index_scheduler.index(&index_uid)?; let index = index_scheduler.index(&index_uid)?;
let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?; let (total, documents) =
retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?;
let ret = PaginationView::new(offset, limit, total as usize, documents); let ret = PaginationView::new(offset, limit, total as usize, documents);
@ -579,13 +600,44 @@ fn some_documents<'a, 't: 'a>(
index: &'a Index, index: &'a Index,
rtxn: &'t RoTxn, rtxn: &'t RoTxn,
doc_ids: impl IntoIterator<Item = DocumentId> + 'a, doc_ids: impl IntoIterator<Item = DocumentId> + 'a,
retrieve_vectors: RetrieveVectors,
) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> { ) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> {
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(rtxn)?;
Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| {
ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> { ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> {
Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?) let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
match retrieve_vectors {
RetrieveVectors::Ignore => {}
RetrieveVectors::Hide => {
document.remove("_vectors");
}
RetrieveVectors::Retrieve => {
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, key)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(key));
let embeddings = ExplicitVectors {
embeddings: Some(vector.into()),
regenerate: !user_provided,
};
vectors.insert(
name,
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
);
}
document.insert("_vectors".into(), vectors.into());
}
}
Ok(document)
}) })
})) }))
} }
@ -596,6 +648,7 @@ fn retrieve_documents<S: AsRef<str>>(
limit: usize, limit: usize,
filter: Option<Value>, filter: Option<Value>,
attributes_to_retrieve: Option<Vec<S>>, attributes_to_retrieve: Option<Vec<S>>,
retrieve_vectors: RetrieveVectors,
) -> Result<(u64, Vec<Document>), ResponseError> { ) -> Result<(u64, Vec<Document>), ResponseError> {
let rtxn = index.read_txn()?; let rtxn = index.read_txn()?;
let filter = &filter; let filter = &filter;
@ -620,53 +673,57 @@ fn retrieve_documents<S: AsRef<str>>(
let (it, number_of_documents) = { let (it, number_of_documents) = {
let number_of_documents = candidates.len(); let number_of_documents = candidates.len();
( (
some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?, some_documents(
index,
&rtxn,
candidates.into_iter().skip(offset).take(limit),
retrieve_vectors,
)?,
number_of_documents, number_of_documents,
) )
}; };
let documents: Result<Vec<_>, ResponseError> = it let documents: Vec<_> = it
.map(|document| { .map(|document| {
Ok(match &attributes_to_retrieve { Ok(match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values( Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document?, &document?,
attributes_to_retrieve.iter().map(|s| s.as_ref()), attributes_to_retrieve.iter().map(|s| s.as_ref()).chain(
(retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"),
),
), ),
None => document?, None => document?,
}) })
}) })
.collect(); .collect::<Result<_, ResponseError>>()?;
Ok((number_of_documents, documents?)) Ok((number_of_documents, documents))
} }
fn retrieve_document<S: AsRef<str>>( fn retrieve_document<S: AsRef<str>>(
index: &Index, index: &Index,
doc_id: &str, doc_id: &str,
attributes_to_retrieve: Option<Vec<S>>, attributes_to_retrieve: Option<Vec<S>>,
retrieve_vectors: RetrieveVectors,
) -> Result<Document, ResponseError> { ) -> Result<Document, ResponseError> {
let txn = index.read_txn()?; let txn = index.read_txn()?;
let fields_ids_map = index.fields_ids_map(&txn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let internal_id = index let internal_id = index
.external_documents_ids() .external_documents_ids()
.get(&txn, doc_id)? .get(&txn, doc_id)?
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
let document = index let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)?
.documents(&txn, std::iter::once(internal_id))?
.into_iter()
.next() .next()
.map(|(_, d)| d) .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??;
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
let document = match &attributes_to_retrieve { let document = match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values( Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document, &document,
attributes_to_retrieve.iter().map(|s| s.as_ref()), attributes_to_retrieve
.iter()
.map(|s| s.as_ref())
.chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")),
), ),
None => document, None => document,
}; };

View File

@ -115,6 +115,7 @@ impl From<FacetSearchQuery> for SearchQuery {
page: None, page: None,
hits_per_page: None, hits_per_page: None,
attributes_to_retrieve: None, attributes_to_retrieve: None,
retrieve_vectors: false,
attributes_to_crop: None, attributes_to_crop: None,
crop_length: DEFAULT_CROP_LENGTH(), crop_length: DEFAULT_CROP_LENGTH(),
attributes_to_highlight: None, attributes_to_highlight: None,

View File

@ -20,9 +20,9 @@ use crate::extractors::sequential_extractor::SeqHandler;
use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
use crate::search::{ use crate::search::{
add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH,
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
}; };
use crate::search_queue::SearchQueue; use crate::search_queue::SearchQueue;
@ -51,6 +51,8 @@ pub struct SearchQueryGet {
hits_per_page: Option<Param<usize>>, hits_per_page: Option<Param<usize>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToRetrieve>)] #[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToRetrieve>)]
attributes_to_retrieve: Option<CS<String>>, attributes_to_retrieve: Option<CS<String>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToCrop>)] #[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToCrop>)]
attributes_to_crop: Option<CS<String>>, attributes_to_crop: Option<CS<String>>,
#[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError<InvalidSearchCropLength>)] #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError<InvalidSearchCropLength>)]
@ -153,6 +155,7 @@ impl From<SearchQueryGet> for SearchQuery {
page: other.page.as_deref().copied(), page: other.page.as_deref().copied(),
hits_per_page: other.hits_per_page.as_deref().copied(), hits_per_page: other.hits_per_page.as_deref().copied(),
attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()), attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()),
retrieve_vectors: other.retrieve_vectors.0,
attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()), attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()),
crop_length: other.crop_length.0, crop_length: other.crop_length.0,
attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()),
@ -222,10 +225,12 @@ pub async fn search_with_url_query(
let features = index_scheduler.features(); let features = index_scheduler.features();
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?;
let _permit = search_queue.try_get_search_permit().await?; let _permit = search_queue.try_get_search_permit().await?;
let search_result = let search_result = tokio::task::spawn_blocking(move || {
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; perform_search(&index, query, search_kind, retrieve_vector)
})
.await?;
if let Ok(ref search_result) = search_result { if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result); aggregate.succeed(search_result);
} }
@ -262,10 +267,13 @@ pub async fn search_with_post(
let features = index_scheduler.features(); let features = index_scheduler.features();
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
let _permit = search_queue.try_get_search_permit().await?; let _permit = search_queue.try_get_search_permit().await?;
let search_result = let search_result = tokio::task::spawn_blocking(move || {
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; perform_search(&index, query, search_kind, retrieve_vectors)
})
.await?;
if let Ok(ref search_result) = search_result { if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result); aggregate.succeed(search_result);
if search_result.degraded { if search_result.degraded {
@ -287,11 +295,10 @@ pub fn search_kind(
features: RoFeatures, features: RoFeatures,
) -> Result<SearchKind, ResponseError> { ) -> Result<SearchKind, ResponseError> {
if query.vector.is_some() { if query.vector.is_some() {
features.check_vector("Passing `vector` as a query parameter")?; features.check_vector("Passing `vector` as a parameter")?;
} }
if query.hybrid.is_some() { if query.hybrid.is_some() {
features.check_vector("Passing `hybrid` as a query parameter")?; features.check_vector("Passing `hybrid` as a parameter")?;
} }
// regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing

View File

@ -4,11 +4,7 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter};
use index_scheduler::IndexScheduler; use index_scheduler::IndexScheduler;
use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::query_params::Param;
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
use meilisearch_types::error::deserr_codes::{ use meilisearch_types::error::deserr_codes::*;
InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId,
InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarRankingScoreThreshold,
InvalidSimilarShowRankingScore, InvalidSimilarShowRankingScoreDetails,
};
use meilisearch_types::error::{ErrorCode as _, ResponseError}; use meilisearch_types::error::{ErrorCode as _, ResponseError};
use meilisearch_types::index_uid::IndexUid; use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::keys::actions; use meilisearch_types::keys::actions;
@ -21,8 +17,8 @@ use crate::analytics::{Analytics, SimilarAggregator};
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
use crate::search::{ use crate::search::{
add_search_rules, perform_similar, RankingScoreThresholdSimilar, SearchKind, SimilarQuery, add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind,
SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
}; };
pub fn configure(cfg: &mut web::ServiceConfig) { pub fn configure(cfg: &mut web::ServiceConfig) {
@ -97,6 +93,8 @@ async fn similar(
features.check_vector("Using the similar API")?; features.check_vector("Using the similar API")?;
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
// Tenant token search_rules. // Tenant token search_rules.
if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) { if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) {
add_search_rules(&mut query.filter, search_rules); add_search_rules(&mut query.filter, search_rules);
@ -107,8 +105,10 @@ async fn similar(
let (embedder_name, embedder) = let (embedder_name, embedder) =
SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?; SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?;
tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder)) tokio::task::spawn_blocking(move || {
.await? perform_similar(&index, query, embedder_name, embedder, retrieve_vectors)
})
.await?
} }
#[derive(Debug, deserr::Deserr)] #[derive(Debug, deserr::Deserr)]
@ -122,6 +122,8 @@ pub struct SimilarQueryGet {
limit: Param<usize>, limit: Param<usize>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarAttributesToRetrieve>)] #[deserr(default, error = DeserrQueryParamError<InvalidSimilarAttributesToRetrieve>)]
attributes_to_retrieve: Option<CS<String>>, attributes_to_retrieve: Option<CS<String>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarFilter>)] #[deserr(default, error = DeserrQueryParamError<InvalidSimilarFilter>)]
filter: Option<String>, filter: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScore>)] #[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScore>)]
@ -156,6 +158,7 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
offset, offset,
limit, limit,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
filter, filter,
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
@ -180,6 +183,7 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
filter, filter,
embedder, embedder,
attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()), attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()),
retrieve_vectors: retrieve_vectors.0,
show_ranking_score: show_ranking_score.0, show_ranking_score: show_ranking_score.0,
show_ranking_score_details: show_ranking_score_details.0, show_ranking_score_details: show_ranking_score_details.0,
ranking_score_threshold: ranking_score_threshold.map(|x| x.0), ranking_score_threshold: ranking_score_threshold.map(|x| x.0),

View File

@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler; use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::indexes::search::search_kind; use crate::routes::indexes::search::search_kind;
use crate::search::{ use crate::search::{
add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex, add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex,
}; };
use crate::search_queue::SearchQueue; use crate::search_queue::SearchQueue;
@ -83,11 +83,14 @@ pub async fn multi_search_with_post(
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features) let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)
.with_index(query_index)?; .with_index(query_index)?;
let retrieve_vector =
RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?;
let search_result = let search_result = tokio::task::spawn_blocking(move || {
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)) perform_search(&index, query, search_kind, retrieve_vector)
.await })
.with_index(query_index)?; .await
.with_index(query_index)?;
search_results.push(SearchResultWithIndex { search_results.push(SearchResultWithIndex {
index_uid: index_uid.into_inner(), index_uid: index_uid.into_inner(),

View File

@ -15,6 +15,7 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::heed::RoTxn; use meilisearch_types::heed::RoTxn;
use meilisearch_types::index_uid::IndexUid; use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::vector::Embedder;
use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget};
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
@ -59,6 +60,8 @@ pub struct SearchQuery {
pub hits_per_page: Option<usize>, pub hits_per_page: Option<usize>,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>, pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
pub attributes_to_crop: Option<Vec<String>>, pub attributes_to_crop: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())] #[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
@ -141,6 +144,7 @@ impl fmt::Debug for SearchQuery {
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
attributes_to_highlight, attributes_to_highlight,
@ -173,6 +177,9 @@ impl fmt::Debug for SearchQuery {
if let Some(q) = q { if let Some(q) = q {
debug.field("q", &q); debug.field("q", &q);
} }
if *retrieve_vectors {
debug.field("retrieve_vectors", &retrieve_vectors);
}
if let Some(v) = vector { if let Some(v) = vector {
if v.len() < 10 { if v.len() < 10 {
debug.field("vector", &v); debug.field("vector", &v);
@ -370,6 +377,8 @@ pub struct SearchQueryWithIndex {
pub hits_per_page: Option<usize>, pub hits_per_page: Option<usize>,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>, pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)] #[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
pub attributes_to_crop: Option<Vec<String>>, pub attributes_to_crop: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())] #[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
@ -413,6 +422,7 @@ impl SearchQueryWithIndex {
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
attributes_to_highlight, attributes_to_highlight,
@ -440,6 +450,7 @@ impl SearchQueryWithIndex {
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
attributes_to_highlight, attributes_to_highlight,
@ -478,6 +489,8 @@ pub struct SimilarQuery {
pub embedder: Option<String>, pub embedder: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSimilarAttributesToRetrieve>)] #[deserr(default, error = DeserrJsonError<InvalidSimilarAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>, pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSimilarRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScore>, default)] #[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScore>, default)]
pub show_ranking_score: bool, pub show_ranking_score: bool,
#[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScoreDetails>, default)] #[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScoreDetails>, default)]
@ -810,6 +823,7 @@ pub fn perform_search(
index: &Index, index: &Index,
query: SearchQuery, query: SearchQuery,
search_kind: SearchKind, search_kind: SearchKind,
retrieve_vectors: RetrieveVectors,
) -> Result<SearchResult, MeilisearchHttpError> { ) -> Result<SearchResult, MeilisearchHttpError> {
let before_search = Instant::now(); let before_search = Instant::now();
let rtxn = index.read_txn()?; let rtxn = index.read_txn()?;
@ -847,6 +861,8 @@ pub fn perform_search(
page, page,
hits_per_page, hits_per_page,
attributes_to_retrieve, attributes_to_retrieve,
// use the enum passed as parameter
retrieve_vectors: _,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
attributes_to_highlight, attributes_to_highlight,
@ -870,6 +886,7 @@ pub fn perform_search(
let format = AttributesFormat { let format = AttributesFormat {
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_highlight, attributes_to_highlight,
attributes_to_crop, attributes_to_crop,
crop_length, crop_length,
@ -953,6 +970,7 @@ pub fn perform_search(
struct AttributesFormat { struct AttributesFormat {
attributes_to_retrieve: Option<BTreeSet<String>>, attributes_to_retrieve: Option<BTreeSet<String>>,
retrieve_vectors: RetrieveVectors,
attributes_to_highlight: Option<HashSet<String>>, attributes_to_highlight: Option<HashSet<String>>,
attributes_to_crop: Option<Vec<String>>, attributes_to_crop: Option<Vec<String>>,
crop_length: usize, crop_length: usize,
@ -965,6 +983,36 @@ struct AttributesFormat {
show_ranking_score_details: bool, show_ranking_score_details: bool,
} }
/// What to do with the `_vectors` field of the documents returned to the user.
///
/// Built by [`RetrieveVectors::new`] from the `retrieveVectors` request flag and
/// the state of the vector store feature.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RetrieveVectors {
/// Do not touch the `_vectors` field.
///
/// This is the behavior when the vectorStore feature is disabled
/// (and `retrieveVectors` is `false`).
Ignore,
/// Remove the `_vectors` field from the returned documents.
///
/// This is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false`.
Hide,
/// Retrieve vectors from the DB and merge them into the `_vectors` field.
///
/// This is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true`.
Retrieve,
}
impl RetrieveVectors {
    /// Decides how `_vectors` must be handled for a request.
    ///
    /// * `retrieve_vector` - value of the `retrieveVectors` request parameter.
    /// * `features` - runtime features, used to check whether the vector store is enabled.
    ///
    /// Returns `Retrieve` or `Hide` when the vector store is enabled (depending on
    /// `retrieve_vector`), and `Ignore` when it is disabled and vectors were not requested.
    ///
    /// # Errors
    ///
    /// Forwards the feature-check error when vectors are explicitly requested while the
    /// vector store feature is disabled.
    pub fn new(
        retrieve_vector: bool,
        features: index_scheduler::RoFeatures,
    ) -> Result<Self, index_scheduler::Error> {
        let feature_check = features.check_vector("Passing `retrieveVectors` as a parameter");

        if retrieve_vector {
            // Asking for vectors requires the feature: surface the error as-is otherwise.
            return feature_check.map(|()| Self::Retrieve);
        }

        // Vectors were not requested: a disabled feature is not an error here,
        // it only changes whether `_vectors` is stripped or left untouched.
        Ok(if feature_check.is_ok() { Self::Hide } else { Self::Ignore })
    }
}
fn make_hits( fn make_hits(
index: &Index, index: &Index,
rtxn: &RoTxn<'_>, rtxn: &RoTxn<'_>,
@ -974,10 +1022,32 @@ fn make_hits(
document_scores: Vec<Vec<ScoreDetails>>, document_scores: Vec<Vec<ScoreDetails>>,
) -> Result<Vec<SearchHit>, MeilisearchHttpError> { ) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
let fields_ids_map = index.fields_ids_map(rtxn).unwrap(); let fields_ids_map = index.fields_ids_map(rtxn).unwrap();
let displayed_ids = index let displayed_ids =
.displayed_fields_ids(rtxn)? index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>());
.map(|fields| fields.into_iter().collect::<BTreeSet<_>>())
.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
// Whether the displayed-fields configuration prevents `_vectors` from being shown.
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
    // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
    (None, _) => false,
    // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
    (Some(_), None) => true,
    // displayed_ids is a finite list, so hide if and only if `_vectors` is NOT part of it
    // (the check was previously inverted and hid `_vectors` exactly when it was displayed)
    (Some(map), Some(vectors_fid)) => !map.contains(&vectors_fid),
};
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
if vectors_is_hidden {
RetrieveVectors::Hide
} else {
RetrieveVectors::Retrieve
}
} else {
format.retrieve_vectors
};
let displayed_ids =
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
let fids = |attrs: &BTreeSet<String>| { let fids = |attrs: &BTreeSet<String>| {
let mut ids = BTreeSet::new(); let mut ids = BTreeSet::new();
for attr in attrs { for attr in attrs {
@ -1000,6 +1070,7 @@ fn make_hits(
.intersection(&displayed_ids) .intersection(&displayed_ids)
.cloned() .cloned()
.collect(); .collect();
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
let formatted_options = compute_formatted_options( let formatted_options = compute_formatted_options(
@ -1033,18 +1104,48 @@ fn make_hits(
formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_prefix(format.highlight_pre_tag);
formatter_builder.highlight_suffix(format.highlight_post_tag); formatter_builder.highlight_suffix(format.highlight_post_tag);
let mut documents = Vec::new(); let mut documents = Vec::new();
let embedding_configs = index.embedding_configs(rtxn)?;
let documents_iter = index.documents(rtxn, documents_ids)?; let documents_iter = index.documents(rtxn, documents_ids)?;
for ((_id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
// First generate a document with all the displayed fields // First generate a document with all the displayed fields
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;
let add_vectors_fid =
vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve);
// select the attributes to retrieve // select the attributes to retrieve
let attributes_to_retrieve = to_retrieve_ids let attributes_to_retrieve = to_retrieve_ids
.iter() .iter()
// skip the vectors_fid if RetrieveVectors::Hide
.filter(|fid| match vectors_fid {
Some(vectors_fid) => {
!(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
}
None => true,
})
// need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve`
.chain(add_vectors_fid.iter())
.map(|&fid| fields_ids_map.name(fid).expect("Missing field name")); .map(|&fid| fields_ids_map.name(fid).expect("Missing field name"));
let mut document = let mut document =
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
if retrieve_vectors == RetrieveVectors::Retrieve {
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, id)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings =
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
vectors.insert(name, serde_json::to_value(embeddings)?);
}
document.insert("_vectors".into(), vectors.into());
}
let (matches_position, formatted) = format_fields( let (matches_position, formatted) = format_fields(
&displayed_document, &displayed_document,
&fields_ids_map, &fields_ids_map,
@ -1114,6 +1215,7 @@ pub fn perform_similar(
query: SimilarQuery, query: SimilarQuery,
embedder_name: String, embedder_name: String,
embedder: Arc<Embedder>, embedder: Arc<Embedder>,
retrieve_vectors: RetrieveVectors,
) -> Result<SimilarResult, ResponseError> { ) -> Result<SimilarResult, ResponseError> {
let before_search = Instant::now(); let before_search = Instant::now();
let rtxn = index.read_txn()?; let rtxn = index.read_txn()?;
@ -1125,6 +1227,7 @@ pub fn perform_similar(
filter: _, filter: _,
embedder: _, embedder: _,
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors: _,
show_ranking_score, show_ranking_score,
show_ranking_score_details, show_ranking_score_details,
ranking_score_threshold, ranking_score_threshold,
@ -1171,6 +1274,7 @@ pub fn perform_similar(
let format = AttributesFormat { let format = AttributesFormat {
attributes_to_retrieve, attributes_to_retrieve,
retrieve_vectors,
attributes_to_highlight: None, attributes_to_highlight: None,
attributes_to_crop: None, attributes_to_crop: None,
crop_length: DEFAULT_CROP_LENGTH(), crop_length: DEFAULT_CROP_LENGTH(),

View File

@ -182,14 +182,10 @@ impl Index<'_> {
self.service.get(url).await self.service.get(url).await
} }
pub async fn get_document( pub async fn get_document(&self, id: u64, options: Option<Value>) -> (Value, StatusCode) {
&self,
id: u64,
options: Option<GetDocumentOptions>,
) -> (Value, StatusCode) {
let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id); let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id);
if let Some(fields) = options.and_then(|o| o.fields) { if let Some(options) = options {
let _ = write!(url, "?fields={}", fields.join(",")); write!(url, "?{}", yaup::to_string(&options).unwrap()).unwrap();
} }
self.service.get(url).await self.service.get(url).await
} }
@ -205,18 +201,11 @@ impl Index<'_> {
} }
pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) { pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) {
let mut url = format!("/indexes/{}/documents?", urlencode(self.uid.as_ref())); let url = format!(
if let Some(limit) = options.limit { "/indexes/{}/documents?{}",
let _ = write!(url, "limit={}&", limit); urlencode(self.uid.as_ref()),
} yaup::to_string(&options).unwrap()
);
if let Some(offset) = options.offset {
let _ = write!(url, "offset={}&", offset);
}
if let Some(attributes_to_retrieve) = options.attributes_to_retrieve {
let _ = write!(url, "fields={}&", attributes_to_retrieve.join(","));
}
self.service.get(url).await self.service.get(url).await
} }
@ -435,13 +424,11 @@ impl Index<'_> {
} }
} }
pub struct GetDocumentOptions { #[derive(Debug, Default, serde::Serialize)]
pub fields: Option<Vec<&'static str>>, #[serde(rename_all = "camelCase")]
}
#[derive(Debug, Default)]
pub struct GetAllDocumentsOptions { pub struct GetAllDocumentsOptions {
pub limit: Option<usize>, pub limit: Option<usize>,
pub offset: Option<usize>, pub offset: Option<usize>,
pub attributes_to_retrieve: Option<Vec<&'static str>>, pub retrieve_vectors: bool,
pub fields: Option<Vec<&'static str>>,
} }

View File

@ -6,7 +6,7 @@ pub mod service;
use std::fmt::{self, Display}; use std::fmt::{self, Display};
#[allow(unused)] #[allow(unused)]
pub use index::{GetAllDocumentsOptions, GetDocumentOptions}; pub use index::GetAllDocumentsOptions;
use meili_snap::json_string; use meili_snap::json_string;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
#[allow(unused)] #[allow(unused)]

View File

@ -795,3 +795,70 @@ async fn fetch_document_by_filter() {
} }
"###); "###);
} }
// Checks the error responses of the three document-fetching routes for the
// `retrieveVectors` parameter: a non-boolean value is rejected as invalid, and
// `true` is rejected because this test never enables the vector store feature.
// Only the response bodies are snapshotted; the status codes are not asserted.
#[actix_rt::test]
async fn retrieve_vectors() {
let server = Server::new().await;
let index = server.index("doggo");
// Get all documents, with the options passed as URL query parameters
let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
// Fetch all documents by POST, with the options passed as a JSON body
let (response, _code) =
index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
// Get a single document, with the options passed as URL query parameters
let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
}

View File

@ -4,7 +4,7 @@ use meili_snap::*;
use urlencoding::encode as urlencode; use urlencoding::encode as urlencode;
use crate::common::encoder::Encoder; use crate::common::encoder::Encoder;
use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value}; use crate::common::{GetAllDocumentsOptions, Server, Value};
use crate::json; use crate::json;
// TODO: partial test since we are testing error, amd error is not yet fully implemented in // TODO: partial test since we are testing error, amd error is not yet fully implemented in
@ -59,8 +59,7 @@ async fn get_document() {
}) })
); );
let (response, code) = let (response, code) = index.get_document(0, Some(json!({ "fields": ["id"] }))).await;
index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["id"]) })).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -69,9 +68,8 @@ async fn get_document() {
}) })
); );
let (response, code) = index let (response, code) =
.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["nested.content"]) })) index.get_document(0, Some(json!({ "fields": ["nested.content"] }))).await;
.await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -211,7 +209,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec!["name"]), fields: Some(vec!["name"]),
..Default::default() ..Default::default()
}) })
.await; .await;
@ -225,9 +223,19 @@ async fn test_get_all_documents_attributes_to_retrieve() {
assert_eq!(response["limit"], json!(20)); assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77)); assert_eq!(response["total"], json!(77));
let (response, code) = index.get_all_documents_raw("?fields=").await;
assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20);
for results in response["results"].as_array().unwrap() {
assert_eq!(results.as_object().unwrap().keys().count(), 0);
}
assert_eq!(response["offset"], json!(0));
assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77));
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec![]), fields: Some(vec!["wrong"]),
..Default::default() ..Default::default()
}) })
.await; .await;
@ -242,22 +250,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec!["wrong"]), fields: Some(vec!["name", "tags"]),
..Default::default()
})
.await;
assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20);
for results in response["results"].as_array().unwrap() {
assert_eq!(results.as_object().unwrap().keys().count(), 0);
}
assert_eq!(response["offset"], json!(0));
assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77));
let (response, code) = index
.get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec!["name", "tags"]),
..Default::default() ..Default::default()
}) })
.await; .await;
@ -270,10 +263,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {
} }
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions { fields: Some(vec!["*"]), ..Default::default() })
attributes_to_retrieve: Some(vec!["*"]),
..Default::default()
})
.await; .await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20); assert_eq!(response["results"].as_array().unwrap().len(), 20);
@ -283,7 +273,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { .get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec!["*", "wrong"]), fields: Some(vec!["*", "wrong"]),
..Default::default() ..Default::default()
}) })
.await; .await;
@ -316,12 +306,10 @@ async fn get_document_s_nested_attributes_to_retrieve() {
assert_eq!(code, 202); assert_eq!(code, 202);
index.wait_task(1).await; index.wait_task(1).await;
let (response, code) = let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await;
index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!(response, json!({})); assert_eq!(response, json!({}));
let (response, code) = let (response, code) = index.get_document(1, Some(json!({ "fields": ["content"] }))).await;
index.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -333,9 +321,7 @@ async fn get_document_s_nested_attributes_to_retrieve() {
}) })
); );
let (response, code) = index let (response, code) = index.get_document(0, Some(json!({ "fields": ["content.truc"] }))).await;
.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) }))
.await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -343,9 +329,7 @@ async fn get_document_s_nested_attributes_to_retrieve() {
"content.truc": "foobar", "content.truc": "foobar",
}) })
); );
let (response, code) = index let (response, code) = index.get_document(1, Some(json!({ "fields": ["content.truc"] }))).await;
.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) }))
.await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
response, response,
@ -540,3 +524,207 @@ async fn get_document_by_filter() {
} }
"###); "###);
} }
/// Checks when the `_vectors` object is exposed by the document routes.
///
/// The test enables the `vectorStore` feature, declares a `userProvided`
/// embedder named `manual` (3 dimensions) and indexes two documents: one with
/// an explicit embedding and one with `"manual": null`. It then snapshots the
/// output of both the "get all documents" and "get one document" routes:
/// - with default options: `_vectors` is hidden;
/// - with `_vectors` listed in `fields`: `_vectors` is still hidden;
/// - with `retrieveVectors: true`: `_vectors` is returned;
/// - with `retrieveVectors: true` and `fields` not listing `_vectors`:
///   `_vectors` is still returned.
#[actix_rt::test]
async fn get_document_with_vectors() {
let server = Server::new().await;
let index = server.index("doggo");
// Turn on the `vectorStore` feature before configuring any embedder.
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
// Declare a single embedder whose vectors are supplied by the user.
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
// Document 0 carries an embedding, document 1 explicitly provides `null`.
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
// by default you shouldn't see the `_vectors` object
let (documents, _code) = index.get_all_documents(Default::default()).await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir"
},
{
"id": 1,
"name": "echo"
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) = index.get_document(0, None).await;
snapshot!(json_string!(documents), @r###"
{
"id": 0,
"name": "kefir"
}
"###);
// if we try to retrieve the vectors with the `fields` parameter they
// still shouldn't be displayed
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["name", "_vectors"]),
..Default::default()
})
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"name": "kefir"
},
{
"name": "echo"
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) =
index.get_document(0, Some(json!({"fields": ["name", "_vectors"]}))).await;
snapshot!(json_string!(documents), @r###"
{
"name": "kefir"
}
"###);
// If we specify the retrieve vectors boolean and nothing else we should get the vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await;
snapshot!(json_string!(documents), @r###"
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
}
"###);
// If we specify the retrieve vectors boolean and exclude vectors from the `fields` we should still get the vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
retrieve_vectors: true,
fields: Some(vec!["name"]),
..Default::default()
})
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"name": "echo",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) =
index.get_document(0, Some(json!({"retrieveVectors": true, "fields": ["name"]}))).await;
snapshot!(json_string!(documents), @r###"
{
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
}
"###);
}

View File

@ -1938,3 +1938,210 @@ async fn import_dump_v6_containing_experimental_features() {
}) })
.await; .await;
} }
// In this test we must generate the dump ourselves to ensure the
// `user provided` vectors are well set
//
// Steps:
// 1. start a server, enable the `vectorStore` feature and configure a
//    `huggingFace` embedder named `doggo_embedder`;
// 2. index five documents covering the different `_vectors` shapes (raw
//    array, `regenerate: false` with embeddings, `regenerate: true` with and
//    without embeddings, and no `_vectors` at all);
// 3. create a dump, drop the server and wipe its database;
// 4. restart from the dump and snapshot the features, the settings and the
//    per-document `regenerate` flags returned by a search with
//    `retrieveVectors: true`.
#[actix_rt::test]
#[cfg_attr(target_os = "windows", ignore)]
async fn generate_and_import_dump_containing_vectors() {
let temp = tempfile::tempdir().unwrap();
let mut opt = default_settings(temp.path());
let server = Server::new_with_options(opt.clone()).await.unwrap();
// `set_features` yields `(body, status code)`: bind both under their real
// names and check the status too instead of discarding it.
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let index = server.index("pets");
let (response, code) = index
.update_settings(json!(
{
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}",
}
}
}
))
.await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response);
// One document per supported way of providing (or not providing) vectors.
let (response, code) = index
.add_documents(
json!([
{"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }},
{"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "regenerate": false, "embeddings": vec![1; 384] }}},
{"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "regenerate": true, "embeddings": vec![2; 384] }}},
{"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "regenerate": true }}},
{"id": 4, "doggo": "max" },
]),
None,
)
.await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response);
let (response, code) = server.create_dump().await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response["status"], @r###""succeeded""###);
// ========= We made a dump, now we should clear the DB and try to import our dump
drop(server);
tokio::fs::remove_dir_all(&opt.db_path).await.unwrap();
let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap());
let dump_path = opt.dump_dir.join(dump_name);
assert!(dump_path.exists(), "path: `{}`", dump_path.display());
opt.import_dump = Some(dump_path);
// NOTE: We shouldn't have to change the database path but I lost one hour
// because of a « bad path » error and that fixed it.
opt.db_path = temp.path().join("data.ms");
let mut server = Server::new_auth_with_options(opt, temp).await;
server.use_api_key("MASTER_KEY");
let (indexes, code) = server.list_indexes(None, None).await;
assert_eq!(code, 200, "{indexes}");
snapshot!(indexes["results"].as_array().unwrap().len(), @"1");
snapshot!(indexes["results"][0]["uid"], @r###""pets""###);
snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###);
let (response, code) = server.get_features().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let index = server.index("pets");
let (response, code) = index.settings().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"words",
"typo",
"proximity",
"attribute",
"sort",
"exactness"
],
"stopWords": [],
"nonSeparatorTokens": [],
"separatorTokens": [],
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
"oneTypo": 5,
"twoTypos": 9
},
"disableOnWords": [],
"disableOnAttributes": []
},
"faceting": {
"maxValuesPerFacet": 100,
"sortFacetValuesBy": {
"*": "alpha"
}
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}"
}
},
"searchCutoffMs": null
}
"###);
// The embeddings themselves are redacted to `[vector]`; only the
// `regenerate` flag of each document is compared.
index
.search(json!({"retrieveVectors": true}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###"
[
{
"id": 0,
"doggo": "kefir",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": false
}
}
},
{
"id": 1,
"doggo": "echo",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": false
}
}
},
{
"id": 2,
"doggo": "intel",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
},
{
"id": 3,
"doggo": "bill",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
},
{
"id": 4,
"doggo": "max",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
}
]
"###);
})
.await;
}

View File

@ -0,0 +1,25 @@
---
source: meilisearch/tests/dumps/mod.rs
---
{
"uid": 0,
"indexUid": "pets",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}"
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -0,0 +1,19 @@
---
source: meilisearch/tests/dumps/mod.rs
---
{
"uid": 1,
"indexUid": "pets",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 5,
"indexedDocuments": 5
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -13,6 +13,7 @@ mod snapshot;
mod stats; mod stats;
mod swap_indexes; mod swap_indexes;
mod tasks; mod tasks;
mod vector;
// Tests are isolated by features in different modules to allow better readability, test // Tests are isolated by features in different modules to allow better readability, test
// targetability, and improved incremental compilation times. // targetability, and improved incremental compilation times.

View File

@ -167,6 +167,74 @@ async fn search_bad_hits_per_page() {
"###); "###);
} }
/// A POST search whose `attributesToRetrieve` is a string instead of an array
/// is rejected with `invalid_search_attributes_to_retrieve`.
#[actix_rt::test]
async fn search_bad_attributes_to_retrieve() {
let server = Server::new().await;
let index = server.index("test");
// A bare string where an array of attribute names is expected.
let payload = json!({"attributesToRetrieve": "doggo"});
let (body, status) = index.search_post(payload).await;
snapshot!(status, @"400 Bad Request");
snapshot!(json_string!(body), @r###"
{
"message": "Invalid value type at `.attributesToRetrieve`: expected an array, but found a string: `\"doggo\"`",
"code": "invalid_search_attributes_to_retrieve",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_retrieve"
}
"###);
// No GET counterpart: a GET search accepts anything as an array of strings,
// so `attributes_to_retrieve` cannot be made to fail that way.
}
/// `retrieveVectors` must be a boolean: non-boolean JSON values (POST) and
/// unparsable query-string values (GET) are all rejected with
/// `invalid_search_retrieve_vectors`.
#[actix_rt::test]
async fn search_bad_retrieve_vectors() {
let server = Server::new().await;
let index = server.index("test");
// POST: a string instead of a boolean.
let payload = json!({"retrieveVectors": "doggo"});
let (body, status) = index.search_post(payload).await;
snapshot!(status, @"400 Bad Request");
snapshot!(json_string!(body), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
// POST: an array wrapping a boolean is still not a boolean.
let payload = json!({"retrieveVectors": [true]});
let (body, status) = index.search_post(payload).await;
snapshot!(status, @"400 Bad Request");
snapshot!(json_string!(body), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
// GET: an empty value cannot be parsed as a boolean.
let query = "retrieveVectors=";
let (body, status) = index.search_get(query).await;
snapshot!(status, @"400 Bad Request");
snapshot!(json_string!(body), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
// GET: an arbitrary word cannot be parsed as a boolean either.
let query = "retrieveVectors=doggo";
let (body, status) = index.search_get(query).await;
snapshot!(status, @"400 Bad Request");
snapshot!(json_string!(body), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
}
#[actix_rt::test] #[actix_rt::test]
async fn search_bad_attributes_to_crop() { async fn search_bad_attributes_to_crop() {
let server = Server::new().await; let server = Server::new().await;

View File

@ -124,29 +124,29 @@ async fn simple_search() {
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}), json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}}}]"###);
snapshot!(response["semanticHitCount"], @"0"); snapshot!(response["semanticHitCount"], @"0");
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true}), json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true, "retrieveVectors": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"2"); snapshot!(response["semanticHitCount"], @"2");
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true}), json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true, "retrieveVectors": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
} }
@ -204,10 +204,10 @@ async fn distribution_shift() {
let server = Server::new().await; let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true});
let (response, code) = index.search_post(search.clone()).await; let (response, code) = index.search_post(search.clone()).await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
let (response, code) = index let (response, code) = index
.update_settings(json!({ .update_settings(json!({
@ -228,7 +228,7 @@ async fn distribution_shift() {
let (response, code) = index.search_post(search).await; let (response, code) = index.search_post(search).await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7}]"###);
} }
#[actix_rt::test] #[actix_rt::test]
@ -239,20 +239,23 @@ async fn highlighter() {
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.2}, "hybrid": {"semanticRatio": 0.2},
"attributesToHighlight": [ "retrieveVectors": true,
"desc" "attributesToHighlight": [
"desc",
"_vectors",
], ],
"highlightPreTag": "**BEGIN**", "highlightPreTag": "**BEGIN**",
"highlightPostTag": "**END**" "highlightPostTag": "**END**",
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###);
snapshot!(response["semanticHitCount"], @"0"); snapshot!(response["semanticHitCount"], @"0");
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.8}, "hybrid": {"semanticRatio": 0.8},
"retrieveVectors": true,
"showRankingScore": true, "showRankingScore": true,
"attributesToHighlight": [ "attributesToHighlight": [
"desc" "desc"
@ -262,13 +265,14 @@ async fn highlighter() {
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
// no highlighting on full semantic // no highlighting on full semantic
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 1.0}, "hybrid": {"semanticRatio": 1.0},
"retrieveVectors": true,
"showRankingScore": true, "showRankingScore": true,
"attributesToHighlight": [ "attributesToHighlight": [
"desc" "desc"
@ -278,7 +282,7 @@ async fn highlighter() {
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
} }
@ -361,12 +365,12 @@ async fn single_document() {
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###); snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0}"###);
snapshot!(response["semanticHitCount"], @"1"); snapshot!(response["semanticHitCount"], @"1");
} }
@ -377,25 +381,25 @@ async fn query_combination() {
// search without query and vector, but with hybrid => still placeholder // search without query and vector, but with hybrid => still placeholder
let (response, code) = index let (response, code) = index
.search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null"); snapshot!(response["semanticHitCount"], @"null");
// same with a different semantic ratio // same with a different semantic ratio
let (response, code) = index let (response, code) = index
.search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true})) .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null"); snapshot!(response["semanticHitCount"], @"null");
// wrong vector dimensions // wrong vector dimensions
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -410,34 +414,34 @@ async fn query_combination() {
// full vector // full vector
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.6581138968467712}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
// full keyword, without a query // full keyword, without a query
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null"); snapshot!(response["semanticHitCount"], @"null");
// query + vector, full keyword => keyword // query + vector, full keyword => keyword
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["semanticHitCount"], @"null"); snapshot!(response["semanticHitCount"], @"null");
// query + vector, no hybrid keyword => // query + vector, no hybrid keyword =>
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true})) .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -453,7 +457,7 @@ async fn query_combination() {
// full vector, without a vector => error // full vector, without a vector => error
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}),
) )
.await; .await;
@ -470,11 +474,93 @@ async fn query_combination() {
// hybrid without a vector => full keyword // hybrid without a vector => full keyword
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}), json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}),
) )
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["semanticHitCount"], @"0"); snapshot!(response["semanticHitCount"], @"0");
} }
/// Checks that a hybrid search with `retrieveVectors: true` returns each hit's
/// `_vectors` field, and that the field is no longer returned once `_vectors`
/// is left out of `displayedAttributes`.
#[actix_rt::test]
async fn retrieve_vectors() {
let server = Server::new().await;
// NOTE(review): the helper presumably configures an embedder named `default`
// that generates the embeddings (the snapshot below shows `regenerate: true`)
// — confirm against `index_with_documents_hf`.
let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
)
.await;
snapshot!(code, @"200 OK");
// The redaction swaps every hit's `_vectors.default.embeddings` for the
// placeholder "[vectors]", so the snapshot only pins the shape of `_vectors`
// and the `regenerate` flag, not the embedding values themselves.
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
[
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
},
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
}
]
"###);
// remove `_vectors` from displayed attributes
// (only `id`, `title` and `desc` are listed as displayed from here on)
let (response, code) =
index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;
// The settings update is accepted as a task (202); wait for it to be
// processed before searching again.
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
// Same request as the first search, `retrieveVectors` still set to true.
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
)
.await;
snapshot!(code, @"200 OK");
// The hits come back in the same order but without any `_vectors` field,
// so the redaction selector has nothing to match here.
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
[
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2"
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3"
},
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1"
}
]
"###);
}

View File

@ -1290,21 +1290,38 @@ async fn experimental_feature_vector_store() {
index.add_documents(json!(documents), None).await; index.add_documents(json!(documents), None).await;
index.wait_task(0).await; index.wait_task(0).await;
let (response, code) = index index
.search_post(json!({ .search(json!({
"vector": [1.0, 2.0, 3.0], "vector": [1.0, 2.0, 3.0],
"showRankingScore": true "showRankingScore": true
})) }), |response, code|{
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
})
.await;
index
.search(json!({
"retrieveVectors": true,
"showRankingScore": true
}), |response, code|{
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
})
.await; .await;
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `vector` as a query parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
let (response, code) = server.set_features(json!({"vectorStore": true})).await; let (response, code) = server.set_features(json!({"vectorStore": true})).await;
meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(code, @"200 OK");
@ -1337,6 +1354,7 @@ async fn experimental_feature_vector_store() {
.search_post(json!({ .search_post(json!({
"vector": [1.0, 2.0, 3.0], "vector": [1.0, 2.0, 3.0],
"showRankingScore": true, "showRankingScore": true,
"retrieveVectors": true,
})) }))
.await; .await;
@ -1348,11 +1366,16 @@ async fn experimental_feature_vector_store() {
"title": "Shazam!", "title": "Shazam!",
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": [ "manual": {
1.0, "embeddings": [
2.0, [
3.0 1.0,
] 2.0,
3.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 1.0 "_rankingScore": 1.0
}, },
@ -1360,11 +1383,16 @@ async fn experimental_feature_vector_store() {
"title": "Captain Marvel", "title": "Captain Marvel",
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": [ "manual": {
1.0, "embeddings": [
2.0, [
54.0 1.0,
] 2.0,
54.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.9129111766815186 "_rankingScore": 0.9129111766815186
}, },
@ -1372,11 +1400,16 @@ async fn experimental_feature_vector_store() {
"title": "Gläss", "title": "Gläss",
"id": "450465", "id": "450465",
"_vectors": { "_vectors": {
"manual": [ "manual": {
-100.0, "embeddings": [
340.0, [
90.0 -100.0,
] 340.0,
90.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.8106412887573242 "_rankingScore": 0.8106412887573242
}, },
@ -1384,11 +1417,16 @@ async fn experimental_feature_vector_store() {
"title": "How to Train Your Dragon: The Hidden World", "title": "How to Train Your Dragon: The Hidden World",
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": [ "manual": {
-100.0, "embeddings": [
231.0, [
32.0 -100.0,
] 231.0,
32.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.7412010431289673 "_rankingScore": 0.7412010431289673
}, },
@ -1396,11 +1434,16 @@ async fn experimental_feature_vector_store() {
"title": "Escape Room", "title": "Escape Room",
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": {
10.0, "embeddings": [
-23.0, [
32.0 10.0,
] -23.0,
32.0
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.6972063183784485 "_rankingScore": 0.6972063183784485
} }

View File

@ -756,3 +756,54 @@ async fn filter_reserved_geo_point_string() {
}) })
.await; .await;
} }
/// Checks that invalid `retrieveVectors` values sent to the similar route are
/// rejected with `400 Bad Request` and the `invalid_similar_retrieve_vectors`
/// error code, through both `similar_post` (JSON payload) and `similar_get`
/// (`key=value` parameter string).
#[actix_rt::test]
async fn similar_bad_retrieve_vectors() {
let server = Server::new().await;
// Enable the vector store feature; the response of this call is not checked.
server.set_features(json!({"vectorStore": true})).await;
let index = server.index("test");
// JSON payload: a string where a boolean is expected.
let (response, code) = index.similar_post(json!({"retrieveVectors": "doggo"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
// JSON payload: an array where a boolean is expected.
let (response, code) = index.similar_post(json!({"retrieveVectors": [true]})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
// Parameter string: an empty value cannot be parsed as a boolean.
let (response, code) = index.similar_get("retrieveVectors=").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
// Parameter string: arbitrary text cannot be parsed as a boolean either.
let (response, code) = index.similar_get("retrieveVectors=doggo").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
}

View File

@ -78,7 +78,7 @@ async fn basic() {
index.wait_task(value.uid()).await; index.wait_task(value.uid()).await;
index index
.similar(json!({"id": 143}), |response, code| { .similar(json!({"id": 143, "retrieveVectors": true}), |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -87,11 +87,16 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.1, "embeddings": [
0.6, [
0.8 0.10000000149011612,
] 0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
} }
}, },
{ {
@ -99,11 +104,16 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.6, "embeddings": [
0.8, [
-0.2 0.6000000238418579,
] 0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
} }
}, },
{ {
@ -111,11 +121,16 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.7, "embeddings": [
0.7, [
-0.4 0.699999988079071,
] 0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
} }
}, },
{ {
@ -123,11 +138,16 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.8, "embeddings": [
0.4, [
-0.5 0.800000011920929,
] 0.4000000059604645,
-0.5
]
],
"regenerate": false
}
} }
} }
] ]
@ -136,7 +156,7 @@ async fn basic() {
.await; .await;
index index
.similar(json!({"id": "299537"}), |response, code| { .similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -145,11 +165,16 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.7, "embeddings": [
0.7, [
-0.4 0.699999988079071,
] 0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
} }
}, },
{ {
@ -157,11 +182,16 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.8, "embeddings": [
0.4, [
-0.5 0.800000011920929,
] 0.4000000059604645,
-0.5
]
],
"regenerate": false
}
} }
}, },
{ {
@ -169,11 +199,16 @@ async fn basic() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.1, "embeddings": [
0.6, [
0.8 0.10000000149011612,
] 0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
} }
}, },
{ {
@ -181,11 +216,16 @@ async fn basic() {
"release_year": 1930, "release_year": 1930,
"id": "143", "id": "143",
"_vectors": { "_vectors": {
"manual": [ "manual": {
-0.5, "embeddings": [
0.3, [
0.85 -0.5,
] 0.30000001192092896,
0.8500000238418579
]
],
"regenerate": false
}
} }
} }
] ]
@ -228,7 +268,7 @@ async fn ranking_score_threshold() {
index index
.similar( .similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0}), json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0, "retrieveVectors": true}),
|response, code| { |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4");
@ -239,11 +279,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.1, "embeddings": [
0.6, [
0.8 0.10000000149011612,
] 0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.890957772731781 "_rankingScore": 0.890957772731781
}, },
@ -252,11 +297,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.6, "embeddings": [
0.8, [
-0.2 0.6000000238418579,
] 0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.39060014486312866 "_rankingScore": 0.39060014486312866
}, },
@ -265,11 +315,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.7, "embeddings": [
0.7, [
-0.4 0.699999988079071,
] 0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.2819308042526245 "_rankingScore": 0.2819308042526245
}, },
@ -278,11 +333,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.8, "embeddings": [
0.4, [
-0.5 0.800000011920929,
] 0.4000000059604645,
-0.5
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.1662663221359253 "_rankingScore": 0.1662663221359253
} }
@ -294,7 +354,7 @@ async fn ranking_score_threshold() {
index index
.similar( .similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2}), json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2, "retrieveVectors": true}),
|response, code| { |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3");
@ -305,11 +365,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.1, "embeddings": [
0.6, [
0.8 0.10000000149011612,
] 0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.890957772731781 "_rankingScore": 0.890957772731781
}, },
@ -318,11 +383,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.6, "embeddings": [
0.8, [
-0.2 0.6000000238418579,
] 0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.39060014486312866 "_rankingScore": 0.39060014486312866
}, },
@ -331,11 +401,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.7, "embeddings": [
0.7, [
-0.4 0.699999988079071,
] 0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.2819308042526245 "_rankingScore": 0.2819308042526245
} }
@ -347,7 +422,7 @@ async fn ranking_score_threshold() {
index index
.similar( .similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3}), json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3, "retrieveVectors": true}),
|response, code| { |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2");
@ -358,11 +433,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.1, "embeddings": [
0.6, [
0.8 0.10000000149011612,
] 0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.890957772731781 "_rankingScore": 0.890957772731781
}, },
@ -371,11 +451,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.6, "embeddings": [
0.8, [
-0.2 0.6000000238418579,
] 0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.39060014486312866 "_rankingScore": 0.39060014486312866
} }
@ -387,7 +472,7 @@ async fn ranking_score_threshold() {
index index
.similar( .similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6}), json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6, "retrieveVectors": true}),
|response, code| { |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1");
@ -398,11 +483,16 @@ async fn ranking_score_threshold() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.1, "embeddings": [
0.6, [
0.8 0.10000000149011612,
] 0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
}, },
"_rankingScore": 0.890957772731781 "_rankingScore": 0.890957772731781
} }
@ -414,7 +504,7 @@ async fn ranking_score_threshold() {
index index
.similar( .similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9}), json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9, "retrieveVectors": true}),
|response, code| { |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @"[]"); snapshot!(json_string!(response["hits"]), @"[]");
@ -456,71 +546,97 @@ async fn filter() {
index.wait_task(value.uid()).await; index.wait_task(value.uid()).await;
index index
.similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| { .similar(
snapshot!(code, @"200 OK"); json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}),
snapshot!(json_string!(response["hits"]), @r###" |response, code| {
[ snapshot!(code, @"200 OK");
{ snapshot!(json_string!(response["hits"]), @r###"
"title": "Captain Marvel", [
"release_year": 2019, {
"id": "299537", "title": "Captain Marvel",
"_vectors": { "release_year": 2019,
"manual": [ "id": "299537",
0.6, "_vectors": {
0.8, "manual": {
-0.2 "embeddings": [
] [
} 0.6000000238418579,
}, 0.800000011920929,
{ -0.20000000298023224
"title": "How to Train Your Dragon: The Hidden World", ]
"release_year": 2019, ],
"id": "166428", "regenerate": false
"_vectors": { }
"manual": [ }
0.7, },
0.7, {
-0.4 "title": "How to Train Your Dragon: The Hidden World",
] "release_year": 2019,
} "id": "166428",
}, "_vectors": {
{ "manual": {
"title": "Shazam!", "embeddings": [
"release_year": 2019, [
"id": "287947", 0.699999988079071,
"_vectors": { 0.699999988079071,
"manual": [ -0.4000000059604645
0.8, ]
0.4, ],
-0.5 "regenerate": false
] }
} }
} },
] {
"###); "title": "Shazam!",
}) "release_year": 2019,
"id": "287947",
"_vectors": {
"manual": {
"embeddings": [
[
0.800000011920929,
0.4000000059604645,
-0.5
]
],
"regenerate": false
}
}
}
]
"###);
},
)
.await; .await;
index index
.similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| { .similar(
snapshot!(code, @"200 OK"); json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}),
snapshot!(json_string!(response["hits"]), @r###" |response, code| {
[ snapshot!(code, @"200 OK");
{ snapshot!(json_string!(response["hits"]), @r###"
"title": "All Quiet on the Western Front", [
"release_year": 1930, {
"id": "143", "title": "All Quiet on the Western Front",
"_vectors": { "release_year": 1930,
"manual": [ "id": "143",
-0.5, "_vectors": {
0.3, "manual": {
0.85 "embeddings": [
] [
} -0.5,
} 0.30000001192092896,
] 0.8500000238418579
"###); ]
}) ],
"regenerate": false
}
}
}
]
"###);
},
)
.await; .await;
} }
@ -557,7 +673,7 @@ async fn limit_and_offset() {
index.wait_task(value.uid()).await; index.wait_task(value.uid()).await;
index index
.similar(json!({"id": 143, "limit": 1}), |response, code| { .similar(json!({"id": 143, "limit": 1, "retrieveVectors": true}), |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -566,11 +682,16 @@ async fn limit_and_offset() {
"release_year": 2019, "release_year": 2019,
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": {
0.1, "embeddings": [
0.6, [
0.8 0.10000000149011612,
] 0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
} }
} }
] ]
@ -579,24 +700,32 @@ async fn limit_and_offset() {
.await; .await;
index index
.similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| { .similar(
snapshot!(code, @"200 OK"); json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}),
snapshot!(json_string!(response["hits"]), @r###" |response, code| {
[ snapshot!(code, @"200 OK");
{ snapshot!(json_string!(response["hits"]), @r###"
"title": "Captain Marvel", [
"release_year": 2019, {
"id": "299537", "title": "Captain Marvel",
"_vectors": { "release_year": 2019,
"manual": [ "id": "299537",
0.6, "_vectors": {
0.8, "manual": {
-0.2 "embeddings": [
] [
} 0.6000000238418579,
} 0.800000011920929,
] -0.20000000298023224
"###); ]
}) ],
"regenerate": false
}
}
}
]
"###);
},
)
.await; .await;
} }

View File

@ -0,0 +1,227 @@
mod settings;
use meili_snap::{json_string, snapshot};
use crate::common::index::Index;
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
/// Follows what the documents route returns under `_vectors` for a
/// `userProvided` embedder as documents are added, sent again with different
/// vectors, and deleted.
#[actix_rt::test]
async fn add_remove_user_provided() {
let server = Server::new().await;
let index = server.index("doggo");
// Turn on the vector store feature and pin the feature set that is returned.
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
// Declare a 3-dimensional `userProvided` embedder named `manual`.
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
// Both documents provide their `manual` embedding as a bare array of numbers.
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
// The status code of the fetch is ignored; only the payload is snapshotted.
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
// Each embedding comes back in object form: a list of float embeddings plus
// `regenerate: false`.
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [
[
1.0,
1.0,
1.0
]
],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
// Send both documents again: kefir with a different vector, echo with
// `manual` set to null.
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [10, 10, 10] }},
{"id": 1, "name": "echo", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
// kefir now shows the new vector; echo is left with an empty `_vectors` object.
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
10.0,
10.0,
10.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
// Delete document 0 (kefir).
let (value, code) = index.delete_document(0).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
// Only echo remains, still with an empty `_vectors` object.
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 1,
"name": "echo",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 1
}
"###);
}
/// Test helper that prepares the `doggo` index with user-provided embeddings.
///
/// It enables the `vectorStore` feature, registers a `userProvided` embedder named `manual`
/// with 3 dimensions, then adds five documents and waits for the addition task.
/// The documents spell `_vectors.manual` in several shapes: a single vector, an array of
/// vectors, and the `{ "regenerate": ..., "embeddings": ... }` object form holding either a
/// single vector or several vectors.
///
/// Returns the handle to the `doggo` index.
async fn generate_default_user_provided_documents(server: &Server) -> Index {
let index = server.index("doggo");
// Enable the `vectorStore` feature and check the resulting feature set.
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
// Register the `manual` embedder and wait for the settings task before adding documents.
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
// Documents 0 and 1 provide a single vector, document 2 an array of vectors,
// documents 3 and 4 use the object form with `regenerate: false`.
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }},
{"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }},
{"id": 3, "name": "intel", "_vectors": { "manual": { "regenerate": false, "embeddings": [3, 3, 3] }}},
{"id": 4, "name": "max", "_vectors": { "manual": { "regenerate": false, "embeddings": [[4, 4, 4], [4, 4, 5]] }}},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
// Wait for the document addition so callers get a fully populated index.
index.wait_task(value.uid()).await;
index
}
// Clearing all the documents of an index must empty both the documents database and the
// vector store: no document is listed anymore and a vector search returns no hit.
#[actix_rt::test]
async fn clear_documents() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
let (value, _code) = index.clear_all_documents().await;
index.wait_task(value.uid()).await;
// Make sure the documents DB has been cleared
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [],
"offset": 0,
"limit": 20,
"total": 0
}
"###);
// Make sure the arroy DB has been cleared
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
// `processingTimeMs` is a wall-clock measurement: redact it so that the snapshot does not
// depend on how fast the machine running the test answers the search.
snapshot!(json_string!(documents, { ".processingTimeMs" => "[duration]" }), @r###"
{
"hits": [],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 0,
"semanticHitCount": 0
}
"###);
}

View File

@ -0,0 +1,228 @@
use meili_snap::{json_string, snapshot};
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
use crate::vector::generate_default_user_provided_documents;
// Checks that an embedder first declared with an empty configuration can afterwards be
// updated into a `userProvided` embedder with explicit dimensions.
#[actix_rt::test]
async fn update_embedder() {
let server = Server::new().await;
let index = server.index("doggo");
// Enable the `vectorStore` feature and check the resulting feature set.
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
// First settings update: declare the `manual` embedder with an empty configuration.
let (response, code) = index
.update_settings(json!({
"embedders": { "manual": {}},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
// Second settings update: turn `manual` into a 2-dimensional `userProvided` embedder.
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 2,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
// The second task (uid 1) must succeed and report the new embedder in its details.
let ret = server.wait_task(response.uid()).await;
snapshot!(ret, @r###"
{
"uid": 1,
"indexUid": "doggo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 2
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
// Resetting the settings of an index that holds user-provided embeddings must keep the
// documents and their vectors retrievable, while the embedder itself is no longer usable
// for vector search.
#[actix_rt::test]
async fn reset_embedder_documents() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
// Reset all the settings, which includes the `manual` embedder set up by the helper.
let (response, code) = index.delete_settings().await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
// Make sure the documents are still present
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
limit: None,
offset: None,
retrieve_vectors: false,
fields: None,
})
.await;
// Without `retrieve_vectors`, the documents are returned without any `_vectors` field.
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir"
},
{
"id": 1,
"name": "echo"
},
{
"id": 2,
"name": "billou"
},
{
"id": 3,
"name": "intel"
},
{
"id": 4,
"name": "max"
}
],
"offset": 0,
"limit": 20,
"total": 5
}
"###);
// Make sure we are still able to retrieve their vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
// Every vector comes back in the object form, as an array of embeddings with
// `regenerate: false`, whatever shape was used when the document was sent.
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [
[
1.0,
1.0,
1.0
]
],
"regenerate": false
}
}
},
{
"id": 2,
"name": "billou",
"_vectors": {
"manual": {
"embeddings": [
[
2.0,
2.0,
2.0
],
[
2.0,
2.0,
3.0
]
],
"regenerate": false
}
}
},
{
"id": 3,
"name": "intel",
"_vectors": {
"manual": {
"embeddings": [
[
3.0,
3.0,
3.0
]
],
"regenerate": false
}
}
},
{
"id": 4,
"name": "max",
"_vectors": {
"manual": {
"embeddings": [
[
4.0,
4.0,
4.0
],
[
4.0,
4.0,
5.0
]
],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 5
}
"###);
// Make sure the arroy DB has been cleared: the vector search is now rejected with an
// `invalid_embedder` error because no embedder named `default` can be found.
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
snapshot!(json_string!(documents), @r###"
{
"message": "Cannot find embedder with name `default`.",
"code": "invalid_embedder",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_embedder"
}
"###);
}

View File

@ -44,7 +44,7 @@ once_cell = "1.19.0"
ordered-float = "4.2.0" ordered-float = "4.2.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] } rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.8.0" rayon = "1.8.0"
roaring = "0.10.2" roaring = { version = "0.10.2", features = ["serde"] }
rstar = { version = "0.11.0", features = ["serde"] } rstar = { version = "0.11.0", features = ["serde"] }
serde = { version = "1.0.195", features = ["derive"] } serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] } serde_json = { version = "1.0.111", features = ["preserve_order"] }
@ -71,10 +71,10 @@ csv = "1.3.0"
candle-core = { version = "0.4.1" } candle-core = { version = "0.4.1" }
candle-transformers = { version = "0.4.1" } candle-transformers = { version = "0.4.1" }
candle-nn = { version = "0.4.1" } candle-nn = { version = "0.4.1" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [ tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
"onig", "onig",
] } ] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
"online", "online",
] } ] }
tiktoken-rs = "0.5.8" tiktoken-rs = "0.5.8"

View File

@ -4,6 +4,7 @@ use std::collections::HashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::{FieldId, FieldsIdsMap, Weight}; use crate::{FieldId, FieldsIdsMap, Weight};
#[derive(Debug, Default, Serialize, Deserialize)] #[derive(Debug, Default, Serialize, Deserialize)]
@ -23,7 +24,13 @@ impl FieldidsWeightsMap {
/// Should only be called in the case there are NO searchable attributes. /// Should only be called in the case there are NO searchable attributes.
/// All the fields will be inserted in the order of the fields ids map with a weight of 0. /// All the fields will be inserted in the order of the fields ids map with a weight of 0.
pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } FieldidsWeightsMap {
map: fid_map
.iter()
.filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
.map(|(fid, _name)| (fid, 0))
.collect(),
}
} }
/// Removes a field id from the map, returning the associated weight previously in the map. /// Removes a field id from the map, returning the associated weight previously in the map.

View File

@ -41,6 +41,16 @@ impl FieldsIdsMap {
} }
} }
/// Collect the id of the field called `name` together with the ids of all the fields
/// nested under it.
pub fn nested_ids(&self, name: &str) -> Vec<FieldId> {
    let mut ids = Vec::new();
    // Walk the names in order starting at `name`; every name sharing the `name` prefix is
    // contiguous from there, so the scan stops at the first name without that prefix.
    for (field_name, field_id) in self.names_ids.range(name.to_string()..) {
        if !field_name.starts_with(name) {
            break;
        }
        // Sharing a prefix is not enough: only keep the names that `is_faceted_by` accepts
        // for `name`.
        if crate::is_faceted_by(field_name, name) {
            ids.push(*field_id);
        }
    }
    ids
}
/// Get the id of a field based on its name. /// Get the id of a field based on its name.
pub fn id(&self, name: &str) -> Option<FieldId> { pub fn id(&self, name: &str) -> Option<FieldId> {
self.names_ids.get(name).copied() self.names_ids.get(name).copied()
@ -126,4 +136,32 @@ mod tests {
assert_eq!(iter.next(), Some((3, "title"))); assert_eq!(iter.next(), Some((3, "title")));
assert_eq!(iter.next(), None); assert_eq!(iter.next(), None);
} }
// Exercises `FieldsIdsMap::nested_ids` on a field, on a prefix that is not itself a field,
// and on a name that matches nothing.
#[test]
fn nested_fields() {
let mut map = FieldsIdsMap::new();
// `doggolution` shares the `doggo` prefix without being nested under it.
assert_eq!(map.insert("id"), Some(0));
assert_eq!(map.insert("doggo"), Some(1));
assert_eq!(map.insert("doggo.name"), Some(2));
assert_eq!(map.insert("doggolution"), Some(3));
assert_eq!(map.insert("doggo.breed.name"), Some(4));
assert_eq!(map.insert("description"), Some(5));
// `doggo` itself and its nested fields are returned, `doggolution` (3) is not.
// The ids follow the order of the names (`doggo`, `doggo.breed.name`, `doggo.name`),
// not the insertion order.
insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###"
[
1,
4,
2,
]
"###);
// `doggo.breed` was never inserted as a field but still yields its nested field.
insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###"
[
4,
]
"###);
// A name that matches no field yields an empty list.
insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]");
}
} }

View File

@ -9,6 +9,7 @@ use heed::types::*;
use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use rstar::RTree; use rstar::RTree;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime; use time::OffsetDateTime;
use crate::documents::PrimaryKey; use crate::documents::PrimaryKey;
@ -23,6 +24,7 @@ use crate::heed_codec::{
}; };
use crate::order_by_map::OrderByMap; use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision; use crate::proximity::ProximityPrecision;
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::vector::{Embedding, EmbeddingConfig}; use crate::vector::{Embedding, EmbeddingConfig};
use crate::{ use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -644,6 +646,7 @@ impl Index {
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
user_fields: &[&str], user_fields: &[&str],
non_searchable_fields_ids: &[FieldId],
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
) -> Result<()> { ) -> Result<()> {
// We can write the user defined searchable fields as-is. // We can write the user defined searchable fields as-is.
@ -662,6 +665,7 @@ impl Index {
for (weight, user_field) in user_fields.iter().enumerate() { for (weight, user_field) in user_fields.iter().enumerate() {
if crate::is_faceted_by(field_from_map, user_field) if crate::is_faceted_by(field_from_map, user_field)
&& !real_fields.contains(&field_from_map) && !real_fields.contains(&field_from_map)
&& !non_searchable_fields_ids.contains(&id)
{ {
real_fields.push(field_from_map); real_fields.push(field_from_map);
@ -708,6 +712,7 @@ impl Index {
Ok(self Ok(self
.fields_ids_map(rtxn)? .fields_ids_map(rtxn)?
.names() .names()
.filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
.map(|field| Cow::Owned(field.to_string())) .map(|field| Cow::Owned(field.to_string()))
.collect()) .collect())
}) })
@ -1568,12 +1573,16 @@ impl Index {
Ok(script_language) Ok(script_language)
} }
/// Put the embedding configs:
/// 1. The name of the embedder
/// 2. The configuration option for this embedder
/// 3. The list of documents with a user provided embedding
pub(crate) fn put_embedding_configs( pub(crate) fn put_embedding_configs(
&self, &self,
wtxn: &mut RwTxn<'_>, wtxn: &mut RwTxn<'_>,
configs: Vec<(String, EmbeddingConfig)>, configs: Vec<IndexEmbeddingConfig>,
) -> heed::Result<()> { ) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put( self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put(
wtxn, wtxn,
main_key::EMBEDDING_CONFIGS, main_key::EMBEDDING_CONFIGS,
&configs, &configs,
@ -1584,13 +1593,10 @@ impl Index {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS) self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS)
} }
pub fn embedding_configs( pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result<Vec<IndexEmbeddingConfig>> {
&self,
rtxn: &RoTxn<'_>,
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> {
Ok(self Ok(self
.main .main
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>() .remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>()
.get(rtxn, main_key::EMBEDDING_CONFIGS)? .get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default()) .unwrap_or_default())
} }
@ -1662,6 +1668,13 @@ impl Index {
} }
} }
/// The embedding configuration of one embedder, as written by `put_embedding_configs` and
/// read back by `embedding_configs`.
#[derive(Debug, Deserialize, Serialize)]
pub struct IndexEmbeddingConfig {
/// The name of the embedder.
pub name: String,
/// The configuration option for this embedder.
pub config: EmbeddingConfig,
/// The list of documents with a user provided embedding for this embedder.
pub user_provided: RoaringBitmap,
}
#[cfg(test)] #[cfg(test)]
pub(crate) mod tests { pub(crate) mod tests {
use std::collections::HashSet; use std::collections::HashSet;
@ -1669,15 +1682,17 @@ pub(crate) mod tests {
use big_s::S; use big_s::S;
use heed::{EnvOpenOptions, RwTxn}; use heed::{EnvOpenOptions, RwTxn};
use maplit::hashset; use maplit::{btreemap, hashset};
use tempfile::TempDir; use tempfile::TempDir;
use crate::documents::DocumentsBatchReader; use crate::documents::DocumentsBatchReader;
use crate::error::{Error, InternalError}; use crate::error::{Error, InternalError};
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::update::{ use crate::update::{
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
Settings,
}; };
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
pub(crate) struct TempIndex { pub(crate) struct TempIndex {
@ -2783,4 +2798,95 @@ pub(crate) mod tests {
] ]
"###); "###);
} }
// Whatever the settings are, the content of `_vectors` must never be reachable through a
// keyword search or a filter. The same checks are repeated with the default settings, with
// `_vectors` explicitly declared as searchable and filterable, and with an embedder declared.
#[test]
fn vectors_are_never_indexed_as_searchable_or_filterable() {
let index = TempIndex::new();
index
.add_documents(documents!([
{ "id": 0, "_vectors": { "doggo": [2345] } },
{ "id": 1, "_vectors": { "doggo": [6789] } },
]))
.unwrap();
// Step 1: default settings. The `_vectors` fields are registered in the fields ids map
// but only `id` is searchable and has a weight.
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @r###"["id"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
"###);
// Searching for a value that only appears in a vector must not match anything.
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
drop(rtxn);
// Step 2: explicitly ask for the `_vectors` fields to be searchable and filterable.
index
.update_settings(|settings| {
settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]);
settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]);
})
.unwrap();
// The request is not honored: no field ends up searchable nor weighted.
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @"[]");
db_snap!(index, fieldids_weights_map, @r###"
fid weight
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
// Filtering on a vector value must not match anything either.
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
// Step 3: declare a 1-dimensional user-provided embedder named `doggo`.
index
.update_settings(|settings| {
settings.set_embedder_settings(btreemap! {
S("doggo") => Setting::Set(EmbeddingSettings {
dimensions: Setting::Set(1),
source: Setting::Set(EmbedderSource::UserProvided),
..EmbeddingSettings::default()}),
});
})
.unwrap();
// Declaring the embedder changes nothing: the databases are the same as in step 2.
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @"[]");
db_snap!(index, fieldids_weights_map, @r###"
fid weight
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
}
} }

View File

@ -22,7 +22,7 @@ pub enum SearchEvents {
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 }, RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 }, RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, RankingRuleEndIteration { ranking_rule_idx: usize },
ExtendResults { new: Vec<u32> }, ExtendResults { new: Vec<u32> },
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> }, ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> }, ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
@ -123,12 +123,9 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
&mut self, &mut self,
ranking_rule_idx: usize, ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<QueryGraph>, _ranking_rule: &dyn RankingRule<QueryGraph>,
universe: &RoaringBitmap, _universe: &RoaringBitmap,
) { ) {
self.events.push(SearchEvents::RankingRuleEndIteration { self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
ranking_rule_idx,
universe_len: universe.len(),
});
self.location.pop(); self.location.pop();
} }
fn add_to_results(&mut self, docids: &[u32]) { fn add_to_results(&mut self, docids: &[u32]) {
@ -326,7 +323,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_skip_bucket(bucket_len)?; self.write_skip_bucket(bucket_len)?;
} }
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => { SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_end_iteration()?; self.write_end_iteration()?;
} }

View File

@ -1,244 +0,0 @@
---
source: milli/src/search/new/tests/attribute_fid.rs
expression: "format!(\"{document_ids_scores:#?}\")"
---
[
(
2,
[
Fid(
Rank {
rank: 19,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
(
6,
[
Fid(
Rank {
rank: 15,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
5,
[
Fid(
Rank {
rank: 14,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
4,
[
Fid(
Rank {
rank: 13,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
3,
[
Fid(
Rank {
rank: 12,
max_rank: 19,
},
),
Position(
Rank {
rank: 83,
max_rank: 91,
},
),
],
),
(
9,
[
Fid(
Rank {
rank: 11,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
8,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
7,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 73,
max_rank: 91,
},
),
],
),
(
11,
[
Fid(
Rank {
rank: 7,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
10,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
13,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
12,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 78,
max_rank: 91,
},
),
],
),
(
14,
[
Fid(
Rank {
rank: 5,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
0,
[
Fid(
Rank {
rank: 1,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
]

View File

@ -1,7 +0,0 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |

View File

@ -1,7 +0,0 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |

View File

@ -64,6 +64,13 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
// Remove all user-provided bits from the configs
let mut configs = self.index.embedding_configs(self.wtxn)?;
for config in configs.iter_mut() {
config.user_provided.clear();
}
self.index.put_embedding_configs(self.wtxn, configs)?;
// Clear the other databases. // Clear the other databases.
external_documents_ids.clear(self.wtxn)?; external_documents_ids.clear(self.wtxn)?;
word_docids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?;

View File

@ -8,18 +8,19 @@ use std::sync::Arc;
use bytemuck::cast_slice; use bytemuck::cast_slice;
use grenad::Writer; use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::index::IndexEmbeddingConfig;
use crate::prompt::Prompt; use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff; use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::settings::{EmbedderAction, ReindexAction};
use crate::vector::Embedder; use crate::vector::Embedder;
use crate::{DocumentId, Result, ThreadPoolNoAbort}; use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort};
/// The length of the elements that are always in the buffer when inserting new values. /// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>(); const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -35,6 +36,8 @@ pub struct ExtractedVectorPoints {
// embedder // embedder
pub embedder_name: String, pub embedder_name: String,
pub embedder: Arc<Embedder>, pub embedder: Arc<Embedder>,
pub add_to_user_provided: RoaringBitmap,
pub remove_from_user_provided: RoaringBitmap,
} }
enum VectorStateDelta { enum VectorStateDelta {
@ -42,12 +45,7 @@ enum VectorStateDelta {
// Remove all vectors, generated or manual, from this document // Remove all vectors, generated or manual, from this document
NowRemoved, NowRemoved,
// Add the manually specified vectors, passed in the other grenad NowManual(Vec<Vec<f32>>),
// Remove any previously generated vectors
// Note: changing the value of the manually specified vector **should not record** this delta
WasGeneratedNowManual(Vec<Vec<f32>>),
ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>),
// Add the vector computed from the specified prompt // Add the vector computed from the specified prompt
// Remove any previous vector // Remove any previous vector
@ -56,14 +54,12 @@ enum VectorStateDelta {
} }
impl VectorStateDelta { impl VectorStateDelta {
fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) { fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
match self { match self {
VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NoChange => Default::default(),
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
VectorStateDelta::WasGeneratedNowManual(add) => { // We always delete the previous vectors
(true, Default::default(), (Default::default(), add)) VectorStateDelta::NowManual(add) => (true, Default::default(), add),
}
VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)),
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
} }
} }
@ -74,12 +70,27 @@ struct EmbedderVectorExtractor {
embedder: Arc<Embedder>, embedder: Arc<Embedder>,
prompt: Arc<Prompt>, prompt: Arc<Prompt>,
// (docid, _index) -> KvWriterDelAdd -> Vector
manual_vectors_writer: Writer<BufWriter<File>>,
// (docid) -> (prompt) // (docid) -> (prompt)
prompts_writer: Writer<BufWriter<File>>, prompts_writer: Writer<BufWriter<File>>,
// (docid) -> () // (docid) -> ()
remove_vectors_writer: Writer<BufWriter<File>>, remove_vectors_writer: Writer<BufWriter<File>>,
// (docid, _index) -> KvWriterDelAdd -> Vector
manual_vectors_writer: Writer<BufWriter<File>>,
// The docids of the documents that contain a user-defined embedding
add_to_user_provided: RoaringBitmap,
action: ExtractionAction,
}
/// State carried by `ExtractionAction::DocumentOperation`, i.e. when the extraction runs for
/// a document operation rather than for a settings change.
struct DocumentOperation {
// The docids of the documents that contain an auto-generated embedding
remove_from_user_provided: RoaringBitmap,
}
/// Why the vector extraction is running for a given embedder.
enum ExtractionAction {
// Built from `ReindexAction::FullReindex` when the settings changed.
SettingsFullReindex,
// Built from `ReindexAction::RegeneratePrompts` when the settings changed;
// `old_prompt` is the prompt taken from the old embedding configs.
SettingsRegeneratePrompts { old_prompt: Arc<Prompt> },
// Built when the settings did not trigger a vector reindex (document operation).
DocumentOperation(DocumentOperation),
}
/// Extracts the embedding vector contained in each document under the `_vectors` field. /// Extracts the embedding vector contained in each document under the `_vectors` field.
@ -89,6 +100,7 @@ struct EmbedderVectorExtractor {
pub fn extract_vector_points<R: io::Read + io::Seek>( pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
embedders_configs: &[IndexEmbeddingConfig],
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
) -> Result<Vec<ExtractedVectorPoints>> { ) -> Result<Vec<ExtractedVectorPoints>> {
let reindex_vectors = settings_diff.reindex_vectors(); let reindex_vectors = settings_diff.reindex_vectors();
@ -97,153 +109,207 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
let new_fields_ids_map = &settings_diff.new.fields_ids_map; let new_fields_ids_map = &settings_diff.new.fields_ids_map;
// the vector field id may have changed // the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
// filter the old vector fid if the settings has been changed forcing reindexing.
let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors);
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
let mut extractors = Vec::new(); let mut extractors = Vec::new();
for (embedder_name, (embedder, prompt)) in
settings_diff.new.embedding_configs.clone().into_iter()
{
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> (prompt) let mut configs = settings_diff.new.embedding_configs.clone().into_inner();
let prompts_writer = create_writer( let old_configs = &settings_diff.old.embedding_configs;
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> () if reindex_vectors {
let remove_vectors_writer = create_writer( for (name, action) in settings_diff.embedding_config_updates.iter() {
indexer.chunk_compression_type, match action {
indexer.chunk_compression_level, EmbedderAction::WriteBackToDocuments(_) => continue, // already deleted
tempfile::tempfile()?, EmbedderAction::Reindex(action) => {
); let Some((embedder_name, (embedder, prompt))) = configs.remove_entry(name)
else {
tracing::error!(embedder = name, "Requested embedder config not found");
continue;
};
extractors.push(EmbedderVectorExtractor { // (docid, _index) -> KvWriterDelAdd -> Vector
embedder_name, let manual_vectors_writer = create_writer(
embedder, indexer.chunk_compression_type,
prompt, indexer.chunk_compression_level,
manual_vectors_writer, tempfile::tempfile()?,
prompts_writer, );
remove_vectors_writer,
}); // (docid) -> (prompt)
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let action = match action {
ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex,
ReindexAction::RegeneratePrompts => {
let Some((_, old_prompt)) = old_configs.get(name) else {
tracing::error!(embedder = name, "Old embedder config not found");
continue;
};
ExtractionAction::SettingsRegeneratePrompts { old_prompt }
}
};
extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
prompts_writer,
remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided: RoaringBitmap::new(),
action,
});
}
}
}
} else {
// document operation
for (embedder_name, (embedder, prompt)) in configs.into_iter() {
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> (prompt)
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
prompts_writer,
remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided: RoaringBitmap::new(),
action: ExtractionAction::DocumentOperation(DocumentOperation {
remove_from_user_provided: RoaringBitmap::new(),
}),
});
}
} }
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?; let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {
// this must always be serialized as (docid, external_docid); // this must always be serialized as (docid, external_docid);
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
let (docid_bytes, external_id_bytes) = let (docid_bytes, external_id_bytes) =
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap(); try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
debug_assert!(from_utf8(external_id_bytes).is_ok()); debug_assert!(from_utf8(external_id_bytes).is_ok());
let docid = DocumentId::from_be_bytes(docid_bytes);
let obkv = obkv::KvReader::new(value); let obkv = obkv::KvReader::new(value);
key_buffer.clear(); key_buffer.clear();
key_buffer.extend_from_slice(docid_bytes); key_buffer.extend_from_slice(docid_bytes.as_slice());
// since we only need the primary key when we throw an error we create this getter to // since we only need the primary key when we throw an error we create this getter to
// lazily get it when needed // lazily get it when needed
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) let mut parsed_vectors = ParsedVectorsDiff::new(
.map_err(|error| error.to_crate_error(document_id().to_string()))?; docid,
embedders_configs,
obkv,
old_vectors_fid,
new_vectors_fid,
)
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
for EmbedderVectorExtractor { for EmbedderVectorExtractor {
embedder_name, embedder_name,
embedder: _, embedder: _,
prompt, prompt,
manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided,
action,
} in extractors.iter_mut() } in extractors.iter_mut()
{ {
let delta = match parsed_vectors.remove(embedder_name) { let (old, new) = parsed_vectors.remove(embedder_name);
(Some(old), Some(new)) => { let delta = match action {
// no autogeneration ExtractionAction::SettingsFullReindex => match old {
let del_vectors = old.into_array_of_vectors(); // A full reindex can be triggered either by:
let add_vectors = new.into_array_of_vectors(); // 1. a new embedder
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
if add_vectors.len() > usize::from(u8::MAX) { // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
return Err(crate::Error::UserError(crate::UserError::TooManyVectors( VectorState::Inline(vectors) => {
document_id().to_string(), if !vectors.must_regenerate() {
add_vectors.len(), add_to_user_provided.insert(docid);
)));
}
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
}
(Some(_old), None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
VectorStateDelta::NowRemoved
}
}
(None, Some(new)) => {
// was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::WasGeneratedNowManual(add_vectors)
}
(None, None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(&prompt)
// TODO: this filter works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map)
.unwrap_or_default()
});
let new_prompt =
prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
} }
match vectors.into_array_of_vectors() {
Some(add_vectors) => {
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(
crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
),
));
}
VectorStateDelta::NowManual(add_vectors)
}
None => VectorStateDelta::NoChange,
}
}
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
VectorState::Manual => VectorStateDelta::NoChange,
// generated vectors must be regenerated
VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?,
},
// prompt regeneration is only triggered for existing embedders
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
if old.must_regenerate() {
regenerate_if_prompt_changed(
obkv,
(old_prompt, prompt),
(&old_fields_ids_map, &new_fields_ids_map),
)?
} else { } else {
VectorStateDelta::NowRemoved // we can simply ignore user provided vectors as they are not regenerated and are
// already in the DB since this is an existing embedder
VectorStateDelta::NoChange
} }
} }
ExtractionAction::DocumentOperation(DocumentOperation {
remove_from_user_provided,
}) => extract_vector_document_diff(
docid,
obkv,
prompt,
(add_to_user_provided, remove_from_user_provided),
(old, new),
(&old_fields_ids_map, &new_fields_ids_map),
document_id,
)?,
}; };
// and we finally push the unique vectors into the writer // and we finally push the unique vectors into the writer
push_vectors_diff( push_vectors_diff(
remove_vectors_writer, remove_vectors_writer,
@ -251,7 +317,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer, manual_vectors_writer,
&mut key_buffer, &mut key_buffer,
delta, delta,
reindex_vectors,
)?; )?;
} }
} }
@ -262,43 +327,185 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
embedder_name, embedder_name,
embedder, embedder,
prompt: _, prompt: _,
manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
action,
manual_vectors_writer,
add_to_user_provided,
} in extractors } in extractors
{ {
results.push(ExtractedVectorPoints { let remove_from_user_provided =
// docid, _index -> KvWriterDelAdd -> Vector if let ExtractionAction::DocumentOperation(DocumentOperation {
manual_vectors: writer_into_reader(manual_vectors_writer)?, remove_from_user_provided,
// docid -> () }) = action
remove_vectors: writer_into_reader(remove_vectors_writer)?, {
// docid -> prompt remove_from_user_provided
prompts: writer_into_reader(prompts_writer)?, } else {
Default::default()
};
results.push(ExtractedVectorPoints {
manual_vectors: writer_into_reader(manual_vectors_writer)?,
remove_vectors: writer_into_reader(remove_vectors_writer)?,
prompts: writer_into_reader(prompts_writer)?,
embedder, embedder,
embedder_name, embedder_name,
add_to_user_provided,
remove_from_user_provided,
}) })
} }
Ok(results) Ok(results)
} }
/// Computes the diff between both Del and Add numbers and fn extract_vector_document_diff(
/// only inserts the parts that differ in the sorter. docid: DocumentId,
obkv: obkv::KvReader<'_, FieldId>,
prompt: &Prompt,
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
(old, new): (VectorState, VectorState),
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
document_id: impl Fn() -> Value,
) -> Result<VectorStateDelta> {
match (old.must_regenerate(), new.must_regenerate()) {
(true, true) | (false, false) => {}
(true, false) => {
add_to_user_provided.insert(docid);
}
(false, true) => {
remove_from_user_provided.insert(docid);
}
}
let delta = match (old, new) {
// regardless of the previous state, if a document now contains inline _vectors, they must
// be extracted manually
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
Some(add_vectors) => {
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::NowManual(add_vectors)
}
None => VectorStateDelta::NoChange,
},
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
// document changed
(VectorState::Generated, VectorState::Generated) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(&prompt).map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
});
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
}
} else {
VectorStateDelta::NowRemoved
}
}
// inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from
// the previous version of the document.
// Manual -> Generated is also not possible without an Inline to the right (which is handled above)
// Generated -> Generated is handled above, so not possible
// As a result, this code is unreachable
(_not_generated, VectorState::Generated) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
// make sure the document is always removed from user provided on removal
remove_from_user_provided.insert(docid);
VectorStateDelta::NowRemoved
}
}
// inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous
// version of the document.
// however the Rust type system cannot know that.
(_manual, VectorState::Manual) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// if the new version of documents has the vectors in the DB,
// then they are user-provided and nothing possibly changed
VectorStateDelta::NoChange
} else {
// make sure the document is always removed from user provided on removal
remove_from_user_provided.insert(docid);
VectorStateDelta::NowRemoved
}
}
};
Ok(delta)
}
/// Renders the old and the new prompt of a document and requests a new embedding only when
/// the two rendered texts differ.
///
/// The old text is rendered with `old_prompt` on the `Deletion` side of `obkv` using
/// `old_fields_ids_map`; the new text is rendered with `new_prompt` on the `Addition` side
/// using `new_fields_ids_map`.
///
/// # Errors
///
/// Returns the error raised while rendering the new prompt. A failure while rendering the old
/// prompt is not reported: the old text then falls back to its default value.
fn regenerate_if_prompt_changed(
    obkv: obkv::KvReader<'_, FieldId>,
    (old_prompt, new_prompt): (&Prompt, &Prompt),
    (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
) -> Result<VectorStateDelta> {
    let previous_text =
        old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default();
    let current_text = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;

    let delta = if current_text == previous_text {
        // same rendered text: the existing embedding is still valid
        VectorStateDelta::NoChange
    } else {
        VectorStateDelta::NowGenerated(current_text)
    };

    Ok(delta)
}
/// Unconditionally renders `prompt` on the `Addition` side of `obkv` and requests the
/// generation of an embedding from the rendered text.
///
/// # Errors
///
/// Returns the error raised while rendering the prompt.
fn regenerate_prompt(
    obkv: obkv::KvReader<'_, FieldId>,
    prompt: &Prompt,
    new_fields_ids_map: &FieldsIdsMap,
) -> Result<VectorStateDelta> {
    Ok(VectorStateDelta::NowGenerated(prompt.render(
        obkv,
        DelAdd::Addition,
        new_fields_ids_map,
    )?))
}
/// We cannot compute the diff between both Del and Add vectors.
/// We'll push every vector and compute the difference later in TypedChunk.
fn push_vectors_diff( fn push_vectors_diff(
remove_vectors_writer: &mut Writer<BufWriter<File>>, remove_vectors_writer: &mut Writer<BufWriter<File>>,
prompts_writer: &mut Writer<BufWriter<File>>, prompts_writer: &mut Writer<BufWriter<File>>,
manual_vectors_writer: &mut Writer<BufWriter<File>>, manual_vectors_writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>, key_buffer: &mut Vec<u8>,
delta: VectorStateDelta, delta: VectorStateDelta,
reindex_vectors: bool,
) -> Result<()> { ) -> Result<()> {
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); let (must_remove, prompt, mut add_vectors) = delta.into_values();
if must_remove if must_remove {
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
&& !reindex_vectors
{
key_buffer.truncate(TRUNCATE_SIZE); key_buffer.truncate(TRUNCATE_SIZE);
remove_vectors_writer.insert(&key_buffer, [])?; remove_vectors_writer.insert(&key_buffer, [])?;
} }
@ -308,44 +515,22 @@ fn push_vectors_diff(
} }
// We sort and dedup the vectors // We sort and dedup the vectors
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
let merged_vectors_iter =
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
// insert vectors into the writer // insert vectors into the writer
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
// Generate the key by extending the unique index to it. // Generate the key by extending the unique index to it.
key_buffer.truncate(TRUNCATE_SIZE); key_buffer.truncate(TRUNCATE_SIZE);
let index = u16::try_from(i).unwrap(); let index = u16::try_from(i).unwrap();
key_buffer.extend_from_slice(&index.to_be_bytes()); key_buffer.extend_from_slice(&index.to_be_bytes());
match eob { // We insert only the Add part of the Obkv to inform
EitherOrBoth::Both(_, _) => (), // no need to touch anything // that we only want to remove all those vectors.
EitherOrBoth::Left(vector) => { let mut obkv = KvWriterDelAdd::memory();
// TODO: the below condition works because we erase the vec database when a embedding setting changes. obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
// When vector pipeline will be optimized, this should be removed. let bytes = obkv.into_inner()?;
if !reindex_vectors { manual_vectors_writer.insert(&key_buffer, bytes)?;
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
}
EitherOrBoth::Right(vector) => {
// We insert only the Add part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
}
} }
Ok(()) Ok(())

View File

@ -30,6 +30,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
use self::extract_word_position_docids::extract_word_position_docids; use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk}; use super::{helpers, TypedChunk};
use crate::index::IndexEmbeddingConfig;
use crate::update::settings::InnerIndexSettingsDiff; use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
@ -43,6 +44,7 @@ pub(crate) fn data_from_obkv_documents(
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId, primary_key_id: FieldId,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<()> { ) -> Result<()> {
@ -55,6 +57,7 @@ pub(crate) fn data_from_obkv_documents(
original_documents_chunk, original_documents_chunk,
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
embedders_configs.clone(),
settings_diff.clone(), settings_diff.clone(),
) )
}) })
@ -210,6 +213,7 @@ fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>, original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
) -> Result<()> { ) -> Result<()> {
let original_documents_chunk = let original_documents_chunk =
@ -226,11 +230,17 @@ fn send_original_documents_data(
if index_vectors { if index_vectors {
let settings_diff = settings_diff.clone(); let settings_diff = settings_diff.clone();
let embedders_configs = embedders_configs.clone();
let original_documents_chunk = original_documents_chunk.clone(); let original_documents_chunk = original_documents_chunk.clone();
let lmdb_writer_sx = lmdb_writer_sx.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || { rayon::spawn(move || {
match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) { match extract_vector_points(
original_documents_chunk.clone(),
indexer,
&embedders_configs,
&settings_diff,
) {
Ok(extracted_vectors) => { Ok(extracted_vectors) => {
for ExtractedVectorPoints { for ExtractedVectorPoints {
manual_vectors, manual_vectors,
@ -238,6 +248,8 @@ fn send_original_documents_data(
prompts, prompts,
embedder_name, embedder_name,
embedder, embedder,
add_to_user_provided,
remove_from_user_provided,
} in extracted_vectors } in extracted_vectors
{ {
let embeddings = match extract_embeddings( let embeddings = match extract_embeddings(
@ -262,6 +274,8 @@ fn send_original_documents_data(
expected_dimension: embedder.dimensions(), expected_dimension: embedder.dimensions(),
manual_vectors, manual_vectors,
embedder_name, embedder_name,
add_to_user_provided,
remove_from_user_provided,
})); }));
} }
} }

View File

@ -286,6 +286,7 @@ where
settings_diff.new.recompute_searchables(self.wtxn, self.index)?; settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
let settings_diff = Arc::new(settings_diff); let settings_diff = Arc::new(settings_diff);
let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
let backup_pool; let backup_pool;
let pool = match self.indexer_config.thread_pool { let pool = match self.indexer_config.thread_pool {
@ -399,6 +400,7 @@ where
pool_params, pool_params,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
primary_key_id, primary_key_id,
embedders_configs.clone(),
settings_diff_cloned, settings_diff_cloned,
max_positions_per_attributes, max_positions_per_attributes,
) )
@ -501,6 +503,8 @@ where
embeddings, embeddings,
manual_vectors, manual_vectors,
embedder_name, embedder_name,
add_to_user_provided,
remove_from_user_provided,
} => { } => {
dimension.insert(embedder_name.clone(), expected_dimension); dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints { TypedChunk::VectorPoints {
@ -509,6 +513,8 @@ where
expected_dimension, expected_dimension,
manual_vectors, manual_vectors,
embedder_name, embedder_name,
add_to_user_provided,
remove_from_user_provided,
} }
} }
otherwise => otherwise, otherwise => otherwise,
@ -781,6 +787,7 @@ mod tests {
use super::*; use super::*;
use crate::documents::documents_batch_reader_from_objects; use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
use crate::index::IndexEmbeddingConfig;
use crate::search::TermsMatchingStrategy; use crate::search::TermsMatchingStrategy;
use crate::update::Setting; use crate::update::Setting;
use crate::{db_snap, Filter, Search}; use crate::{db_snap, Filter, Search};
@ -2616,10 +2623,12 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
let (embedder_name, embedder) = embedding_configs.pop().unwrap(); let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } =
embedding_configs.pop().unwrap();
insta::assert_snapshot!(embedder_name, @"manual");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>");
let embedder = let embedder =
std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
assert_eq!("manual", embedder_name);
let res = index let res = index
.search(&rtxn) .search(&rtxn)
.semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))

View File

@ -1,7 +1,7 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::btree_map::Entry as BEntry; use std::collections::btree_map::Entry as BEntry;
use std::collections::hash_map::Entry as HEntry; use std::collections::hash_map::Entry as HEntry;
use std::collections::{HashMap, HashSet}; use std::collections::{BTreeMap, HashMap, HashSet};
use std::fs::File; use std::fs::File;
use std::io::{Read, Seek}; use std::io::{Read, Seek};
@ -27,6 +27,8 @@ use crate::update::del_add::{
use crate::update::index_documents::GrenadParameters; use crate::update::index_documents::GrenadParameters;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
use crate::{ use crate::{
is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
}; };
@ -51,7 +53,6 @@ pub struct Transform<'a, 'i> {
fields_ids_map: FieldsIdsMap, fields_ids_map: FieldsIdsMap,
indexer_settings: &'a IndexerConfig, indexer_settings: &'a IndexerConfig,
pub autogenerate_docids: bool,
pub index_documents_method: IndexDocumentsMethod, pub index_documents_method: IndexDocumentsMethod,
available_documents_ids: AvailableDocumentsIds, available_documents_ids: AvailableDocumentsIds,
@ -105,7 +106,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index: &'i Index, index: &'i Index,
indexer_settings: &'a IndexerConfig, indexer_settings: &'a IndexerConfig,
index_documents_method: IndexDocumentsMethod, index_documents_method: IndexDocumentsMethod,
autogenerate_docids: bool, _autogenerate_docids: bool,
) -> Result<Self> { ) -> Result<Self> {
// We must choose the appropriate merge function for when two or more documents // We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch. // with the same user id must be merged or fully replaced in the same batch.
@ -139,7 +140,6 @@ impl<'a, 'i> Transform<'a, 'i> {
index, index,
fields_ids_map: index.fields_ids_map(wtxn)?, fields_ids_map: index.fields_ids_map(wtxn)?,
indexer_settings, indexer_settings,
autogenerate_docids,
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
original_sorter, original_sorter,
flattened_sorter, flattened_sorter,
@ -808,13 +808,13 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut new_inner_settings = old_inner_settings.clone(); let mut new_inner_settings = old_inner_settings.clone();
new_inner_settings.fields_ids_map = fields_ids_map; new_inner_settings.fields_ids_map = fields_ids_map;
let embedding_configs_updated = false; let embedding_config_updates = Default::default();
let settings_update_only = false; let settings_update_only = false;
let settings_diff = InnerIndexSettingsDiff::new( let settings_diff = InnerIndexSettingsDiff::new(
old_inner_settings, old_inner_settings,
new_inner_settings, new_inner_settings,
primary_key_id, primary_key_id,
embedding_configs_updated, embedding_config_updates,
settings_update_only, settings_update_only,
); );
@ -835,10 +835,13 @@ impl<'a, 'i> Transform<'a, 'i> {
/// Rebind the field_ids of the provided document to their values /// Rebind the field_ids of the provided document to their values
/// based on the field_ids_maps difference between the old and the new settings, /// based on the field_ids_maps difference between the old and the new settings,
/// then fill the provided buffers with delta documents using KvWritterDelAdd. /// then fill the provided buffers with delta documents using KvWritterDelAdd.
#[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo
fn rebind_existing_document( fn rebind_existing_document(
old_obkv: KvReader<FieldId>, old_obkv: KvReader<FieldId>,
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
modified_faceted_fields: &HashSet<String>, modified_faceted_fields: &HashSet<String>,
mut injected_vectors: serde_json::Map<String, serde_json::Value>,
old_vectors_fid: Option<FieldId>,
original_obkv_buffer: Option<&mut Vec<u8>>, original_obkv_buffer: Option<&mut Vec<u8>>,
flattened_obkv_buffer: Option<&mut Vec<u8>>, flattened_obkv_buffer: Option<&mut Vec<u8>>,
) -> Result<()> { ) -> Result<()> {
@ -861,9 +864,49 @@ impl<'a, 'i> Transform<'a, 'i> {
// The operations that we must perform on the different fields. // The operations that we must perform on the different fields.
let mut operations = HashMap::new(); let mut operations = HashMap::new();
let mut error_seen = false;
let mut obkv_writer = KvWriter::<_, FieldId>::memory(); let mut obkv_writer = KvWriter::<_, FieldId>::memory();
for (id, val) in old_obkv.iter() { 'write_fid: for (id, val) in old_obkv.iter() {
if !injected_vectors.is_empty() {
'inject_vectors: {
let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors };
if id < vectors_fid {
break 'inject_vectors;
}
let mut existing_vectors = if id == vectors_fid {
let existing_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
serde_json::Error,
> = serde_json::from_slice(val);
match existing_vectors {
Ok(existing_vectors) => existing_vectors,
Err(error) => {
if !error_seen {
tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map");
error_seen = true;
}
Default::default()
}
}
} else {
Default::default()
};
existing_vectors.append(&mut injected_vectors);
operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition);
obkv_writer
.insert(vectors_fid, serde_json::to_vec(&existing_vectors).unwrap())?;
if id == vectors_fid {
continue 'write_fid;
}
}
}
if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors {
operations.insert(id, DelAddOperation::DeletionAndAddition); operations.insert(id, DelAddOperation::DeletionAndAddition);
obkv_writer.insert(id, val)?; obkv_writer.insert(id, val)?;
@ -872,6 +915,15 @@ impl<'a, 'i> Transform<'a, 'i> {
obkv_writer.insert(id, val)?; obkv_writer.insert(id, val)?;
} }
} }
if !injected_vectors.is_empty() {
'inject_vectors: {
let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors };
operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition);
obkv_writer.insert(vectors_fid, serde_json::to_vec(&injected_vectors).unwrap())?;
}
}
let data = obkv_writer.into_inner()?; let data = obkv_writer.into_inner()?;
let obkv = KvReader::<FieldId>::new(&data); let obkv = KvReader::<FieldId>::new(&data);
@ -937,6 +989,35 @@ impl<'a, 'i> Transform<'a, 'i> {
None None
}; };
let readers: Result<
BTreeMap<&str, (Vec<arroy::Reader<arroy::distances::Angular>>, &RoaringBitmap)>,
> = settings_diff
.embedding_config_updates
.iter()
.filter_map(|(name, action)| {
if let EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
embedder_id,
user_provided,
}) = action
{
let readers: Result<Vec<_>> =
self.index.arroy_readers(wtxn, *embedder_id).collect();
match readers {
Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))),
Err(error) => Some(Err(error)),
}
} else {
None
}
})
.collect();
let readers = readers?;
let old_vectors_fid = settings_diff
.old
.fields_ids_map
.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
// We initialize the sorter with the user indexing settings. // We initialize the sorter with the user indexing settings.
let mut flattened_sorter = let mut flattened_sorter =
if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { if settings_diff.reindex_searchable() || settings_diff.reindex_facets() {
@ -963,10 +1044,50 @@ impl<'a, 'i> Transform<'a, 'i> {
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?; )?;
let injected_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
arroy::Error,
> = readers
.iter()
.filter_map(|(name, (readers, user_provided))| {
if !user_provided.contains(docid) {
return None;
}
let mut vectors = Vec::new();
for reader in readers {
let Some(vector) = reader.item_vector(wtxn, docid).transpose() else {
break;
};
match vector {
Ok(vector) => vectors.push(vector),
Err(error) => return Some(Err(error)),
}
}
if vectors.is_empty() {
return None;
}
Some(Ok((
name.to_string(),
serde_json::to_value(ExplicitVectors {
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
vectors,
)),
regenerate: false,
})
.unwrap(),
)))
})
.collect();
let injected_vectors = injected_vectors?;
Self::rebind_existing_document( Self::rebind_existing_document(
old_obkv, old_obkv,
&settings_diff, &settings_diff,
&modified_faceted_fields, &modified_faceted_fields,
injected_vectors,
old_vectors_fid,
Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()),
Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()),
)?; )?;
@ -983,6 +1104,23 @@ impl<'a, 'i> Transform<'a, 'i> {
} }
} }
let mut writers = Vec::new();
// delete all vectors from the embedders that need removal
for (_, (readers, _)) in readers {
for reader in readers {
let dimensions = reader.dimensions();
let arroy_index = reader.index();
drop(reader);
let writer = arroy::Writer::new(self.index.vector_arroy, arroy_index, dimensions);
writers.push(writer);
}
}
for writer in writers {
writer.clear(wtxn)?;
}
let grenad_params = GrenadParameters { let grenad_params = GrenadParameters {
chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_type: self.indexer_settings.chunk_compression_type,
chunk_compression_level: self.indexer_settings.chunk_compression_level, chunk_compression_level: self.indexer_settings.chunk_compression_level,

View File

@ -20,6 +20,7 @@ use super::MergeFn;
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::index::db_name::DOCUMENTS; use crate::index::db_name::DOCUMENTS;
use crate::index::IndexEmbeddingConfig;
use crate::proximity::MAX_DISTANCE; use crate::proximity::MAX_DISTANCE;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
use crate::update::facet::FacetsUpdate; use crate::update::facet::FacetsUpdate;
@ -90,6 +91,8 @@ pub(crate) enum TypedChunk {
expected_dimension: usize, expected_dimension: usize,
manual_vectors: grenad::Reader<BufReader<File>>, manual_vectors: grenad::Reader<BufReader<File>>,
embedder_name: String, embedder_name: String,
add_to_user_provided: RoaringBitmap,
remove_from_user_provided: RoaringBitmap,
}, },
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
} }
@ -154,8 +157,11 @@ pub(crate) fn write_typed_chunk_into_index(
let mut docids = index.documents_ids(wtxn)?; let mut docids = index.documents_ids(wtxn)?;
let mut iter = merger.into_stream_merger_iter()?; let mut iter = merger.into_stream_merger_iter()?;
let embedders: BTreeSet<_> = let embedders: BTreeSet<_> = index
index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); .embedding_configs(wtxn)?
.into_iter()
.map(|IndexEmbeddingConfig { name, .. }| name)
.collect();
let mut vectors_buffer = Vec::new(); let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? { while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -181,7 +187,7 @@ pub(crate) fn write_typed_chunk_into_index(
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
break 'vectors Some(addition); break 'vectors Some(addition);
}; };
vectors.retain_user_provided_vectors(&embedders); vectors.retain_not_embedded_vectors(&embedders);
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
if vectors.is_empty() { if vectors.is_empty() {
// skip writing empty `_vectors` map // skip writing empty `_vectors` map
@ -619,6 +625,8 @@ pub(crate) fn write_typed_chunk_into_index(
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
let mut add_to_user_provided = RoaringBitmap::new();
let mut remove_from_user_provided = RoaringBitmap::new();
let mut params = None; let mut params = None;
for typed_chunk in typed_chunks { for typed_chunk in typed_chunks {
let TypedChunk::VectorPoints { let TypedChunk::VectorPoints {
@ -627,6 +635,8 @@ pub(crate) fn write_typed_chunk_into_index(
embeddings, embeddings,
expected_dimension, expected_dimension,
embedder_name, embedder_name,
add_to_user_provided: aud,
remove_from_user_provided: rud,
} = typed_chunk } = typed_chunk
else { else {
unreachable!(); unreachable!();
@ -639,11 +649,23 @@ pub(crate) fn write_typed_chunk_into_index(
if let Some(embeddings) = embeddings { if let Some(embeddings) = embeddings {
embeddings_builder.push(embeddings.into_cursor()?); embeddings_builder.push(embeddings.into_cursor()?);
} }
add_to_user_provided |= aud;
remove_from_user_provided |= rud;
} }
// typed chunks has always at least 1 chunk. // typed chunks has always at least 1 chunk.
let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
let mut embedding_configs = index.embedding_configs(wtxn)?;
let index_embedder_config = embedding_configs
.iter_mut()
.find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
.unwrap();
index_embedder_config.user_provided -= remove_from_user_provided;
index_embedder_config.user_provided |= add_to_user_provided;
index.put_embedding_configs(wtxn, embedding_configs)?;
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
)?; )?;

View File

@ -6,6 +6,7 @@ use std::sync::Arc;
use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use charabia::{Normalize, Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, Deserr}; use deserr::{DeserializeError, Deserr};
use itertools::{EitherOrBoth, Itertools}; use itertools::{EitherOrBoth, Itertools};
use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime; use time::OffsetDateTime;
@ -14,12 +15,18 @@ use super::index_documents::{IndexDocumentsConfig, Transform};
use super::IndexerConfig; use super::IndexerConfig;
use crate::criterion::Criterion; use crate::criterion::Criterion;
use crate::error::UserError; use crate::error::UserError;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::index::{
IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
};
use crate::order_by_map::OrderByMap; use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision; use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod; use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::vector::settings::{
check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction,
WriteBackToDocuments,
};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldId, FieldsIdsMap, Index, Result}; use crate::{FieldId, FieldsIdsMap, Index, Result};
@ -490,6 +497,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.index.put_all_searchable_fields_from_fields_ids_map( self.index.put_all_searchable_fields_from_fields_ids_map(
self.wtxn, self.wtxn,
&names, &names,
&fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME),
&fields_ids_map, &fields_ids_map,
)?; )?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
@ -919,92 +927,177 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(changed) Ok(changed)
} }
fn update_embedding_configs(&mut self) -> Result<bool> { fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> {
let update = match std::mem::take(&mut self.embedder_settings) { match std::mem::take(&mut self.embedder_settings) {
Setting::Set(configs) => { Setting::Set(configs) => self.update_embedding_configs_set(configs),
let mut changed = false; Setting::Reset => {
// all vectors should be written back to documents
let old_configs = self.index.embedding_configs(self.wtxn)?; let old_configs = self.index.embedding_configs(self.wtxn)?;
let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> = let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs
old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect();
let mut new_configs = BTreeMap::new();
for joined in old_configs
.into_iter() .into_iter()
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) .map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> {
{ let embedder_id =
match joined { self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
// updated config crate::InternalError::DatabaseMissingEntry {
EitherOrBoth::Both((name, mut old), (_, new)) => { db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); key: None,
if changed { },
tracing::debug!(embedder = name, "need reindex"); )?;
} else { Ok((
tracing::debug!(embedder = name, "skip reindex"); name,
} EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
let new = validate_embedding_settings(old, &name)?; embedder_id,
new_configs.insert(name, new); user_provided,
} }),
// unchanged config ))
EitherOrBoth::Left((name, setting)) => {
new_configs.insert(name, setting);
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
// apply the default source in case the source was not set so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(
&mut setting,
);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
changed = true;
new_configs.insert(name, setting);
}
}
}
let new_configs: Vec<(String, EmbeddingConfig)> = new_configs
.into_iter()
.filter_map(|(name, setting)| match setting {
Setting::Set(value) => Some((name, value.into())),
Setting::Reset => None,
Setting::NotSet => Some((name, EmbeddingSettings::default().into())),
}) })
.collect(); .collect();
let remove_all = remove_all?;
self.index.embedder_category_id.clear(self.wtxn)?; self.index.embedder_category_id.clear(self.wtxn)?;
for (index, (embedder_name, _)) in new_configs.iter().enumerate() {
self.index.embedder_category_id.put_with_flags(
self.wtxn,
heed::PutFlags::APPEND,
embedder_name,
&index
.try_into()
.map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?,
)?;
}
if new_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, new_configs)?;
}
changed
}
Setting::Reset => {
self.index.delete_embedding_configs(self.wtxn)?; self.index.delete_embedding_configs(self.wtxn)?;
true Ok(remove_all)
} }
Setting::NotSet => false, Setting::NotSet => Ok(Default::default()),
};
// if any changes force a reindexing
// clear the vector database.
if update {
self.index.vector_arroy.clear(self.wtxn)?;
} }
}
Ok(update) fn update_embedding_configs_set(
&mut self,
configs: BTreeMap<String, Setting<EmbeddingSettings>>,
) -> Result<BTreeMap<String, EmbedderAction>> {
use crate::vector::settings::SettingsDiff;
let old_configs = self.index.embedding_configs(self.wtxn)?;
let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs
.into_iter()
.map(|IndexEmbeddingConfig { name, config, user_provided }| {
(name, (config.into(), user_provided))
})
.collect();
let mut updated_configs = BTreeMap::new();
let mut embedder_actions = BTreeMap::new();
for joined in old_configs
.into_iter()
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
{
match joined {
// updated config
EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => {
let settings_diff = SettingsDiff::from_settings(old, new);
match settings_diff {
SettingsDiff::Remove => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"removing embedder"
);
let embedder_id =
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
crate::InternalError::DatabaseMissingEntry {
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
key: None,
},
)?;
// free id immediately
self.index.embedder_category_id.delete(self.wtxn, &name)?;
embedder_actions.insert(
name,
EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
embedder_id,
user_provided,
}),
);
}
SettingsDiff::Reindex { action, updated_settings } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
?action,
"reindex embedder"
);
embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action));
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
SettingsDiff::UpdateWithoutReindex { updated_settings } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"update without reindex embedder"
);
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
}
}
// unchanged config
EitherOrBoth::Left((name, (setting, user_provided))) => {
tracing::debug!(embedder = name, "unchanged embedder");
updated_configs.insert(name, (Setting::Set(setting), user_provided));
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
tracing::debug!(embedder = name, "new embedder");
// apply the default source in case the source was not set so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
embedder_actions
.insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex));
updated_configs.insert(name, (setting, RoaringBitmap::new()));
}
}
}
let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];
for res in self.index.embedder_category_id.iter(self.wtxn)? {
let (_name, id) = res?;
free_indices[id as usize] = false;
}
let mut free_indices = free_indices.iter_mut().enumerate();
let mut find_free_index =
move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);
for (name, action) in embedder_actions.iter() {
match action {
EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => {
/* cannot be a new embedder, so has to have an id already */
}
EmbedderAction::Reindex(ReindexAction::FullReindex) => {
if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() {
let id = find_free_index()
.ok_or(UserError::TooManyEmbedders(updated_configs.len()))?;
tracing::debug!(embedder = name, id, "assigning free id to new embedder");
self.index.embedder_category_id.put(self.wtxn, name, &id)?;
}
}
EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ }
}
}
let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs
.into_iter()
.filter_map(|(name, (config, user_provided))| match config {
Setting::Set(config) => {
Some(IndexEmbeddingConfig { name, config: config.into(), user_provided })
}
Setting::Reset => None,
Setting::NotSet => Some(IndexEmbeddingConfig {
name,
config: EmbeddingSettings::default().into(),
user_provided,
}),
})
.collect();
if updated_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, updated_configs)?;
}
Ok(embedder_actions)
} }
fn update_search_cutoff(&mut self) -> Result<bool> { fn update_search_cutoff(&mut self) -> Result<bool> {
@ -1058,13 +1151,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_searchable()?; self.update_searchable()?;
self.update_exact_attributes()?; self.update_exact_attributes()?;
self.update_proximity_precision()?; self.update_proximity_precision()?;
// TODO: very rough approximation of the needs for reindexing where any change will result in
// a full reindexing. let embedding_config_updates = self.update_embedding_configs()?;
// What can be done instead:
// 1. Only change the distance on a distance change
// 2. Only change the name -> embedder mapping on a name change
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
let embedding_configs_updated = self.update_embedding_configs()?;
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
new_inner_settings.recompute_facets(self.wtxn, self.index)?; new_inner_settings.recompute_facets(self.wtxn, self.index)?;
@ -1078,7 +1166,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
old_inner_settings, old_inner_settings,
new_inner_settings, new_inner_settings,
primary_key_id, primary_key_id,
embedding_configs_updated, embedding_config_updates,
settings_update_only, settings_update_only,
); );
@ -1094,8 +1182,7 @@ pub struct InnerIndexSettingsDiff {
pub(crate) old: InnerIndexSettings, pub(crate) old: InnerIndexSettings,
pub(crate) new: InnerIndexSettings, pub(crate) new: InnerIndexSettings,
pub(crate) primary_key_id: Option<FieldId>, pub(crate) primary_key_id: Option<FieldId>,
// TODO: compare directly the embedders. pub(crate) embedding_config_updates: BTreeMap<String, EmbedderAction>,
pub(crate) embedding_configs_updated: bool,
pub(crate) settings_update_only: bool, pub(crate) settings_update_only: bool,
/// The set of only the additional searchable fields. /// The set of only the additional searchable fields.
/// If any other searchable field has been modified, is set to None. /// If any other searchable field has been modified, is set to None.
@ -1116,7 +1203,7 @@ impl InnerIndexSettingsDiff {
old_settings: InnerIndexSettings, old_settings: InnerIndexSettings,
new_settings: InnerIndexSettings, new_settings: InnerIndexSettings,
primary_key_id: Option<FieldId>, primary_key_id: Option<FieldId>,
embedding_configs_updated: bool, embedding_config_updates: BTreeMap<String, EmbedderAction>,
settings_update_only: bool, settings_update_only: bool,
) -> Self { ) -> Self {
let only_additional_fields = match ( let only_additional_fields = match (
@ -1153,7 +1240,7 @@ impl InnerIndexSettingsDiff {
old: old_settings, old: old_settings,
new: new_settings, new: new_settings,
primary_key_id, primary_key_id,
embedding_configs_updated, embedding_config_updates,
settings_update_only, settings_update_only,
only_additional_fields, only_additional_fields,
cache_reindex_searchable_without_user_defined, cache_reindex_searchable_without_user_defined,
@ -1220,7 +1307,7 @@ impl InnerIndexSettingsDiff {
} }
pub fn reindex_vectors(&self) -> bool { pub fn reindex_vectors(&self) -> bool {
self.embedding_configs_updated !self.embedding_config_updates.is_empty()
} }
pub fn settings_update_only(&self) -> bool { pub fn settings_update_only(&self) -> bool {
@ -1252,6 +1339,8 @@ pub(crate) struct InnerIndexSettings {
pub embedding_configs: EmbeddingConfigs, pub embedding_configs: EmbeddingConfigs,
pub existing_fields: HashSet<String>, pub existing_fields: HashSet<String>,
pub geo_fields_ids: Option<(FieldId, FieldId)>, pub geo_fields_ids: Option<(FieldId, FieldId)>,
pub non_searchable_fields_ids: Vec<FieldId>,
pub non_faceted_fields_ids: Vec<FieldId>,
} }
impl InnerIndexSettings { impl InnerIndexSettings {
@ -1265,8 +1354,8 @@ impl InnerIndexSettings {
let user_defined_searchable_fields = let user_defined_searchable_fields =
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
let searchable_fields_ids = index.searchable_fields_ids(rtxn)?; let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
let faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
let exact_attributes = index.exact_attributes_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?;
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
let embedding_configs = embedders(index.embedding_configs(rtxn)?)?; let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
@ -1294,6 +1383,10 @@ impl InnerIndexSettings {
None => None, None => None,
}; };
let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME);
searchable_fields_ids.retain(|id| !vectors_fids.contains(id));
faceted_fields_ids.retain(|id| !vectors_fids.contains(id));
Ok(Self { Ok(Self {
stop_words, stop_words,
allowed_separators, allowed_separators,
@ -1308,6 +1401,8 @@ impl InnerIndexSettings {
embedding_configs, embedding_configs,
existing_fields, existing_fields,
geo_fields_ids, geo_fields_ids,
non_searchable_fields_ids: vectors_fids.clone(),
non_faceted_fields_ids: vectors_fids.clone(),
}) })
} }
@ -1315,9 +1410,10 @@ impl InnerIndexSettings {
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
let new_facets = self let new_facets = self
.fields_ids_map .fields_ids_map
.names() .iter()
.filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields)) .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid))
.map(|field| field.to_string()) .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields))
.map(|(_fid, field)| field.to_string())
.collect(); .collect();
index.put_faceted_fields(wtxn, &new_facets)?; index.put_faceted_fields(wtxn, &new_facets)?;
@ -1337,6 +1433,7 @@ impl InnerIndexSettings {
index.put_all_searchable_fields_from_fields_ids_map( index.put_all_searchable_fields_from_fields_ids_map(
wtxn, wtxn,
&searchable_fields, &searchable_fields,
&self.non_searchable_fields_ids,
&self.fields_ids_map, &self.fields_ids_map,
)?; )?;
} }
@ -1347,19 +1444,25 @@ impl InnerIndexSettings {
} }
} }
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> { fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs let res: Result<_> = embedding_configs
.into_iter() .into_iter()
.map(|(name, EmbeddingConfig { embedder_options, prompt })| { .map(
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); |IndexEmbeddingConfig {
name,
config: EmbeddingConfig { embedder_options, prompt },
..
}| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
let embedder = Arc::new( let embedder = Arc::new(
Embedder::new(embedder_options.clone()) Embedder::new(embedder_options.clone())
.map_err(crate::vector::Error::from) .map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?, .map_err(crate::Error::from)?,
); );
Ok((name, (embedder, prompt))) Ok((name, (embedder, prompt)))
}) },
)
.collect(); .collect();
res.map(EmbeddingConfigs::new) res.map(EmbeddingConfigs::new)
} }

View File

@ -152,6 +152,10 @@ impl EmbeddingConfigs {
&self.0 &self.0
} }
/// Consumes the `EmbeddingConfigs` and returns the wrapped map by value.
///
/// The map associates a `String` key with an `(Arc<Embedder>, Arc<Prompt>)` pair.
pub fn into_inner(self) -> HashMap<String, (Arc<Embedder>, Arc<Prompt>)> {
self.0
}
/// Get the name of the default embedder configuration. /// Get the name of the default embedder configuration.
/// ///
/// The default embedder is determined as follows: /// The default embedder is determined as follows:

View File

@ -4,8 +4,9 @@ use obkv::KvReader;
use serde_json::{from_slice, Value}; use serde_json::{from_slice, Value};
use super::Embedding; use super::Embedding;
use crate::index::IndexEmbeddingConfig;
use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::{FieldId, InternalError, UserError}; use crate::{DocumentId, FieldId, InternalError, UserError};
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
@ -17,11 +18,20 @@ pub enum Vectors {
} }
impl Vectors { impl Vectors {
pub fn into_array_of_vectors(self) -> Vec<Embedding> { pub fn must_regenerate(&self) -> bool {
match self { match self {
Vectors::ImplicitlyUserProvided(embeddings) Vectors::ImplicitlyUserProvided(_) => false,
| Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { Vectors::Explicit(ExplicitVectors { regenerate, .. }) => *regenerate,
embeddings.into_array_of_vectors().unwrap_or_default() }
}
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
match self {
Vectors::ImplicitlyUserProvided(embeddings) => {
Some(embeddings.into_array_of_vectors().unwrap_or_default())
}
Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => {
embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default())
} }
} }
} }
@ -30,22 +40,46 @@ impl Vectors {
#[derive(serde::Serialize, serde::Deserialize, Debug)] #[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct ExplicitVectors { pub struct ExplicitVectors {
pub embeddings: VectorOrArrayOfVectors, pub embeddings: Option<VectorOrArrayOfVectors>,
pub user_provided: bool, pub regenerate: bool,
}
/// Where the vectors of one embedder come from for a given version of a document.
pub enum VectorState {
/// Vectors were found for this embedder in the document's `_vectors` field.
Inline(Vectors),
/// No inline vectors were found, but the document is registered in the embedder's
/// `user_provided` bitmap.
Manual,
/// Fallback state used when nothing is known about this embedder for the document;
/// such vectors must be regenerated.
Generated,
}
impl VectorState {
pub fn must_regenerate(&self) -> bool {
match self {
VectorState::Inline(vectors) => vectors.must_regenerate(),
VectorState::Manual => false,
VectorState::Generated => true,
}
}
}
pub enum VectorsState {
NoVectorsFid,
NoVectorsFieldInDocument,
Vectors(BTreeMap<String, Vectors>),
} }
pub struct ParsedVectorsDiff { pub struct ParsedVectorsDiff {
pub old: Option<BTreeMap<String, Vectors>>, old: BTreeMap<String, VectorState>,
pub new: Option<BTreeMap<String, Vectors>>, new: VectorsState,
} }
impl ParsedVectorsDiff { impl ParsedVectorsDiff {
pub fn new( pub fn new(
docid: DocumentId,
embedders_configs: &[IndexEmbeddingConfig],
documents_diff: KvReader<'_, FieldId>, documents_diff: KvReader<'_, FieldId>,
old_vectors_fid: Option<FieldId>, old_vectors_fid: Option<FieldId>,
new_vectors_fid: Option<FieldId>, new_vectors_fid: Option<FieldId>,
) -> Result<Self, Error> { ) -> Result<Self, Error> {
let old = match old_vectors_fid let mut old = match old_vectors_fid
.and_then(|vectors_fid| documents_diff.get(vectors_fid)) .and_then(|vectors_fid| documents_diff.get(vectors_fid))
.map(KvReaderDelAdd::new) .map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) .map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
@ -61,19 +95,54 @@ impl ParsedVectorsDiff {
return Err(error); return Err(error);
} }
} }
.flatten(); .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
let new = new_vectors_fid for embedding_config in embedders_configs {
.and_then(|vectors_fid| documents_diff.get(vectors_fid)) if embedding_config.user_provided.contains(docid) {
.map(KvReaderDelAdd::new) old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual);
.map(|obkv| to_vector_map(obkv, DelAdd::Addition)) }
.transpose()? }
.flatten();
let new = 'new: {
let Some(new_vectors_fid) = new_vectors_fid else {
break 'new VectorsState::NoVectorsFid;
};
let Some(bytes) = documents_diff.get(new_vectors_fid) else {
break 'new VectorsState::NoVectorsFieldInDocument;
};
let obkv = KvReaderDelAdd::new(bytes);
match to_vector_map(obkv, DelAdd::Addition)? {
Some(new) => VectorsState::Vectors(new),
None => VectorsState::NoVectorsFieldInDocument,
}
};
Ok(Self { old, new }) Ok(Self { old, new })
} }
pub fn remove(&mut self, embedder_name: &str) -> (Option<Vectors>, Option<Vectors>) { pub fn remove(&mut self, embedder_name: &str) -> (VectorState, VectorState) {
let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated);
let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); let state_from_old = match old {
// assume a userProvided is still userProvided
VectorState::Manual => VectorState::Manual,
// generated is still generated
VectorState::Generated => VectorState::Generated,
// weird case that shouldn't happen, where the previous version of the document
// had inline vectors but they were removed in the new version.
// Since they are not in the new version, we switch to generated
VectorState::Inline(_) => VectorState::Generated,
};
let new = match &mut self.new {
VectorsState::Vectors(new) => {
new.remove(embedder_name).map(VectorState::Inline).unwrap_or(state_from_old)
}
_ =>
// if no `_vectors` field is present in the new document,
// the state depends on the previous version of the document
{
state_from_old
}
};
(old, new) (old, new)
} }
} }
@ -89,15 +158,8 @@ impl ParsedVectors {
Ok(ParsedVectors(value)) Ok(ParsedVectors(value))
} }
pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) { pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
self.0.retain(|k, v| match v { self.0.retain(|k, _v| !embedders.contains(k))
Vectors::ImplicitlyUserProvided(_) => true,
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
*user_provided
// if the embedder is not in the config, then never touch it
|| !embedders.contains(k)
}
});
} }
} }
@ -150,6 +212,22 @@ impl VectorOrArrayOfVectors {
pub fn from_array_of_vectors(array_of_vec: Vec<Embedding>) -> Self { pub fn from_array_of_vectors(array_of_vec: Vec<Embedding>) -> Self {
Self { inner: Some(either::Either::Left(array_of_vec)) } Self { inner: Some(either::Either::Left(array_of_vec)) }
} }
/// Builds a `VectorOrArrayOfVectors` holding a single embedding.
///
/// The embedding is stored in the `Either::Right` side of `inner`.
pub fn from_vector(vec: Embedding) -> Self {
Self { inner: Some(either::Either::Right(vec)) }
}
}
/// Converts a single embedding by delegating to `VectorOrArrayOfVectors::from_vector`.
impl From<Embedding> for VectorOrArrayOfVectors {
fn from(vec: Embedding) -> Self {
Self::from_vector(vec)
}
}
impl From<Vec<Embedding>> for VectorOrArrayOfVectors {
fn from(vec: Vec<Embedding>) -> Self {
Self::from_array_of_vectors(vec)
}
} }
#[cfg(test)] #[cfg(test)]

View File

@ -1,4 +1,5 @@
use deserr::Deserr; use deserr::Deserr;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::rest::InputType; use super::rest::InputType;
@ -72,6 +73,238 @@ pub fn check_unset<T>(
} }
} }
/// Indicates what action should take place during a reindexing operation for an embedder.
///
/// When several setting changes are combined, `FullReindex` takes precedence over
/// `RegeneratePrompts` (see `ReindexAction::push_action`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ReindexAction {
/// An indexing operation should take place for this embedder, keeping existing vectors
/// and checking whether the document template changed or not.
///
/// Requested by `SettingsDiff::from_settings` when only `document_template` changed.
RegeneratePrompts,
/// An indexing operation should take place for all documents for this embedder, removing existing vectors
/// (except userProvided ones).
///
/// Requested by `SettingsDiff::from_settings` when the source, model, revision, dimensions,
/// url, query, input field, path to embeddings, embedding object or input type changed.
FullReindex,
}
/// Outcome of comparing the stored settings of an embedder with a requested update,
/// as computed by `SettingsDiff::from_settings`.
pub enum SettingsDiff {
/// The update is `Setting::Reset`: the embedder is to be removed.
Remove,
/// At least one applied change requires `action`; `updated_settings` holds the merged settings.
Reindex { action: ReindexAction, updated_settings: EmbeddingSettings },
/// No applied change requires a reindex (or the update was `Setting::NotSet`);
/// `updated_settings` holds the settings to keep.
UpdateWithoutReindex { updated_settings: EmbeddingSettings },
}
/// Per-embedder action produced by a settings update.
pub enum EmbedderAction {
/// Emitted for an embedder that is being removed.
// NOTE(review): presumably its vectors are written back into the documents' `_vectors`
// field; the code performing the write-back is not visible here, confirm.
WriteBackToDocuments(WriteBackToDocuments),
/// Emitted for a new embedder, or for an existing one whose settings change requires reindexing.
Reindex(ReindexAction),
}
/// Data carried by `EmbedderAction::WriteBackToDocuments` for an embedder that is being removed.
pub struct WriteBackToDocuments {
/// Id the embedder had in the `embedder_category_id` database before its removal.
pub embedder_id: u8,
/// The `user_provided` bitmap that was stored in the embedder's configuration.
pub user_provided: RoaringBitmap,
}
impl SettingsDiff {
pub fn from_settings(old: EmbeddingSettings, new: Setting<EmbeddingSettings>) -> Self {
match new {
Setting::Set(new) => {
let EmbeddingSettings {
mut source,
mut model,
mut revision,
mut api_key,
mut dimensions,
mut document_template,
mut url,
mut query,
mut input_field,
mut path_to_embeddings,
mut embedding_object,
mut input_type,
mut distribution,
} = old;
let EmbeddingSettings {
source: new_source,
model: new_model,
revision: new_revision,
api_key: new_api_key,
dimensions: new_dimensions,
document_template: new_document_template,
url: new_url,
query: new_query,
input_field: new_input_field,
path_to_embeddings: new_path_to_embeddings,
embedding_object: new_embedding_object,
input_type: new_input_type,
distribution: new_distribution,
} = new;
let mut reindex_action = None;
// **Warning**: do not use short-circuiting || here, we want all these operations applied
if source.apply(new_source) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
// when the source changes, we need to reapply the default settings for the new source
apply_default_for_source(
&source,
&mut model,
&mut revision,
&mut dimensions,
&mut url,
&mut query,
&mut input_field,
&mut path_to_embeddings,
&mut embedding_object,
&mut input_type,
&mut document_template,
)
}
if model.apply(new_model) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if revision.apply(new_revision) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if dimensions.apply(new_dimensions) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if url.apply(new_url) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if query.apply(new_query) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if input_field.apply(new_input_field) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if path_to_embeddings.apply(new_path_to_embeddings) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if embedding_object.apply(new_embedding_object) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if input_type.apply(new_input_type) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if document_template.apply(new_document_template) {
ReindexAction::push_action(
&mut reindex_action,
ReindexAction::RegeneratePrompts,
);
}
distribution.apply(new_distribution);
api_key.apply(new_api_key);
let updated_settings = EmbeddingSettings {
source,
model,
revision,
api_key,
dimensions,
document_template,
url,
query,
input_field,
path_to_embeddings,
embedding_object,
input_type,
distribution,
};
match reindex_action {
Some(action) => Self::Reindex { action, updated_settings },
None => Self::UpdateWithoutReindex { updated_settings },
}
}
Setting::Reset => Self::Remove,
Setting::NotSet => Self::UpdateWithoutReindex { updated_settings: old },
}
}
}
impl ReindexAction {
    /// Folds `other` into the pending reindex action stored in `this`.
    ///
    /// After the call `this` is always `Some`:
    /// - `FullReindex` if either the pending action or `other` is `FullReindex`,
    /// - `RegeneratePrompts` otherwise.
    fn push_action(this: &mut Option<Self>, other: Self) {
        let full_already_pending = matches!(*this, Some(ReindexAction::FullReindex));
        let full_requested = matches!(other, ReindexAction::FullReindex);

        // A full reindex subsumes prompt regeneration, so it wins whenever it shows up.
        let merged = if full_already_pending || full_requested {
            ReindexAction::FullReindex
        } else {
            ReindexAction::RegeneratePrompts
        };
        *this = Some(merged);
    }
}
#[allow(clippy::too_many_arguments)] // private function
fn apply_default_for_source(
source: &Setting<EmbedderSource>,
model: &mut Setting<String>,
revision: &mut Setting<String>,
dimensions: &mut Setting<usize>,
url: &mut Setting<String>,
query: &mut Setting<serde_json::Value>,
input_field: &mut Setting<Vec<String>>,
path_to_embeddings: &mut Setting<Vec<String>>,
embedding_object: &mut Setting<Vec<String>>,
input_type: &mut Setting<InputType>,
document_template: &mut Setting<String>,
) {
match source {
Setting::Set(EmbedderSource::HuggingFace) => {
*model = Setting::Reset;
*revision = Setting::Reset;
*dimensions = Setting::NotSet;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::Ollama) => {
*model = Setting::Reset;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => {
*model = Setting::Reset;
*revision = Setting::NotSet;
*dimensions = Setting::NotSet;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::Rest) => {
*model = Setting::NotSet;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::Reset;
*query = Setting::Reset;
*input_field = Setting::Reset;
*path_to_embeddings = Setting::Reset;
*embedding_object = Setting::Reset;
*input_type = Setting::Reset;
}
Setting::Set(EmbedderSource::UserProvided) => {
*model = Setting::NotSet;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
*document_template = Setting::NotSet;
}
Setting::NotSet => {}
}
}
pub fn check_set<T>( pub fn check_set<T>(
key: &Setting<T>, key: &Setting<T>,
field: &'static str, field: &'static str,
@ -210,66 +443,6 @@ impl EmbeddingSettings {
*model = Setting::Set(openai::EmbeddingModel::default().name().to_owned()) *model = Setting::Set(openai::EmbeddingModel::default().name().to_owned())
} }
} }
/// Applies `new` on top of `old` and reports whether the change calls for a reindex.
///
/// - Both `Set`: every field of `new` is applied to the matching field of `old`. The
///   result is `true` if applying any field other than `distribution` and `api_key`
///   returned `true`; those two are applied but never count.
/// - `Reset` over `Reset`, or `new` being `NotSet`: returns `false`.
/// - Any other combination: returns `true`. `old` is not modified by this function in
///   that case.
pub(crate) fn apply_and_need_reindex(
    old: &mut Setting<EmbeddingSettings>,
    new: Setting<EmbeddingSettings>,
) -> bool {
    match (old, new) {
        (Setting::Set(current), Setting::Set(incoming)) => {
            // Exhaustive destructuring: adding a field to the struct must break this
            // function until the new field is handled.
            let EmbeddingSettings {
                source,
                model,
                revision,
                api_key,
                dimensions,
                document_template,
                url,
                query,
                input_field,
                path_to_embeddings,
                embedding_object,
                input_type,
                distribution,
            } = incoming;

            // Array elements are all evaluated, in order: every `apply` runs even after
            // one of them has already reported a change (no short-circuiting).
            let reindexing_changes = [
                current.source.apply(source),
                current.model.apply(model),
                current.revision.apply(revision),
                current.dimensions.apply(dimensions),
                current.document_template.apply(document_template),
                current.url.apply(url),
                current.query.apply(query),
                current.input_field.apply(input_field),
                current.path_to_embeddings.apply(path_to_embeddings),
                current.embedding_object.apply(embedding_object),
                current.input_type.apply(input_type),
            ];

            // Applied for their effect only; their outcome never triggers a reindex.
            current.distribution.apply(distribution);
            current.api_key.apply(api_key);

            reindexing_changes.contains(&true)
        }
        (Setting::Reset, Setting::Reset) | (_, Setting::NotSet) => false,
        _ => true,
    }
}
} }
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]

View File

@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [
"stream", "stream",
"json", "json",
"rustls-tls", "rustls-tls",
], default_features = false } ], default-features = false }
serde = { version = "1.0.195", features = ["derive"] } serde = { version = "1.0.195", features = ["derive"] }
serde_json = "1.0.111" serde_json = "1.0.111"
sha2 = "0.10.8" sha2 = "0.10.8"