Remove the vectors from the documents database

This commit is contained in:
Tamo 2024-05-22 15:27:09 +02:00
parent 7a84697570
commit 84e498299b
14 changed files with 407 additions and 51 deletions

2
Cargo.lock generated
View File

@ -2455,6 +2455,7 @@ name = "index-scheduler"
version = "1.9.0" version = "1.9.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"arroy",
"big_s", "big_s",
"bincode", "bincode",
"crossbeam", "crossbeam",
@ -2465,6 +2466,7 @@ dependencies = [
"file-store", "file-store",
"flate2", "flate2",
"insta", "insta",
"maplit",
"meili-snap", "meili-snap",
"meilisearch-auth", "meilisearch-auth",
"meilisearch-types", "meilisearch-types",

View File

@ -40,7 +40,9 @@ ureq = "2.9.7"
uuid = { version = "1.6.1", features = ["serde", "v4"] } uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies] [dev-dependencies]
arroy = "0.3.1"
big_s = "1.0.2" big_s = "1.0.2"
crossbeam = "0.8.4" crossbeam = "0.8.4"
insta = { version = "1.34.0", features = ["json", "redactions"] } insta = { version = "1.34.0", features = ["json", "redactions"] }
maplit = "1.0.2"
meili-snap = { path = "../meili-snap" } meili-snap = { path = "../meili-snap" }

View File

@ -1459,11 +1459,11 @@ impl IndexScheduler {
// TODO: consider using a type alias or a struct embedder/template // TODO: consider using a type alias or a struct embedder/template
pub fn embedders( pub fn embedders(
&self, &self,
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>, embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>,
) -> Result<EmbeddingConfigs> { ) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs let res: Result<_> = embedding_configs
.into_iter() .into_iter()
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| { .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| {
let prompt = let prompt =
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
// optimistically return existing embedder // optimistically return existing embedder
@ -1748,6 +1748,9 @@ mod tests {
use meilisearch_types::milli::update::IndexDocumentsMethod::{ use meilisearch_types::milli::update::IndexDocumentsMethod::{
ReplaceDocuments, UpdateDocuments, ReplaceDocuments, UpdateDocuments,
}; };
use meilisearch_types::milli::update::Setting;
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
use meilisearch_types::settings::{Checked, Unchecked};
use meilisearch_types::tasks::IndexSwap; use meilisearch_types::tasks::IndexSwap;
use meilisearch_types::VERSION_FILE_NAME; use meilisearch_types::VERSION_FILE_NAME;
use tempfile::{NamedTempFile, TempDir}; use tempfile::{NamedTempFile, TempDir};
@ -3052,7 +3055,9 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let configs = index.embedding_configs(&rtxn).unwrap(); let configs = index.embedding_configs(&rtxn).unwrap();
let (_, embedding_config) = configs.first().unwrap(); let (name, embedding_config, user_provided) = configs.first().unwrap();
insta::assert_snapshot!(name, @"default");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(embedding_config.embedder_options); insta::assert_json_snapshot!(embedding_config.embedder_options);
} }
@ -5017,13 +5022,15 @@ mod tests {
let configs = index.embedding_configs(&rtxn).unwrap(); let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below // for consistency with the below
#[allow(clippy::get_first)] #[allow(clippy::get_first)]
let (name, fakerest_config) = configs.get(0).unwrap(); let (name, fakerest_config, user_provided) = configs.get(0).unwrap();
insta::assert_json_snapshot!(name, @r###""A_fakerest""###); insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(fakerest_config.embedder_options); insta::assert_json_snapshot!(fakerest_config.embedder_options);
let fakerest_name = name.clone(); let fakerest_name = name.clone();
let (name, simple_hf_config) = configs.get(1).unwrap(); let (name, simple_hf_config, user_provided) = configs.get(1).unwrap();
insta::assert_json_snapshot!(name, @r###""B_small_hf""###); insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(simple_hf_config.embedder_options); insta::assert_json_snapshot!(simple_hf_config.embedder_options);
let simple_hf_name = name.clone(); let simple_hf_name = name.clone();
@ -5091,6 +5098,18 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap(); let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
// Ensure the document has been inserted into the relevant bitmap
let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below
#[allow(clippy::get_first)]
let (name, _config, user_defined) = configs.get(0).unwrap();
insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
let (name, _config, user_defined) = configs.get(1).unwrap();
insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>");
let embeddings = index.embeddings(&rtxn, 0).unwrap(); let embeddings = index.embeddings(&rtxn, 0).unwrap();
assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
@ -5153,6 +5172,18 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap(); let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
// Ensure the document has been inserted into the relevant bitmap
let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below
#[allow(clippy::get_first)]
let (name, _config, user_defined) = configs.get(0).unwrap();
insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
let (name, _config, user_defined) = configs.get(1).unwrap();
insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>");
let embeddings = index.embeddings(&rtxn, 0).unwrap(); let embeddings = index.embeddings(&rtxn, 0).unwrap();
// automatically changed to patou // automatically changed to patou
@ -5176,4 +5207,246 @@ mod tests {
} }
} }
} }
/// Pushes documents carrying `_vectors` before any embedder is configured, then declares
/// the embedder through a settings update and checks how the pre-existing vectors are handled.
///
/// Steps:
/// 1. Add five documents; some carry vectors for `my_doggo_embedder` (implicitly user provided,
///    explicitly user provided, explicitly NOT user provided, or without embeddings) and some
///    carry vectors for an embedder that is never configured (`unknown embedder`).
/// 2. Configure `my_doggo_embedder` and check the documents database, the user-provided
///    bitmap stored next to the embedder configuration, and the stored embeddings.
/// 3. Update documents 3 and 4 and check that the embedding of document 3 changed.
#[test]
fn import_vectors_first_and_embedder_later() {
    let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);

    let content = serde_json::json!(
        [
            {
                "id": 0,
                "doggo": "kefir",
            },
            {
                "id": 1,
                "doggo": "intel",
                "_vectors": {
                    "my_doggo_embedder": vec![1; 384],
                    "unknown embedder": vec![1, 2, 3],
                }
            },
            {
                "id": 2,
                "doggo": "max",
                "_vectors": {
                    "my_doggo_embedder": {
                        "userProvided": true,
                        "embeddings": vec![2; 384],
                    },
                    "unknown embedder": vec![4, 5],
                },
            },
            {
                "id": 3,
                "doggo": "marcel",
                "_vectors": {
                    "my_doggo_embedder": {
                        "userProvided": false,
                        "embeddings": vec![3; 384],
                    },
                },
            },
            {
                "id": 4,
                "doggo": "sora",
                "_vectors": {
                    "my_doggo_embedder": {
                        "userProvided": false,
                    },
                },
            },
        ]
    );

    let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
    let documents_count =
        read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
            .unwrap();
    snapshot!(documents_count, @"5");
    file.persist().unwrap();

    index_scheduler
        .register(
            KindWithContent::DocumentAdditionOrUpdate {
                index_uid: S("doggos"),
                primary_key: None,
                method: ReplaceDocuments,
                content_file: uuid,
                documents_count,
                allow_index_creation: true,
            },
            None,
            false,
        )
        .unwrap();
    index_scheduler.assert_internally_consistent();
    handle.advance_one_successful_batch();
    index_scheduler.assert_internally_consistent();

    let index = index_scheduler.index("doggos").unwrap();
    let rtxn = index.read_txn().unwrap();
    let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let field_ids = field_ids_map.ids().collect::<Vec<_>>();
    let documents = index
        .all_documents(&rtxn)
        .unwrap()
        .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
        .collect::<Vec<_>>();
    snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push");

    let mut setting = meilisearch_types::settings::Settings::<Unchecked>::default();
    setting.embedders = Setting::Set(maplit::btreemap! {
        S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
            source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
            model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
            revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
            document_template: Setting::Set(S("{{doc.doggo}}")),
            ..EmbeddingSettings::default()
        })
    });
    index_scheduler
        .register(
            KindWithContent::SettingsUpdate {
                index_uid: S("doggos"),
                new_settings: Box::new(setting),
                is_deletion: false,
                allow_index_creation: false,
            },
            None,
            false,
        )
        .unwrap();
    index_scheduler.assert_internally_consistent();
    handle.advance_one_successful_batch();
    index_scheduler.assert_internally_consistent();

    let index = index_scheduler.index("doggos").unwrap();
    let rtxn = index.read_txn().unwrap();
    let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let field_ids = field_ids_map.ids().collect::<Vec<_>>();
    let documents = index
        .all_documents(&rtxn)
        .unwrap()
        .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
        .collect::<Vec<_>>();
    // All the vectors linked to the newly specified embedder have been removed:
    // only the vectors of the unknown embedder stay in the document DB.
    snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###);

    let conf = index.embedding_configs(&rtxn).unwrap();
    // Even though we specified the vector for the ID 3, it shouldn't be marked
    // as user provided since we explicitly marked it as NOT user provided.
    snapshot!(format!("{conf:#?}"), @r###"
    [
        (
            "my_doggo_embedder",
            EmbeddingConfig {
                embedder_options: HuggingFace(
                    EmbedderOptions {
                        model: "sentence-transformers/all-MiniLM-L6-v2",
                        revision: Some(
                            "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
                        ),
                        distribution: None,
                    },
                ),
                prompt: PromptData {
                    template: "{{doc.doggo}}",
                },
            },
            RoaringBitmap<[1, 2]>,
        ),
    ]
    "###);

    // The document with the id 0 had no vector at all: one must have been stored for it.
    let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
    let embeddings = index.embeddings(&rtxn, docid).unwrap();
    let embedding = &embeddings["my_doggo_embedder"];
    assert!(!embedding.is_empty(), "{embedding:?}");

    // The document with the id 3 should keep its original embedding.
    let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
    let mut embeddings = Vec::new();

    for i in 0..=u8::MAX {
        // NOTE(review): the reader index is presumably `(embedder_id << 8) | i` with an
        // embedder id of 0, which is why the plain `i` is used here. To be confirmed.
        let reader = match arroy::Reader::open(&rtxn, u16::from(i), index.vector_arroy) {
            Ok(reader) => reader,
            // No reader was ever written at this index: stop scanning.
            Err(arroy::Error::MissingMetadata) => break,
            Err(e) => panic!("{e:?}"),
        };

        match reader.item_vector(&rtxn, docid).unwrap() {
            Some(embedding) => embeddings.push(embedding),
            None => break,
        }
    }

    snapshot!(embeddings.len(), @"1");
    assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);

    // If we update marcel it should regenerate its embedding automatically
    let content = serde_json::json!(
        [
            {
                "id": 3,
                "doggo": "marvel",
            },
            {
                "id": 4,
                "doggo": "sorry",
            },
        ]
    );

    let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap();
    let documents_count =
        read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
            .unwrap();
    snapshot!(documents_count, @"2");
    file.persist().unwrap();

    index_scheduler
        .register(
            KindWithContent::DocumentAdditionOrUpdate {
                index_uid: S("doggos"),
                primary_key: None,
                method: UpdateDocuments,
                content_file: uuid,
                documents_count,
                allow_index_creation: true,
            },
            None,
            false,
        )
        .unwrap();
    index_scheduler.assert_internally_consistent();
    handle.advance_one_successful_batch();
    index_scheduler.assert_internally_consistent();

    // The previous read transaction was opened before this batch was processed and only
    // sees the state of the index at that time: open a new one to observe the update.
    let index = index_scheduler.index("doggos").unwrap();
    let rtxn = index.read_txn().unwrap();

    // The document with the id 3 should have its original embedding updated.
    let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
    let embeddings = index.embeddings(&rtxn, docid).unwrap();
    let embedding = &embeddings["my_doggo_embedder"];

    assert!(!embedding.is_empty());
    assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);

    // TODO: also assert that the document with the id 4 got an embedding generated for
    // `my_doggo_embedder`; this check is not enabled yet.
}
} }

View File

@ -0,0 +1,4 @@
---
source: index-scheduler/src/lib.rs
---
[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]

View File

@ -672,7 +672,7 @@ pub fn settings(
let embedders: BTreeMap<_, _> = index let embedders: BTreeMap<_, _> = index
.embedding_configs(rtxn)? .embedding_configs(rtxn)?
.into_iter() .into_iter()
.map(|(name, config)| (name, Setting::Set(config.into()))) .map(|(name, config, _)| (name, Setting::Set(config.into())))
.collect(); .collect();
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };

View File

@ -44,7 +44,7 @@ once_cell = "1.19.0"
ordered-float = "4.2.0" ordered-float = "4.2.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] } rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.8.0" rayon = "1.8.0"
roaring = "0.10.2" roaring = { version = "0.10.2", features = ["serde"] }
rstar = { version = "0.11.0", features = ["serde"] } rstar = { version = "0.11.0", features = ["serde"] }
serde = { version = "1.0.195", features = ["derive"] } serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] } serde_json = { version = "1.0.111", features = ["preserve_order"] }

View File

@ -1572,16 +1572,18 @@ impl Index {
Ok(script_language) Ok(script_language)
} }
/// Put the embedding configs:
/// 1. The name of the embedder
/// 2. The configuration option for this embedder
/// 3. The list of documents with a user provided embedding
pub(crate) fn put_embedding_configs( pub(crate) fn put_embedding_configs(
&self, &self,
wtxn: &mut RwTxn<'_>, wtxn: &mut RwTxn<'_>,
configs: Vec<(String, EmbeddingConfig)>, configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
) -> heed::Result<()> { ) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put( self.main
wtxn, .remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig, RoaringBitmap)>>>()
main_key::EMBEDDING_CONFIGS, .put(wtxn, main_key::EMBEDDING_CONFIGS, &configs)
&configs,
)
} }
pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> { pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
@ -1591,10 +1593,10 @@ impl Index {
pub fn embedding_configs( pub fn embedding_configs(
&self, &self,
rtxn: &RoTxn<'_>, rtxn: &RoTxn<'_>,
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> { ) -> Result<Vec<(String, EmbeddingConfig, RoaringBitmap)>> {
Ok(self Ok(self
.main .main
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>() .remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig, RoaringBitmap)>>>()
.get(rtxn, main_key::EMBEDDING_CONFIGS)? .get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default()) .unwrap_or_default())
} }

View File

@ -10,16 +10,16 @@ use bytemuck::cast_slice;
use grenad::Writer; use grenad::Writer;
use itertools::EitherOrBoth; use itertools::EitherOrBoth;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::prompt::Prompt; use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff; use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::Embedder; use crate::vector::Embedder;
use crate::{DocumentId, Result, ThreadPoolNoAbort}; use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort};
/// The length of the elements that are always in the buffer when inserting new values. /// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>(); const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -35,6 +35,8 @@ pub struct ExtractedVectorPoints {
// embedder // embedder
pub embedder_name: String, pub embedder_name: String,
pub embedder: Arc<Embedder>, pub embedder: Arc<Embedder>,
pub user_defined: RoaringBitmap,
pub remove_from_user_defined: RoaringBitmap,
} }
enum VectorStateDelta { enum VectorStateDelta {
@ -80,6 +82,11 @@ struct EmbedderVectorExtractor {
prompts_writer: Writer<BufWriter<File>>, prompts_writer: Writer<BufWriter<File>>,
// (docid) -> () // (docid) -> ()
remove_vectors_writer: Writer<BufWriter<File>>, remove_vectors_writer: Writer<BufWriter<File>>,
// The docids of the documents that contain a user-defined embedding
user_defined: RoaringBitmap,
// The docids of the documents that contain an auto-generated embedding
remove_from_user_defined: RoaringBitmap,
} }
/// Extracts the embedding vector contained in each document under the `_vectors` field. /// Extracts the embedding vector contained in each document under the `_vectors` field.
@ -134,6 +141,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer, manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
user_defined: RoaringBitmap::new(),
remove_from_user_defined: RoaringBitmap::new(),
}); });
} }
@ -141,13 +150,15 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
let mut cursor = obkv_documents.into_cursor()?; let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {
// this must always be serialized as (docid, external_docid); // this must always be serialized as (docid, external_docid);
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
let (docid_bytes, external_id_bytes) = let (docid_bytes, external_id_bytes) =
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap(); try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
debug_assert!(from_utf8(external_id_bytes).is_ok()); debug_assert!(from_utf8(external_id_bytes).is_ok());
let docid = DocumentId::from_be_bytes(docid_bytes);
let obkv = obkv::KvReader::new(value); let obkv = obkv::KvReader::new(value);
key_buffer.clear(); key_buffer.clear();
key_buffer.extend_from_slice(docid_bytes); key_buffer.extend_from_slice(docid_bytes.as_slice());
// since we only need the primary key when we throw an error we create this getter to // since we only need the primary key when we throw an error we create this getter to
// lazily get it when needed // lazily get it when needed
@ -163,10 +174,22 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer, manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
user_defined,
remove_from_user_defined,
} in extractors.iter_mut() } in extractors.iter_mut()
{ {
let delta = match parsed_vectors.remove(embedder_name) { let delta = match parsed_vectors.remove(embedder_name) {
(Some(old), Some(new)) => { (Some(old), Some(new)) => {
match (old.is_user_provided(), new.is_user_provided()) {
(true, true) | (false, false) => (),
(true, false) => {
remove_from_user_defined.insert(docid);
}
(false, true) => {
user_defined.insert(docid);
}
}
// no autogeneration // no autogeneration
let del_vectors = old.into_array_of_vectors(); let del_vectors = old.into_array_of_vectors();
let add_vectors = new.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors();
@ -187,6 +210,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
.map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some()); .any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept { if document_is_kept {
remove_from_user_defined.insert(docid);
// becomes autogenerated // becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render( VectorStateDelta::NowGenerated(prompt.render(
obkv, obkv,
@ -198,6 +222,11 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
} }
} }
(None, Some(new)) => { (None, Some(new)) => {
if new.is_user_provided() {
user_defined.insert(docid);
} else {
remove_from_user_defined.insert(docid);
}
// was possibly autogenerated, remove all vectors for that document // was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) { if add_vectors.len() > usize::from(u8::MAX) {
@ -239,6 +268,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
VectorStateDelta::NoChange VectorStateDelta::NoChange
} }
} else { } else {
remove_from_user_defined.remove(docid);
VectorStateDelta::NowRemoved VectorStateDelta::NowRemoved
} }
} }
@ -265,18 +295,18 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer, manual_vectors_writer,
prompts_writer, prompts_writer,
remove_vectors_writer, remove_vectors_writer,
user_defined,
remove_from_user_defined,
} in extractors } in extractors
{ {
results.push(ExtractedVectorPoints { results.push(ExtractedVectorPoints {
// docid, _index -> KvWriterDelAdd -> Vector
manual_vectors: writer_into_reader(manual_vectors_writer)?, manual_vectors: writer_into_reader(manual_vectors_writer)?,
// docid -> ()
remove_vectors: writer_into_reader(remove_vectors_writer)?, remove_vectors: writer_into_reader(remove_vectors_writer)?,
// docid -> prompt
prompts: writer_into_reader(prompts_writer)?, prompts: writer_into_reader(prompts_writer)?,
embedder, embedder,
embedder_name, embedder_name,
user_defined,
remove_from_user_defined,
}) })
} }

View File

@ -238,6 +238,8 @@ fn send_original_documents_data(
prompts, prompts,
embedder_name, embedder_name,
embedder, embedder,
user_defined,
remove_from_user_defined: auto_generated,
} in extracted_vectors } in extracted_vectors
{ {
let embeddings = match extract_embeddings( let embeddings = match extract_embeddings(
@ -262,6 +264,8 @@ fn send_original_documents_data(
expected_dimension: embedder.dimensions(), expected_dimension: embedder.dimensions(),
manual_vectors, manual_vectors,
embedder_name, embedder_name,
user_defined,
remove_from_user_defined: auto_generated,
})); }));
} }
} }

View File

@ -501,6 +501,8 @@ where
embeddings, embeddings,
manual_vectors, manual_vectors,
embedder_name, embedder_name,
user_defined,
remove_from_user_defined,
} => { } => {
dimension.insert(embedder_name.clone(), expected_dimension); dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints { TypedChunk::VectorPoints {
@ -509,6 +511,8 @@ where
expected_dimension, expected_dimension,
manual_vectors, manual_vectors,
embedder_name, embedder_name,
user_defined,
remove_from_user_defined,
} }
} }
otherwise => otherwise, otherwise => otherwise,
@ -2616,10 +2620,11 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
let (embedder_name, embedder) = embedding_configs.pop().unwrap(); let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap();
insta::assert_snapshot!(embedder_name, @"manual");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>");
let embedder = let embedder =
std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
assert_eq!("manual", embedder_name);
let res = index let res = index
.search(&rtxn) .search(&rtxn)
.semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))

View File

@ -90,6 +90,8 @@ pub(crate) enum TypedChunk {
expected_dimension: usize, expected_dimension: usize,
manual_vectors: grenad::Reader<BufReader<File>>, manual_vectors: grenad::Reader<BufReader<File>>,
embedder_name: String, embedder_name: String,
user_defined: RoaringBitmap,
remove_from_user_defined: RoaringBitmap,
}, },
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
} }
@ -155,7 +157,7 @@ pub(crate) fn write_typed_chunk_into_index(
let mut iter = merger.into_stream_merger_iter()?; let mut iter = merger.into_stream_merger_iter()?;
let embedders: BTreeSet<_> = let embedders: BTreeSet<_> =
index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect();
let mut vectors_buffer = Vec::new(); let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? { while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -181,7 +183,7 @@ pub(crate) fn write_typed_chunk_into_index(
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
break 'vectors Some(addition); break 'vectors Some(addition);
}; };
vectors.retain_user_provided_vectors(&embedders); vectors.retain_not_embedded_vectors(&embedders);
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
if vectors.is_empty() { if vectors.is_empty() {
// skip writing empty `_vectors` map // skip writing empty `_vectors` map
@ -619,6 +621,8 @@ pub(crate) fn write_typed_chunk_into_index(
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
let mut user_defined = RoaringBitmap::new();
let mut remove_from_user_defined = RoaringBitmap::new();
let mut params = None; let mut params = None;
for typed_chunk in typed_chunks { for typed_chunk in typed_chunks {
let TypedChunk::VectorPoints { let TypedChunk::VectorPoints {
@ -627,6 +631,8 @@ pub(crate) fn write_typed_chunk_into_index(
embeddings, embeddings,
expected_dimension, expected_dimension,
embedder_name, embedder_name,
user_defined: ud,
remove_from_user_defined: rud,
} = typed_chunk } = typed_chunk
else { else {
unreachable!(); unreachable!();
@ -639,11 +645,21 @@ pub(crate) fn write_typed_chunk_into_index(
if let Some(embeddings) = embeddings { if let Some(embeddings) = embeddings {
embeddings_builder.push(embeddings.into_cursor()?); embeddings_builder.push(embeddings.into_cursor()?);
} }
user_defined |= ud;
remove_from_user_defined |= rud;
} }
// typed chunks has always at least 1 chunk. // typed chunks has always at least 1 chunk.
let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
let mut embedding_configs = index.embedding_configs(&wtxn)?;
let (_name, _conf, ud) =
embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap();
*ud -= remove_from_user_defined;
*ud |= user_defined;
index.put_embedding_configs(wtxn, embedding_configs)?;
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
)?; )?;

View File

@ -6,6 +6,7 @@ use std::sync::Arc;
use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use charabia::{Normalize, Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, Deserr}; use deserr::{DeserializeError, Deserr};
use itertools::{EitherOrBoth, Itertools}; use itertools::{EitherOrBoth, Itertools};
use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime; use time::OffsetDateTime;
@ -926,8 +927,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Setting::Set(configs) => { Setting::Set(configs) => {
let mut changed = false; let mut changed = false;
let old_configs = self.index.embedding_configs(self.wtxn)?; let old_configs = self.index.embedding_configs(self.wtxn)?;
let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> = let old_configs: BTreeMap<String, (Setting<EmbeddingSettings>, RoaringBitmap)> =
old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect(); old_configs
.into_iter()
.map(|(name, setting, user_defined)| {
(name, (Setting::Set(setting.into()), user_defined))
})
.collect();
let mut new_configs = BTreeMap::new(); let mut new_configs = BTreeMap::new();
for joined in old_configs for joined in old_configs
@ -936,15 +942,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
{ {
match joined { match joined {
// updated config // updated config
EitherOrBoth::Both((name, mut old), (_, new)) => { EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => {
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
if changed { if changed {
tracing::debug!(embedder = name, "need reindex"); tracing::debug!(
embedder = name,
documents = user_defined.len(),
"need reindex"
);
} else { } else {
tracing::debug!(embedder = name, "skip reindex"); tracing::debug!(embedder = name, "skip reindex");
} }
let new = validate_embedding_settings(old, &name)?; let new = validate_embedding_settings(old, &name)?;
new_configs.insert(name, new); new_configs.insert(name, (new, user_defined));
} }
// unchanged config // unchanged config
EitherOrBoth::Left((name, setting)) => { EitherOrBoth::Left((name, setting)) => {
@ -961,21 +971,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
); );
let setting = validate_embedding_settings(setting, &name)?; let setting = validate_embedding_settings(setting, &name)?;
changed = true; changed = true;
new_configs.insert(name, setting); new_configs.insert(name, (setting, RoaringBitmap::new()));
} }
} }
} }
let new_configs: Vec<(String, EmbeddingConfig)> = new_configs let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs
.into_iter() .into_iter()
.filter_map(|(name, setting)| match setting { .filter_map(|(name, (setting, user_defined))| match setting {
Setting::Set(value) => Some((name, value.into())), Setting::Set(settings) => Some((name, settings.into(), user_defined)),
Setting::Reset => None, Setting::Reset => None,
Setting::NotSet => Some((name, EmbeddingSettings::default().into())), Setting::NotSet => {
Some((name, EmbeddingSettings::default().into(), user_defined))
}
}) })
.collect(); .collect();
self.index.embedder_category_id.clear(self.wtxn)?; self.index.embedder_category_id.clear(self.wtxn)?;
for (index, (embedder_name, _)) in new_configs.iter().enumerate() { for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() {
self.index.embedder_category_id.put_with_flags( self.index.embedder_category_id.put_with_flags(
self.wtxn, self.wtxn,
heed::PutFlags::APPEND, heed::PutFlags::APPEND,
@ -1359,10 +1371,12 @@ impl InnerIndexSettings {
} }
} }
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> { fn embedders(
embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs let res: Result<_> = embedding_configs
.into_iter() .into_iter()
.map(|(name, EmbeddingConfig { embedder_options, prompt })| { .map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
let embedder = Arc::new( let embedder = Arc::new(

View File

@ -17,6 +17,13 @@ pub enum Vectors {
} }
impl Vectors { impl Vectors {
pub fn is_user_provided(&self) -> bool {
match self {
Vectors::ImplicitlyUserProvided(_) => true,
Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided,
}
}
pub fn into_array_of_vectors(self) -> Vec<Embedding> { pub fn into_array_of_vectors(self) -> Vec<Embedding> {
match self { match self {
Vectors::ImplicitlyUserProvided(embeddings) Vectors::ImplicitlyUserProvided(embeddings)
@ -89,15 +96,8 @@ impl ParsedVectors {
Ok(ParsedVectors(value)) Ok(ParsedVectors(value))
} }
pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) { pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
self.0.retain(|k, v| match v { self.0.retain(|k, _v| !embedders.contains(k))
Vectors::ImplicitlyUserProvided(_) => true,
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
*user_provided
// if the embedder is not in the config, then never touch it
|| !embedders.contains(k)
}
});
} }
} }