mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-22 11:17:28 +01:00
Remove the vectors from the documents database
This commit is contained in:
parent
7a84697570
commit
84e498299b
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -2455,6 +2455,7 @@ name = "index-scheduler"
|
||||
version = "1.9.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arroy",
|
||||
"big_s",
|
||||
"bincode",
|
||||
"crossbeam",
|
||||
@ -2465,6 +2466,7 @@ dependencies = [
|
||||
"file-store",
|
||||
"flate2",
|
||||
"insta",
|
||||
"maplit",
|
||||
"meili-snap",
|
||||
"meilisearch-auth",
|
||||
"meilisearch-types",
|
||||
|
@ -40,7 +40,9 @@ ureq = "2.9.7"
|
||||
uuid = { version = "1.6.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
arroy = "0.3.1"
|
||||
big_s = "1.0.2"
|
||||
crossbeam = "0.8.4"
|
||||
insta = { version = "1.34.0", features = ["json", "redactions"] }
|
||||
maplit = "1.0.2"
|
||||
meili-snap = { path = "../meili-snap" }
|
||||
|
@ -1459,11 +1459,11 @@ impl IndexScheduler {
|
||||
// TODO: consider using a type alias or a struct embedder/template
|
||||
pub fn embedders(
|
||||
&self,
|
||||
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>,
|
||||
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>,
|
||||
) -> Result<EmbeddingConfigs> {
|
||||
let res: Result<_> = embedding_configs
|
||||
.into_iter()
|
||||
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| {
|
||||
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| {
|
||||
let prompt =
|
||||
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
|
||||
// optimistically return existing embedder
|
||||
@ -1748,6 +1748,9 @@ mod tests {
|
||||
use meilisearch_types::milli::update::IndexDocumentsMethod::{
|
||||
ReplaceDocuments, UpdateDocuments,
|
||||
};
|
||||
use meilisearch_types::milli::update::Setting;
|
||||
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
|
||||
use meilisearch_types::settings::{Checked, Unchecked};
|
||||
use meilisearch_types::tasks::IndexSwap;
|
||||
use meilisearch_types::VERSION_FILE_NAME;
|
||||
use tempfile::{NamedTempFile, TempDir};
|
||||
@ -3052,7 +3055,9 @@ mod tests {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let configs = index.embedding_configs(&rtxn).unwrap();
|
||||
let (_, embedding_config) = configs.first().unwrap();
|
||||
let (name, embedding_config, user_provided) = configs.first().unwrap();
|
||||
insta::assert_snapshot!(name, @"default");
|
||||
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
|
||||
insta::assert_json_snapshot!(embedding_config.embedder_options);
|
||||
}
|
||||
|
||||
@ -5017,13 +5022,15 @@ mod tests {
|
||||
let configs = index.embedding_configs(&rtxn).unwrap();
|
||||
// for consistency with the below
|
||||
#[allow(clippy::get_first)]
|
||||
let (name, fakerest_config) = configs.get(0).unwrap();
|
||||
insta::assert_json_snapshot!(name, @r###""A_fakerest""###);
|
||||
let (name, fakerest_config, user_provided) = configs.get(0).unwrap();
|
||||
insta::assert_snapshot!(name, @"A_fakerest");
|
||||
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
|
||||
insta::assert_json_snapshot!(fakerest_config.embedder_options);
|
||||
let fakerest_name = name.clone();
|
||||
|
||||
let (name, simple_hf_config) = configs.get(1).unwrap();
|
||||
insta::assert_json_snapshot!(name, @r###""B_small_hf""###);
|
||||
let (name, simple_hf_config, user_provided) = configs.get(1).unwrap();
|
||||
insta::assert_snapshot!(name, @"B_small_hf");
|
||||
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
|
||||
insta::assert_json_snapshot!(simple_hf_config.embedder_options);
|
||||
let simple_hf_name = name.clone();
|
||||
|
||||
@ -5091,6 +5098,18 @@ mod tests {
|
||||
let index = index_scheduler.index("doggos").unwrap();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
// Ensure the document has been inserted into the relevant bitmap
|
||||
let configs = index.embedding_configs(&rtxn).unwrap();
|
||||
// for consistency with the below
|
||||
#[allow(clippy::get_first)]
|
||||
let (name, _config, user_defined) = configs.get(0).unwrap();
|
||||
insta::assert_snapshot!(name, @"A_fakerest");
|
||||
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
|
||||
|
||||
let (name, _config, user_defined) = configs.get(1).unwrap();
|
||||
insta::assert_snapshot!(name, @"B_small_hf");
|
||||
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>");
|
||||
|
||||
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
||||
|
||||
assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
|
||||
@ -5153,6 +5172,18 @@ mod tests {
|
||||
let index = index_scheduler.index("doggos").unwrap();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
// Ensure the document has been inserted into the relevant bitmap
|
||||
let configs = index.embedding_configs(&rtxn).unwrap();
|
||||
// for consistency with the below
|
||||
#[allow(clippy::get_first)]
|
||||
let (name, _config, user_defined) = configs.get(0).unwrap();
|
||||
insta::assert_snapshot!(name, @"A_fakerest");
|
||||
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
|
||||
|
||||
let (name, _config, user_defined) = configs.get(1).unwrap();
|
||||
insta::assert_snapshot!(name, @"B_small_hf");
|
||||
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>");
|
||||
|
||||
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
||||
|
||||
// automatically changed to patou
|
||||
@ -5176,4 +5207,246 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn import_vectors_first_and_embedder_later() {
|
||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||
|
||||
let content = serde_json::json!(
|
||||
[
|
||||
{
|
||||
"id": 0,
|
||||
"doggo": "kefir",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"doggo": "intel",
|
||||
"_vectors": {
|
||||
"my_doggo_embedder": vec![1; 384],
|
||||
"unknown embedder": vec![1, 2, 3],
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"doggo": "max",
|
||||
"_vectors": {
|
||||
"my_doggo_embedder": {
|
||||
"userProvided": true,
|
||||
"embeddings": vec![2; 384],
|
||||
},
|
||||
"unknown embedder": vec![4, 5],
|
||||
},
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"doggo": "marcel",
|
||||
"_vectors": {
|
||||
"my_doggo_embedder": {
|
||||
"userProvided": false,
|
||||
"embeddings": vec![3; 384],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"doggo": "sora",
|
||||
"_vectors": {
|
||||
"my_doggo_embedder": {
|
||||
"userProvided": false,
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
);
|
||||
|
||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0 as u128).unwrap();
|
||||
let documents_count =
|
||||
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
|
||||
.unwrap();
|
||||
snapshot!(documents_count, @"5");
|
||||
file.persist().unwrap();
|
||||
|
||||
index_scheduler
|
||||
.register(
|
||||
KindWithContent::DocumentAdditionOrUpdate {
|
||||
index_uid: S("doggos"),
|
||||
primary_key: None,
|
||||
method: ReplaceDocuments,
|
||||
content_file: uuid,
|
||||
documents_count,
|
||||
allow_index_creation: true,
|
||||
},
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
index_scheduler.assert_internally_consistent();
|
||||
handle.advance_one_successful_batch();
|
||||
index_scheduler.assert_internally_consistent();
|
||||
|
||||
let index = index_scheduler.index("doggos").unwrap();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
||||
let documents = index
|
||||
.all_documents(&rtxn)
|
||||
.unwrap()
|
||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push");
|
||||
|
||||
let mut setting = meilisearch_types::settings::Settings::<Unchecked>::default();
|
||||
setting.embedders = Setting::Set(maplit::btreemap! {
|
||||
S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
|
||||
source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
|
||||
model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
|
||||
revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
|
||||
document_template: Setting::Set(S("{{doc.doggo}}")),
|
||||
.. EmbeddingSettings::default()
|
||||
})
|
||||
});
|
||||
index_scheduler
|
||||
.register(
|
||||
KindWithContent::SettingsUpdate {
|
||||
index_uid: S("doggos"),
|
||||
new_settings: Box::new(setting),
|
||||
is_deletion: false,
|
||||
allow_index_creation: false,
|
||||
},
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
index_scheduler.assert_internally_consistent();
|
||||
handle.advance_one_successful_batch();
|
||||
index_scheduler.assert_internally_consistent();
|
||||
|
||||
let index = index_scheduler.index("doggos").unwrap();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
||||
let documents = index
|
||||
.all_documents(&rtxn)
|
||||
.unwrap()
|
||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
// All the vectors linked to the newly specified embedder have been removed
|
||||
// Only the unknown embedders stay in the document DB
|
||||
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###);
|
||||
let conf = index.embedding_configs(&rtxn).unwrap();
|
||||
// even though we specified the vector for the ID 3, it shouldn't be marked
|
||||
// as user provided since we explicitly marked it as NOT user provided.
|
||||
snapshot!(format!("{conf:#?}"), @r###"
|
||||
[
|
||||
(
|
||||
"my_doggo_embedder",
|
||||
EmbeddingConfig {
|
||||
embedder_options: HuggingFace(
|
||||
EmbedderOptions {
|
||||
model: "sentence-transformers/all-MiniLM-L6-v2",
|
||||
revision: Some(
|
||||
"e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
|
||||
),
|
||||
distribution: None,
|
||||
},
|
||||
),
|
||||
prompt: PromptData {
|
||||
template: "{{doc.doggo}}",
|
||||
},
|
||||
},
|
||||
RoaringBitmap<[1, 2]>,
|
||||
),
|
||||
]
|
||||
"###);
|
||||
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
|
||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||
let embedding = &embeddings["my_doggo_embedder"];
|
||||
assert!(!embedding.is_empty(), "{embedding:?}");
|
||||
|
||||
// the document with the id 3 should keep its original embedding
|
||||
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
|
||||
let mut embeddings = Vec::new();
|
||||
|
||||
'vectors: for i in 0..=u8::MAX {
|
||||
let reader = arroy::Reader::open(&rtxn, 0 | (i as u16), index.vector_arroy)
|
||||
.map(Some)
|
||||
.or_else(|e| match e {
|
||||
arroy::Error::MissingMetadata => Ok(None),
|
||||
e => Err(e),
|
||||
})
|
||||
.transpose();
|
||||
|
||||
let Some(reader) = reader else {
|
||||
break 'vectors;
|
||||
};
|
||||
|
||||
let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap();
|
||||
if let Some(embedding) = embedding {
|
||||
embeddings.push(embedding)
|
||||
} else {
|
||||
break 'vectors;
|
||||
}
|
||||
}
|
||||
|
||||
snapshot!(embeddings.len(), @"1");
|
||||
assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);
|
||||
|
||||
// If we update marcel it should regenerate its embedding automatically
|
||||
|
||||
let content = serde_json::json!(
|
||||
[
|
||||
{
|
||||
"id": 3,
|
||||
"doggo": "marvel",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"doggo": "sorry",
|
||||
},
|
||||
]
|
||||
);
|
||||
|
||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1 as u128).unwrap();
|
||||
let documents_count =
|
||||
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
|
||||
.unwrap();
|
||||
snapshot!(documents_count, @"2");
|
||||
file.persist().unwrap();
|
||||
|
||||
index_scheduler
|
||||
.register(
|
||||
KindWithContent::DocumentAdditionOrUpdate {
|
||||
index_uid: S("doggos"),
|
||||
primary_key: None,
|
||||
method: UpdateDocuments,
|
||||
content_file: uuid,
|
||||
documents_count,
|
||||
allow_index_creation: true,
|
||||
},
|
||||
None,
|
||||
false,
|
||||
)
|
||||
.unwrap();
|
||||
index_scheduler.assert_internally_consistent();
|
||||
handle.advance_one_successful_batch();
|
||||
index_scheduler.assert_internally_consistent();
|
||||
|
||||
// the document with the id 3 should have its original embedding updated
|
||||
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
|
||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||
let embedding = &embeddings["my_doggo_embedder"];
|
||||
|
||||
assert!(!embedding.is_empty());
|
||||
// TODO: it shouldn’t be equal to 3.0
|
||||
assert!(embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);
|
||||
|
||||
// the document with the id 4 should generate an embedding
|
||||
// let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap();
|
||||
// let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||
// dbg!(&embeddings);
|
||||
// let embedding = &embeddings["my_doggo_embedder"];
|
||||
|
||||
// assert!(!embedding.is_empty());
|
||||
// assert!(embedding[0]);
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,4 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]
|
@ -672,7 +672,7 @@ pub fn settings(
|
||||
let embedders: BTreeMap<_, _> = index
|
||||
.embedding_configs(rtxn)?
|
||||
.into_iter()
|
||||
.map(|(name, config)| (name, Setting::Set(config.into())))
|
||||
.map(|(name, config, _)| (name, Setting::Set(config.into())))
|
||||
.collect();
|
||||
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };
|
||||
|
||||
|
@ -44,7 +44,7 @@ once_cell = "1.19.0"
|
||||
ordered-float = "4.2.0"
|
||||
rand_pcg = { version = "0.3.1", features = ["serde1"] }
|
||||
rayon = "1.8.0"
|
||||
roaring = "0.10.2"
|
||||
roaring = { version = "0.10.2", features = ["serde"] }
|
||||
rstar = { version = "0.11.0", features = ["serde"] }
|
||||
serde = { version = "1.0.195", features = ["derive"] }
|
||||
serde_json = { version = "1.0.111", features = ["preserve_order"] }
|
||||
|
@ -1572,16 +1572,18 @@ impl Index {
|
||||
Ok(script_language)
|
||||
}
|
||||
|
||||
/// Put the embedding configs:
|
||||
/// 1. The name of the embedder
|
||||
/// 2. The configuration option for this embedder
|
||||
/// 3. The list of documents with a user provided embedding
|
||||
pub(crate) fn put_embedding_configs(
|
||||
&self,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
configs: Vec<(String, EmbeddingConfig)>,
|
||||
configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
|
||||
) -> heed::Result<()> {
|
||||
self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put(
|
||||
wtxn,
|
||||
main_key::EMBEDDING_CONFIGS,
|
||||
&configs,
|
||||
)
|
||||
self.main
|
||||
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig, RoaringBitmap)>>>()
|
||||
.put(wtxn, main_key::EMBEDDING_CONFIGS, &configs)
|
||||
}
|
||||
|
||||
pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
|
||||
@ -1591,10 +1593,10 @@ impl Index {
|
||||
pub fn embedding_configs(
|
||||
&self,
|
||||
rtxn: &RoTxn<'_>,
|
||||
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> {
|
||||
) -> Result<Vec<(String, EmbeddingConfig, RoaringBitmap)>> {
|
||||
Ok(self
|
||||
.main
|
||||
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>()
|
||||
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig, RoaringBitmap)>>>()
|
||||
.get(rtxn, main_key::EMBEDDING_CONFIGS)?
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
@ -10,16 +10,16 @@ use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
use itertools::EitherOrBoth;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::try_split_at;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::vector::Embedder;
|
||||
use crate::{DocumentId, Result, ThreadPoolNoAbort};
|
||||
use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
@ -35,6 +35,8 @@ pub struct ExtractedVectorPoints {
|
||||
// embedder
|
||||
pub embedder_name: String,
|
||||
pub embedder: Arc<Embedder>,
|
||||
pub user_defined: RoaringBitmap,
|
||||
pub remove_from_user_defined: RoaringBitmap,
|
||||
}
|
||||
|
||||
enum VectorStateDelta {
|
||||
@ -80,6 +82,11 @@ struct EmbedderVectorExtractor {
|
||||
prompts_writer: Writer<BufWriter<File>>,
|
||||
// (docid) -> ()
|
||||
remove_vectors_writer: Writer<BufWriter<File>>,
|
||||
|
||||
// The docids of the documents that contain a user defined embedding
|
||||
user_defined: RoaringBitmap,
|
||||
// The docids of the documents that contain an auto-generated embedding
|
||||
remove_from_user_defined: RoaringBitmap,
|
||||
}
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
@ -134,6 +141,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
manual_vectors_writer,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
user_defined: RoaringBitmap::new(),
|
||||
remove_from_user_defined: RoaringBitmap::new(),
|
||||
});
|
||||
}
|
||||
|
||||
@ -141,13 +150,15 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// this must always be serialized as (docid, external_docid);
|
||||
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
|
||||
let (docid_bytes, external_id_bytes) =
|
||||
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
|
||||
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
|
||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||
let docid = DocumentId::from_be_bytes(docid_bytes);
|
||||
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes);
|
||||
key_buffer.extend_from_slice(docid_bytes.as_slice());
|
||||
|
||||
// since we only need the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
@ -163,10 +174,22 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
manual_vectors_writer,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
user_defined,
|
||||
remove_from_user_defined,
|
||||
} in extractors.iter_mut()
|
||||
{
|
||||
let delta = match parsed_vectors.remove(embedder_name) {
|
||||
(Some(old), Some(new)) => {
|
||||
match (old.is_user_provided(), new.is_user_provided()) {
|
||||
(true, true) | (false, false) => (),
|
||||
(true, false) => {
|
||||
remove_from_user_defined.insert(docid);
|
||||
}
|
||||
(false, true) => {
|
||||
user_defined.insert(docid);
|
||||
}
|
||||
}
|
||||
|
||||
// no autogeneration
|
||||
let del_vectors = old.into_array_of_vectors();
|
||||
let add_vectors = new.into_array_of_vectors();
|
||||
@ -187,6 +210,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
remove_from_user_defined.insert(docid);
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
@ -198,6 +222,11 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
}
|
||||
}
|
||||
(None, Some(new)) => {
|
||||
if new.is_user_provided() {
|
||||
user_defined.insert(docid);
|
||||
} else {
|
||||
remove_from_user_defined.insert(docid);
|
||||
}
|
||||
// was possibly autogenerated, remove all vectors for that document
|
||||
let add_vectors = new.into_array_of_vectors();
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
@ -239,6 +268,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
} else {
|
||||
remove_from_user_defined.remove(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
@ -265,18 +295,18 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
manual_vectors_writer,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
user_defined,
|
||||
remove_from_user_defined,
|
||||
} in extractors
|
||||
{
|
||||
results.push(ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||
// docid -> ()
|
||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||
// docid -> prompt
|
||||
prompts: writer_into_reader(prompts_writer)?,
|
||||
|
||||
embedder,
|
||||
embedder_name,
|
||||
user_defined,
|
||||
remove_from_user_defined,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -238,6 +238,8 @@ fn send_original_documents_data(
|
||||
prompts,
|
||||
embedder_name,
|
||||
embedder,
|
||||
user_defined,
|
||||
remove_from_user_defined: auto_generated,
|
||||
} in extracted_vectors
|
||||
{
|
||||
let embeddings = match extract_embeddings(
|
||||
@ -262,6 +264,8 @@ fn send_original_documents_data(
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
user_defined,
|
||||
remove_from_user_defined: auto_generated,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
@ -501,6 +501,8 @@ where
|
||||
embeddings,
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
user_defined,
|
||||
remove_from_user_defined,
|
||||
} => {
|
||||
dimension.insert(embedder_name.clone(), expected_dimension);
|
||||
TypedChunk::VectorPoints {
|
||||
@ -509,6 +511,8 @@ where
|
||||
expected_dimension,
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
user_defined,
|
||||
remove_from_user_defined,
|
||||
}
|
||||
}
|
||||
otherwise => otherwise,
|
||||
@ -2616,10 +2620,11 @@ mod tests {
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
|
||||
let (embedder_name, embedder) = embedding_configs.pop().unwrap();
|
||||
let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap();
|
||||
insta::assert_snapshot!(embedder_name, @"manual");
|
||||
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>");
|
||||
let embedder =
|
||||
std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
|
||||
assert_eq!("manual", embedder_name);
|
||||
let res = index
|
||||
.search(&rtxn)
|
||||
.semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))
|
||||
|
@ -90,6 +90,8 @@ pub(crate) enum TypedChunk {
|
||||
expected_dimension: usize,
|
||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
embedder_name: String,
|
||||
user_defined: RoaringBitmap,
|
||||
remove_from_user_defined: RoaringBitmap,
|
||||
},
|
||||
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||
}
|
||||
@ -155,7 +157,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
|
||||
let embedders: BTreeSet<_> =
|
||||
index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect();
|
||||
index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect();
|
||||
let mut vectors_buffer = Vec::new();
|
||||
while let Some((key, reader)) = iter.next()? {
|
||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||
@ -181,7 +183,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
|
||||
break 'vectors Some(addition);
|
||||
};
|
||||
vectors.retain_user_provided_vectors(&embedders);
|
||||
vectors.retain_not_embedded_vectors(&embedders);
|
||||
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
|
||||
if vectors.is_empty() {
|
||||
// skip writing empty `_vectors` map
|
||||
@ -619,6 +621,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut user_defined = RoaringBitmap::new();
|
||||
let mut remove_from_user_defined = RoaringBitmap::new();
|
||||
let mut params = None;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::VectorPoints {
|
||||
@ -627,6 +631,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
embedder_name,
|
||||
user_defined: ud,
|
||||
remove_from_user_defined: rud,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
@ -639,11 +645,21 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
if let Some(embeddings) = embeddings {
|
||||
embeddings_builder.push(embeddings.into_cursor()?);
|
||||
}
|
||||
user_defined |= ud;
|
||||
remove_from_user_defined |= rud;
|
||||
}
|
||||
|
||||
// typed chunks always has at least 1 chunk.
|
||||
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
||||
|
||||
let mut embedding_configs = index.embedding_configs(&wtxn)?;
|
||||
let (_name, _conf, ud) =
|
||||
embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap();
|
||||
*ud -= remove_from_user_defined;
|
||||
*ud |= user_defined;
|
||||
|
||||
index.put_embedding_configs(wtxn, embedding_configs)?;
|
||||
|
||||
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||
)?;
|
||||
|
@ -6,6 +6,7 @@ use std::sync::Arc;
|
||||
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
|
||||
use deserr::{DeserializeError, Deserr};
|
||||
use itertools::{EitherOrBoth, Itertools};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@ -926,8 +927,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
Setting::Set(configs) => {
|
||||
let mut changed = false;
|
||||
let old_configs = self.index.embedding_configs(self.wtxn)?;
|
||||
let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> =
|
||||
old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect();
|
||||
let old_configs: BTreeMap<String, (Setting<EmbeddingSettings>, RoaringBitmap)> =
|
||||
old_configs
|
||||
.into_iter()
|
||||
.map(|(name, setting, user_defined)| {
|
||||
(name, (Setting::Set(setting.into()), user_defined))
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut new_configs = BTreeMap::new();
|
||||
for joined in old_configs
|
||||
@ -936,15 +942,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
{
|
||||
match joined {
|
||||
// updated config
|
||||
EitherOrBoth::Both((name, mut old), (_, new)) => {
|
||||
EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => {
|
||||
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
|
||||
if changed {
|
||||
tracing::debug!(embedder = name, "need reindex");
|
||||
tracing::debug!(
|
||||
embedder = name,
|
||||
documents = user_defined.len(),
|
||||
"need reindex"
|
||||
);
|
||||
} else {
|
||||
tracing::debug!(embedder = name, "skip reindex");
|
||||
}
|
||||
let new = validate_embedding_settings(old, &name)?;
|
||||
new_configs.insert(name, new);
|
||||
new_configs.insert(name, (new, user_defined));
|
||||
}
|
||||
// unchanged config
|
||||
EitherOrBoth::Left((name, setting)) => {
|
||||
@ -961,21 +971,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
);
|
||||
let setting = validate_embedding_settings(setting, &name)?;
|
||||
changed = true;
|
||||
new_configs.insert(name, setting);
|
||||
new_configs.insert(name, (setting, RoaringBitmap::new()));
|
||||
}
|
||||
}
|
||||
}
|
||||
let new_configs: Vec<(String, EmbeddingConfig)> = new_configs
|
||||
let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs
|
||||
.into_iter()
|
||||
.filter_map(|(name, setting)| match setting {
|
||||
Setting::Set(value) => Some((name, value.into())),
|
||||
.filter_map(|(name, (setting, user_defined))| match setting {
|
||||
Setting::Set(settings) => Some((name, settings.into(), user_defined)),
|
||||
Setting::Reset => None,
|
||||
Setting::NotSet => Some((name, EmbeddingSettings::default().into())),
|
||||
Setting::NotSet => {
|
||||
Some((name, EmbeddingSettings::default().into(), user_defined))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
self.index.embedder_category_id.clear(self.wtxn)?;
|
||||
for (index, (embedder_name, _)) in new_configs.iter().enumerate() {
|
||||
for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() {
|
||||
self.index.embedder_category_id.put_with_flags(
|
||||
self.wtxn,
|
||||
heed::PutFlags::APPEND,
|
||||
@ -1359,10 +1371,12 @@ impl InnerIndexSettings {
|
||||
}
|
||||
}
|
||||
|
||||
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
|
||||
fn embedders(
|
||||
embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
|
||||
) -> Result<EmbeddingConfigs> {
|
||||
let res: Result<_> = embedding_configs
|
||||
.into_iter()
|
||||
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
|
||||
.map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| {
|
||||
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
|
||||
|
||||
let embedder = Arc::new(
|
||||
|
@ -17,6 +17,13 @@ pub enum Vectors {
|
||||
}
|
||||
|
||||
impl Vectors {
|
||||
/// Tells whether these vectors are considered user provided.
///
/// `ImplicitlyUserProvided` vectors always are; `Explicit` vectors report
/// the value of their `user_provided` flag.
pub fn is_user_provided(&self) -> bool {
|
||||
match self {
|
||||
Vectors::ImplicitlyUserProvided(_) => true,
|
||||
Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_array_of_vectors(self) -> Vec<Embedding> {
|
||||
match self {
|
||||
Vectors::ImplicitlyUserProvided(embeddings)
|
||||
@ -89,15 +96,8 @@ impl ParsedVectors {
|
||||
Ok(ParsedVectors(value))
|
||||
}
|
||||
|
||||
pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) {
|
||||
self.0.retain(|k, v| match v {
|
||||
Vectors::ImplicitlyUserProvided(_) => true,
|
||||
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
|
||||
*user_provided
|
||||
// if the embedder is not in the config, then never touch it
|
||||
|| !embedders.contains(k)
|
||||
}
|
||||
});
|
||||
pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
|
||||
self.0.retain(|k, _v| !embedders.contains(k))
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user