Implement in new document indexer

This commit is contained in:
Louis Dureuil 2025-06-30 00:00:22 +02:00
parent 22d363c05a
commit f8232976ed
No known key found for this signature in database
10 changed files with 886 additions and 391 deletions

View file

@ -138,6 +138,7 @@ pub enum ReceiverAction {
WakeUp, WakeUp,
LargeEntry(LargeEntry), LargeEntry(LargeEntry),
LargeVectors(LargeVectors), LargeVectors(LargeVectors),
LargeVector(LargeVector),
} }
/// An entry that cannot fit in the BBQueue buffers has been /// An entry that cannot fit in the BBQueue buffers has been
@ -174,6 +175,24 @@ impl LargeVectors {
} }
} }
#[derive(Debug)]
pub struct LargeVector {
/// The document id associated to the large embedding.
pub docid: DocumentId,
/// The embedder id in which to insert the large embedding.
pub embedder_id: u8,
/// The extractor id in which to insert the large embedding.
pub extractor_id: u8,
/// The large embedding that must be written.
pub embedding: Mmap,
}
impl LargeVector {
pub fn read_embedding(&self, dimensions: usize) -> &[f32] {
self.embedding.chunks_exact(dimensions).map(bytemuck::cast_slice).next().unwrap()
}
}
impl<'a> WriterBbqueueReceiver<'a> { impl<'a> WriterBbqueueReceiver<'a> {
/// Tries to receive an action to do until the timeout occurs /// Tries to receive an action to do until the timeout occurs
/// and if it does, consider it as a spurious wake up. /// and if it does, consider it as a spurious wake up.
@ -238,6 +257,7 @@ pub enum EntryHeader {
DbOperation(DbOperation), DbOperation(DbOperation),
ArroyDeleteVector(ArroyDeleteVector), ArroyDeleteVector(ArroyDeleteVector),
ArroySetVectors(ArroySetVectors), ArroySetVectors(ArroySetVectors),
ArroySetVector(ArroySetVector),
} }
impl EntryHeader { impl EntryHeader {
@ -250,6 +270,7 @@ impl EntryHeader {
EntryHeader::DbOperation(_) => 0, EntryHeader::DbOperation(_) => 0,
EntryHeader::ArroyDeleteVector(_) => 1, EntryHeader::ArroyDeleteVector(_) => 1,
EntryHeader::ArroySetVectors(_) => 2, EntryHeader::ArroySetVectors(_) => 2,
EntryHeader::ArroySetVector(_) => 3,
} }
} }
@ -274,11 +295,17 @@ impl EntryHeader {
Self::variant_size() + mem::size_of::<ArroySetVectors>() + embedding_size * count Self::variant_size() + mem::size_of::<ArroySetVectors>() + embedding_size * count
} }
fn total_set_vector_size(dimensions: usize) -> usize {
let embedding_size = dimensions * mem::size_of::<f32>();
Self::variant_size() + mem::size_of::<ArroySetVector>() + embedding_size
}
fn header_size(&self) -> usize { fn header_size(&self) -> usize {
let payload_size = match self { let payload_size = match self {
EntryHeader::DbOperation(op) => mem::size_of_val(op), EntryHeader::DbOperation(op) => mem::size_of_val(op),
EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv),
EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs),
EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv),
}; };
Self::variant_size() + payload_size Self::variant_size() + payload_size
} }
@ -301,6 +328,11 @@ impl EntryHeader {
let header = checked::pod_read_unaligned(header_bytes); let header = checked::pod_read_unaligned(header_bytes);
EntryHeader::ArroySetVectors(header) EntryHeader::ArroySetVectors(header)
} }
3 => {
let header_bytes = &remaining[..mem::size_of::<ArroySetVector>()];
let header = checked::pod_read_unaligned(header_bytes);
EntryHeader::ArroySetVector(header)
}
id => panic!("invalid variant id: {id}"), id => panic!("invalid variant id: {id}"),
} }
} }
@ -311,6 +343,7 @@ impl EntryHeader {
EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), EntryHeader::DbOperation(op) => bytemuck::bytes_of(op),
EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv),
EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs),
EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv),
}; };
*first = self.variant_id(); *first = self.variant_id();
remaining.copy_from_slice(payload_bytes); remaining.copy_from_slice(payload_bytes);
@ -379,6 +412,37 @@ impl ArroySetVectors {
} }
} }
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
#[repr(C)]
/// The embeddings are in the remaining space and represents
/// non-aligned [f32] each with dimensions f32s.
pub struct ArroySetVector {
pub docid: DocumentId,
pub embedder_id: u8,
pub extractor_id: u8,
_padding: [u8; 2],
}
impl ArroySetVector {
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
&frame[skip..]
}
/// Read the embedding and write it into an aligned `f32` Vec.
pub fn read_all_embeddings_into_vec<'v>(
&self,
frame: &FrameGrantR<'_>,
vec: &'v mut Vec<f32>,
) -> &'v [f32] {
let embeddings_bytes = Self::embeddings_bytes(frame);
let embeddings_count = embeddings_bytes.len() / mem::size_of::<f32>();
vec.resize(embeddings_count, 0.0);
bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes);
&vec[..]
}
}
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
#[repr(u16)] #[repr(u16)]
pub enum Database { pub enum Database {
@ -398,6 +462,7 @@ pub enum Database {
FacetIdStringDocids, FacetIdStringDocids,
FieldIdDocidFacetStrings, FieldIdDocidFacetStrings,
FieldIdDocidFacetF64s, FieldIdDocidFacetF64s,
VectorEmbedderCategoryId,
} }
impl Database { impl Database {
@ -419,6 +484,7 @@ impl Database {
Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(),
Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(),
Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(),
} }
} }
@ -440,6 +506,7 @@ impl Database {
Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS, Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS,
Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS,
Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S,
Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID,
} }
} }
} }
@ -568,6 +635,82 @@ impl<'b> ExtractorBbqueueSender<'b> {
Ok(()) Ok(())
} }
fn set_vector_for_extractor(
&self,
docid: u32,
embedder_id: u8,
extractor_id: u8,
embedding: Option<Embedding>,
) -> crate::Result<()> {
let max_grant = self.max_grant;
let refcell = self.producers.get().unwrap();
let mut producer = refcell.0.borrow_mut_or_yield();
// If there are no vectors we specify the dimensions
// to zero to allocate no extra space at all
let dimensions = embedding.as_ref().map_or(0, |emb| emb.len());
let arroy_set_vector =
ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
let payload_header = EntryHeader::ArroySetVector(arroy_set_vector);
let total_length = EntryHeader::total_set_vector_size(dimensions);
if total_length > max_grant {
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
let embedding = embedding.expect("set_vector without a vector does not fit in RAM");
let mut embedding_bytes = bytemuck::cast_slice(&embedding);
io::copy(&mut embedding_bytes, &mut value_file)?;
let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?;
let embedding = unsafe { Mmap::map(&value_file)? };
let large_vectors = LargeVector { docid, embedder_id, extractor_id, embedding };
self.sender.send(ReceiverAction::LargeVector(large_vectors)).unwrap();
return Ok(());
}
// Spin loop to have a frame the size we requested.
reserve_and_write_grant(
&mut producer,
total_length,
&self.sender,
&self.sent_messages_attempts,
&self.blocking_sent_messages_attempts,
|grant| {
let header_size = payload_header.header_size();
let (header_bytes, remaining) = grant.split_at_mut(header_size);
payload_header.serialize_into(header_bytes);
if dimensions != 0 {
let output_iter =
remaining.chunks_exact_mut(dimensions * mem::size_of::<f32>());
for (embedding, output) in embedding.iter().zip(output_iter) {
output.copy_from_slice(bytemuck::cast_slice(embedding));
}
}
Ok(())
},
)?;
Ok(())
}
fn embedding_status(
&self,
name: &str,
infos: crate::vector::db::EmbedderInfo,
) -> crate::Result<()> {
let bytes = infos.to_bytes().map_err(|_| {
InternalError::Serialization(crate::SerializationError::Encoding {
db_name: Some(Database::VectorEmbedderCategoryId.database_name()),
})
})?;
self.write_key_value(Database::VectorEmbedderCategoryId, name.as_bytes(), &bytes)
}
fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> {
let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| {
InternalError::StorePut { InternalError::StorePut {
@ -942,9 +1085,18 @@ impl EmbeddingSender<'_, '_> {
&self, &self,
docid: DocumentId, docid: DocumentId,
embedder_id: u8, embedder_id: u8,
embedding: Embedding, extractor_id: u8,
embedding: Option<Embedding>,
) -> crate::Result<()> { ) -> crate::Result<()> {
self.0.set_vectors(docid, embedder_id, &[embedding]) self.0.set_vector_for_extractor(docid, embedder_id, extractor_id, embedding)
}
pub(crate) fn embedding_status(
&self,
name: &str,
infos: crate::vector::db::EmbedderInfo,
) -> crate::Result<()> {
self.0.embedding_status(name, infos)
} }
} }

View file

@ -12,6 +12,7 @@ use super::vector_document::VectorDocument;
use super::{KvReaderFieldId, KvWriterFieldId}; use super::{KvReaderFieldId, KvWriterFieldId};
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::documents::FieldIdMapper; use crate::documents::FieldIdMapper;
use crate::update::del_add::KvReaderDelAdd;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::vector_document::VectorDocumentFromDb; use crate::update::new::vector_document::VectorDocumentFromDb;
use crate::vector::settings::EmbedderAction; use crate::vector::settings::EmbedderAction;
@ -469,6 +470,110 @@ impl<'doc> Versions<'doc> {
} }
} }
#[derive(Debug)]
pub struct KvDelAddDocument<'a, Mapper: FieldIdMapper> {
document: &'a obkv::KvReaderU16,
side: crate::update::del_add::DelAdd,
fields_ids_map: &'a Mapper,
}
impl<'a, Mapper: FieldIdMapper> KvDelAddDocument<'a, Mapper> {
pub fn new(
document: &'a obkv::KvReaderU16,
side: crate::update::del_add::DelAdd,
fields_ids_map: &'a Mapper,
) -> Self {
Self { document, side, fields_ids_map }
}
fn get(&self, k: &str) -> Result<Option<&'a RawValue>> {
let Some(id) = self.fields_ids_map.id(k) else { return Ok(None) };
let Some(value) = self.document.get(id) else { return Ok(None) };
let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else { return Ok(None) };
let value = serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;
Ok(Some(value))
}
}
impl<'a, Mapper: FieldIdMapper> Document<'a> for KvDelAddDocument<'a, Mapper> {
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'a str, &'a RawValue)>> {
let mut it = self.document.iter();
std::iter::from_fn(move || loop {
let (fid, value) = it.next()?;
let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else {
continue;
};
let name = match self.fields_ids_map.name(fid).ok_or(
InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
field_id: fid,
process: "getting current document",
}),
) {
Ok(name) => name,
Err(error) => return Some(Err(error.into())),
};
if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME {
continue;
}
let res = (|| {
let value =
serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;
Ok((name, value))
})();
return Some(res);
})
}
fn top_level_fields_count(&self) -> usize {
let mut it = self.document.iter();
std::iter::from_fn(move || loop {
let (fid, value) = it.next()?;
let Some(_) = KvReaderDelAdd::from_slice(value).get(self.side) else {
continue;
};
let name = match self.fields_ids_map.name(fid).ok_or(
InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
field_id: fid,
process: "getting current document",
}),
) {
Ok(name) => name,
Err(_) => return Some(()),
};
if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME {
continue;
}
return Some(());
})
.count()
}
fn top_level_field(&self, k: &str) -> Result<Option<&'a RawValue>> {
if k == RESERVED_VECTORS_FIELD_NAME || k == RESERVED_GEO_FIELD_NAME {
return Ok(None);
}
self.get(k)
}
fn vectors_field(&self) -> Result<Option<&'a RawValue>> {
self.get(RESERVED_VECTORS_FIELD_NAME)
}
fn geo_field(&self) -> Result<Option<&'a RawValue>> {
self.get(RESERVED_GEO_FIELD_NAME)
}
}
pub struct DocumentIdentifiers<'doc> { pub struct DocumentIdentifiers<'doc> {
docid: DocumentId, docid: DocumentId,
external_document_id: &'doc str, external_document_id: &'doc str,

View file

@ -11,7 +11,7 @@ use super::vector_document::{
use crate::attribute_patterns::PatternMatch; use crate::attribute_patterns::PatternMatch;
use crate::documents::FieldIdMapper; use crate::documents::FieldIdMapper;
use crate::update::new::document::DocumentIdentifiers; use crate::update::new::document::DocumentIdentifiers;
use crate::vector::EmbeddingConfigs; use crate::vector::RuntimeEmbedders;
use crate::{DocumentId, Index, InternalError, Result}; use crate::{DocumentId, Index, InternalError, Result};
pub enum DocumentChange<'doc> { pub enum DocumentChange<'doc> {
@ -70,7 +70,7 @@ impl<'doc> Insertion<'doc> {
pub fn inserted_vectors( pub fn inserted_vectors(
&self, &self,
doc_alloc: &'doc Bump, doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs, embedders: &'doc RuntimeEmbedders,
) -> Result<Option<VectorDocumentFromVersions<'doc>>> { ) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
} }
@ -241,7 +241,7 @@ impl<'doc> Update<'doc> {
pub fn only_changed_vectors( pub fn only_changed_vectors(
&self, &self,
doc_alloc: &'doc Bump, doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs, embedders: &'doc RuntimeEmbedders,
) -> Result<Option<VectorDocumentFromVersions<'doc>>> { ) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
} }
@ -252,7 +252,7 @@ impl<'doc> Update<'doc> {
index: &'doc Index, index: &'doc Index,
mapper: &'doc Mapper, mapper: &'doc Mapper,
doc_alloc: &'doc Bump, doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs, embedders: &'doc RuntimeEmbedders,
) -> Result<Option<MergedVectorDocument<'doc>>> { ) -> Result<Option<MergedVectorDocument<'doc>>> {
if self.from_scratch { if self.from_scratch {
MergedVectorDocument::without_db( MergedVectorDocument::without_db(

View file

@ -7,8 +7,7 @@ use hashbrown::HashMap;
use super::DelAddRoaringBitmap; use super::DelAddRoaringBitmap;
use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::update::new::channel::{DocumentsSender, ExtractorBbqueueSender}; use crate::update::new::channel::{DocumentsSender, ExtractorBbqueueSender};
use crate::update::new::document::{write_to_obkv, Document}; use crate::update::new::document::{write_to_obkv, Document, DocumentContext, DocumentIdentifiers};
use crate::update::new::document::{DocumentContext, DocumentIdentifiers};
use crate::update::new::indexer::document_changes::{Extractor, IndexingContext}; use crate::update::new::indexer::document_changes::{Extractor, IndexingContext};
use crate::update::new::indexer::settings_changes::{ use crate::update::new::indexer::settings_changes::{
settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor, settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor,
@ -19,16 +18,16 @@ use crate::update::new::vector_document::VectorDocument;
use crate::update::new::DocumentChange; use crate::update::new::DocumentChange;
use crate::update::settings::SettingsDelta; use crate::update::settings::SettingsDelta;
use crate::vector::settings::EmbedderAction; use crate::vector::settings::EmbedderAction;
use crate::vector::EmbeddingConfigs; use crate::vector::RuntimeEmbedders;
use crate::Result; use crate::Result;
pub struct DocumentsExtractor<'a, 'b> { pub struct DocumentsExtractor<'a, 'b> {
document_sender: DocumentsSender<'a, 'b>, document_sender: DocumentsSender<'a, 'b>,
embedders: &'a EmbeddingConfigs, embedders: &'a RuntimeEmbedders,
} }
impl<'a, 'b> DocumentsExtractor<'a, 'b> { impl<'a, 'b> DocumentsExtractor<'a, 'b> {
pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a RuntimeEmbedders) -> Self {
Self { document_sender, embedders } Self { document_sender, embedders }
} }
} }

File diff suppressed because it is too large Load diff

View file

@ -13,21 +13,17 @@ use super::super::thread_local::{FullySend, ThreadLocal};
use super::super::FacetFieldIdsDelta; use super::super::FacetFieldIdsDelta;
use super::document_changes::{extract, DocumentChanges, IndexingContext}; use super::document_changes::{extract, DocumentChanges, IndexingContext};
use super::settings_changes::settings_change_extract; use super::settings_changes::settings_change_extract;
use crate::documents::FieldIdMapper; use crate::documents::{FieldIdMapper, PrimaryKey};
use crate::documents::PrimaryKey; use crate::progress::{EmbedderStats, MergingWordCache};
use crate::index::IndexEmbeddingConfig;
use crate::progress::EmbedderStats;
use crate::progress::MergingWordCache;
use crate::proximity::ProximityPrecision; use crate::proximity::ProximityPrecision;
use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::extract::EmbeddingExtractor;
use crate::update::new::indexer::settings_changes::DocumentsIndentifiers; use crate::update::new::indexer::settings_changes::DocumentsIndentifiers;
use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::merger::merge_and_send_rtree;
use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases};
use crate::update::settings::SettingsDelta; use crate::update::settings::SettingsDelta;
use crate::vector::EmbeddingConfigs; use crate::vector::db::IndexEmbeddingConfig;
use crate::Index; use crate::vector::RuntimeEmbedders;
use crate::InternalError; use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
@ -35,7 +31,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
indexing_context: IndexingContext<MSP>, indexing_context: IndexingContext<MSP>,
indexer_span: Span, indexer_span: Span,
extractor_sender: ExtractorBbqueueSender, extractor_sender: ExtractorBbqueueSender,
embedders: &EmbeddingConfigs, embedders: &RuntimeEmbedders,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
finished_extraction: &AtomicBool, finished_extraction: &AtomicBool,
field_distribution: &mut BTreeMap<String, u64>, field_distribution: &mut BTreeMap<String, u64>,
@ -275,14 +271,19 @@ where
let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors");
let _entered = span.enter(); let _entered = span.enter();
let embedder_configs = index.embedding_configs();
for config in &mut index_embeddings { for config in &mut index_embeddings {
let mut infos = embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap();
'data: for data in datastore.iter_mut() { 'data: for data in datastore.iter_mut() {
let data = &mut data.get_mut().0; let data = &mut data.get_mut().0;
let Some(deladd) = data.remove(&config.name) else { let Some(delta) = data.remove(&config.name) else {
continue 'data; continue 'data;
}; };
deladd.apply_to(&mut config.user_provided, modified_docids); delta.apply_to(&mut infos.embedding_status);
} }
extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap();
} }
} }
} }

View file

@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress};
use crate::update::settings::SettingsDelta; use crate::update::settings::SettingsDelta;
use crate::update::GrenadParameters; use crate::update::GrenadParameters;
use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs}; use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders};
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
pub(crate) mod de; pub(crate) mod de;
@ -54,7 +54,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>(
new_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap,
new_primary_key: Option<PrimaryKey<'pl>>, new_primary_key: Option<PrimaryKey<'pl>>,
document_changes: &DC, document_changes: &DC,
embedders: EmbeddingConfigs, embedders: RuntimeEmbedders,
must_stop_processing: &'indexer MSP, must_stop_processing: &'indexer MSP,
progress: &'indexer Progress, progress: &'indexer Progress,
embedder_stats: &'indexer EmbedderStats, embedder_stats: &'indexer EmbedderStats,
@ -93,7 +93,7 @@ where
grenad_parameters: &grenad_parameters, grenad_parameters: &grenad_parameters,
}; };
let index_embeddings = index.embedding_configs(wtxn)?; let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?;
let mut field_distribution = index.field_distribution(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?;
let mut document_ids = index.documents_ids(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?;
let mut modified_docids = roaring::RoaringBitmap::new(); let mut modified_docids = roaring::RoaringBitmap::new();
@ -133,20 +133,21 @@ where
let arroy_writers: Result<HashMap<_, _>> = embedders let arroy_writers: Result<HashMap<_, _>> = embedders
.inner_as_ref() .inner_as_ref()
.iter() .iter()
.map(|(embedder_name, (embedder, _, was_quantized))| { .map(|(embedder_name, runtime)| {
let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or( let embedder_index = index
InternalError::DatabaseMissingEntry { .embedding_configs()
.embedder_id(wtxn, embedder_name)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: "embedder_category_id", db_name: "embedder_category_id",
key: None, key: None,
}, })?;
)?;
let dimensions = embedder.dimensions(); let dimensions = runtime.embedder.dimensions();
let writer = ArroyWrapper::new(vector_arroy, embedder_index, *was_quantized); let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);
Ok(( Ok((
embedder_index, embedder_index,
(embedder_name.as_str(), embedder.as_ref(), writer, dimensions), (embedder_name.as_str(), &*runtime.embedder, writer, dimensions),
)) ))
}) })
.collect(); .collect();

View file

@ -11,11 +11,11 @@ use super::super::channel::*;
use crate::database_stats::DatabaseStats; use crate::database_stats::DatabaseStats;
use crate::documents::PrimaryKey; use crate::documents::PrimaryKey;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::index::IndexEmbeddingConfig;
use crate::progress::Progress; use crate::progress::Progress;
use crate::update::settings::InnerIndexSettings; use crate::update::settings::InnerIndexSettings;
use crate::vector::db::IndexEmbeddingConfig;
use crate::vector::settings::EmbedderAction; use crate::vector::settings::EmbedderAction;
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings}; use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders};
use crate::{Error, Index, InternalError, Result, UserError}; use crate::{Error, Index, InternalError, Result, UserError};
pub fn write_to_db( pub fn write_to_db(
@ -64,6 +64,14 @@ pub fn write_to_db(
writer.del_items(wtxn, *dimensions, docid)?; writer.del_items(wtxn, *dimensions, docid)?;
writer.add_items(wtxn, docid, &embeddings)?; writer.add_items(wtxn, docid, &embeddings)?;
} }
ReceiverAction::LargeVector(
large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
) => {
let (_, _, writer, dimensions) =
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
let embedding = large_vector.read_embedding(*dimensions);
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
}
} }
// Every time the is a message in the channel we search // Every time the is a message in the channel we search
@ -137,7 +145,7 @@ where
)?; )?;
} }
index.put_embedding_configs(wtxn, index_embeddings)?; index.embedding_configs().put_embedding_configs(wtxn, index_embeddings)?;
Ok(()) Ok(())
} }
@ -147,7 +155,7 @@ pub(super) fn update_index(
wtxn: &mut RwTxn<'_>, wtxn: &mut RwTxn<'_>,
new_fields_ids_map: FieldIdMapWithMetadata, new_fields_ids_map: FieldIdMapWithMetadata,
new_primary_key: Option<PrimaryKey<'_>>, new_primary_key: Option<PrimaryKey<'_>>,
embedders: EmbeddingConfigs, embedders: RuntimeEmbedders,
field_distribution: std::collections::BTreeMap<String, u64>, field_distribution: std::collections::BTreeMap<String, u64>,
document_ids: roaring::RoaringBitmap, document_ids: roaring::RoaringBitmap,
) -> Result<()> { ) -> Result<()> {
@ -226,14 +234,36 @@ pub fn write_from_bbqueue(
arroy_writers.get(&embedder_id).expect("requested a missing embedder"); arroy_writers.get(&embedder_id).expect("requested a missing embedder");
let mut embeddings = Embeddings::new(*dimensions); let mut embeddings = Embeddings::new(*dimensions);
let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
if embeddings.append(all_embeddings.to_vec()).is_err() {
return Err(Error::UserError(UserError::InvalidVectorDimensions {
expected: *dimensions,
found: all_embeddings.len(),
}));
}
writer.del_items(wtxn, *dimensions, docid)?; writer.del_items(wtxn, *dimensions, docid)?;
writer.add_items(wtxn, docid, &embeddings)?; if !all_embeddings.is_empty() {
if embeddings.append(all_embeddings.to_vec()).is_err() {
return Err(Error::UserError(UserError::InvalidVectorDimensions {
expected: *dimensions,
found: all_embeddings.len(),
}));
}
writer.add_items(wtxn, docid, &embeddings)?;
}
}
EntryHeader::ArroySetVector(
asv @ ArroySetVector { docid, embedder_id, extractor_id, .. },
) => {
let frame = frame_with_header.frame();
let (_, _, writer, dimensions) =
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);
if embedding.is_empty() {
writer.del_item_in_store(wtxn, docid, extractor_id, *dimensions)?;
} else {
if embedding.len() != *dimensions {
return Err(Error::UserError(UserError::InvalidVectorDimensions {
expected: *dimensions,
found: embedding.len(),
}));
}
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
}
} }
} }
} }

View file

@ -12,9 +12,9 @@ use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions};
use super::indexer::de::DeserrRawValue; use super::indexer::de::DeserrRawValue;
use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::documents::FieldIdMapper; use crate::documents::FieldIdMapper;
use crate::index::IndexEmbeddingConfig; use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig};
use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors}; use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors};
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs}; use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders};
use crate::{DocumentId, Index, InternalError, Result, UserError}; use crate::{DocumentId, Index, InternalError, Result, UserError};
#[derive(Serialize)] #[derive(Serialize)]
@ -109,7 +109,7 @@ impl<'t> VectorDocumentFromDb<'t> {
None => None, None => None,
}; };
let embedding_config = index.embedding_configs(rtxn)?; let embedding_config = index.embedding_configs().embedding_configs(rtxn)?;
Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc })) Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc }))
} }
@ -118,6 +118,7 @@ impl<'t> VectorDocumentFromDb<'t> {
&self, &self,
embedder_id: u8, embedder_id: u8,
config: &IndexEmbeddingConfig, config: &IndexEmbeddingConfig,
status: &EmbeddingStatus,
) -> Result<VectorEntry<'t>> { ) -> Result<VectorEntry<'t>> {
let reader = let reader =
ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized()); ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized());
@ -126,7 +127,7 @@ impl<'t> VectorDocumentFromDb<'t> {
Ok(VectorEntry { Ok(VectorEntry {
has_configured_embedder: true, has_configured_embedder: true,
embeddings: Some(Embeddings::FromDb(vectors)), embeddings: Some(Embeddings::FromDb(vectors)),
regenerate: !config.user_provided.contains(self.docid), regenerate: status.must_regenerate(self.docid),
implicit: false, implicit: false,
}) })
} }
@ -137,9 +138,9 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
self.embedding_config self.embedding_config
.iter() .iter()
.map(|config| { .map(|config| {
let embedder_id = let info =
self.index.embedder_category_id.get(self.rtxn, &config.name)?.unwrap(); self.index.embedding_configs().embedder_info(self.rtxn, &config.name)?.unwrap();
let entry = self.entry_from_db(embedder_id, config)?; let entry = self.entry_from_db(info.embedder_id, config, &info.embedding_status)?;
let config_name = self.doc_alloc.alloc_str(config.name.as_str()); let config_name = self.doc_alloc.alloc_str(config.name.as_str());
Ok((&*config_name, entry)) Ok((&*config_name, entry))
}) })
@ -156,11 +157,11 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
} }
fn vectors_for_key(&self, key: &str) -> Result<Option<VectorEntry<'t>>> { fn vectors_for_key(&self, key: &str) -> Result<Option<VectorEntry<'t>>> {
Ok(match self.index.embedder_category_id.get(self.rtxn, key)? { Ok(match self.index.embedding_configs().embedder_info(self.rtxn, key)? {
Some(embedder_id) => { Some(info) => {
let config = let config =
self.embedding_config.iter().find(|config| config.name == key).unwrap(); self.embedding_config.iter().find(|config| config.name == key).unwrap();
Some(self.entry_from_db(embedder_id, config)?) Some(self.entry_from_db(info.embedder_id, config, &info.embedding_status)?)
} }
None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) { None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) {
Some(embedding_from_doc) => { Some(embedding_from_doc) => {
@ -222,7 +223,7 @@ fn entry_from_raw_value(
pub struct VectorDocumentFromVersions<'doc> { pub struct VectorDocumentFromVersions<'doc> {
external_document_id: &'doc str, external_document_id: &'doc str,
vectors: RawMap<'doc, FxBuildHasher>, vectors: RawMap<'doc, FxBuildHasher>,
embedders: &'doc EmbeddingConfigs, embedders: &'doc RuntimeEmbedders,
} }
impl<'doc> VectorDocumentFromVersions<'doc> { impl<'doc> VectorDocumentFromVersions<'doc> {
@ -230,7 +231,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
external_document_id: &'doc str, external_document_id: &'doc str,
versions: &Versions<'doc>, versions: &Versions<'doc>,
bump: &'doc Bump, bump: &'doc Bump,
embedders: &'doc EmbeddingConfigs, embedders: &'doc RuntimeEmbedders,
) -> Result<Option<Self>> { ) -> Result<Option<Self>> {
let document = DocumentFromVersions::new(versions); let document = DocumentFromVersions::new(versions);
if let Some(vectors_field) = document.vectors_field()? { if let Some(vectors_field) = document.vectors_field()? {
@ -283,7 +284,7 @@ impl<'doc> MergedVectorDocument<'doc> {
db_fields_ids_map: &'doc Mapper, db_fields_ids_map: &'doc Mapper,
versions: &Versions<'doc>, versions: &Versions<'doc>,
doc_alloc: &'doc Bump, doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs, embedders: &'doc RuntimeEmbedders,
) -> Result<Option<Self>> { ) -> Result<Option<Self>> {
let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?;
let new_doc = let new_doc =
@ -295,7 +296,7 @@ impl<'doc> MergedVectorDocument<'doc> {
external_document_id: &'doc str, external_document_id: &'doc str,
versions: &Versions<'doc>, versions: &Versions<'doc>,
doc_alloc: &'doc Bump, doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs, embedders: &'doc RuntimeEmbedders,
) -> Result<Option<Self>> { ) -> Result<Option<Self>> {
let Some(new_doc) = let Some(new_doc) =
VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)? VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)?

View file

@ -3,6 +3,7 @@ use bumpalo::Bump;
use serde_json::Value; use serde_json::Value;
use super::{EmbedError, Embedder, Embedding}; use super::{EmbedError, Embedder, Embedding};
use crate::progress::EmbedderStats;
use crate::{DocumentId, Result, ThreadPoolNoAbort}; use crate::{DocumentId, Result, ThreadPoolNoAbort};
type ExtractorId = u8; type ExtractorId = u8;
@ -43,6 +44,8 @@ pub struct EmbedSession<'doc, C, I> {
embedder_name: &'doc str, embedder_name: &'doc str,
embedder_stats: &'doc EmbedderStats,
on_embed: C, on_embed: C,
} }
@ -51,6 +54,7 @@ pub trait Input: Sized {
inputs: &[Self], inputs: &[Self],
embedder: &Embedder, embedder: &Embedder,
threads: &ThreadPoolNoAbort, threads: &ThreadPoolNoAbort,
embedder_stats: &EmbedderStats,
) -> std::result::Result<Vec<Embedding>, EmbedError>; ) -> std::result::Result<Vec<Embedding>, EmbedError>;
} }
@ -59,8 +63,9 @@ impl Input for &'_ str {
inputs: &[Self], inputs: &[Self],
embedder: &Embedder, embedder: &Embedder,
threads: &ThreadPoolNoAbort, threads: &ThreadPoolNoAbort,
embedder_stats: &EmbedderStats,
) -> std::result::Result<Vec<Embedding>, EmbedError> { ) -> std::result::Result<Vec<Embedding>, EmbedError> {
embedder.embed_index_ref(inputs, threads) embedder.embed_index_ref(inputs, threads, embedder_stats)
} }
} }
@ -69,8 +74,9 @@ impl Input for Value {
inputs: &[Value], inputs: &[Value],
embedder: &Embedder, embedder: &Embedder,
threads: &ThreadPoolNoAbort, threads: &ThreadPoolNoAbort,
embedder_stats: &EmbedderStats,
) -> std::result::Result<Vec<Embedding>, EmbedError> { ) -> std::result::Result<Vec<Embedding>, EmbedError> {
embedder.embed_index_ref_fragments(inputs, threads) embedder.embed_index_ref_fragments(inputs, threads, embedder_stats)
} }
} }
@ -81,12 +87,21 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> {
embedder_name: &'doc str, embedder_name: &'doc str,
threads: &'doc ThreadPoolNoAbort, threads: &'doc ThreadPoolNoAbort,
doc_alloc: &'doc Bump, doc_alloc: &'doc Bump,
embedder_stats: &'doc EmbedderStats,
on_embed: C, on_embed: C,
) -> Self { ) -> Self {
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
let texts = BVec::with_capacity_in(capacity, doc_alloc); let texts = BVec::with_capacity_in(capacity, doc_alloc);
let ids = BVec::with_capacity_in(capacity, doc_alloc); let ids = BVec::with_capacity_in(capacity, doc_alloc);
Self { inputs: texts, metadata: ids, embedder, threads, embedder_name, on_embed } Self {
inputs: texts,
metadata: ids,
embedder,
threads,
embedder_name,
embedder_stats,
on_embed,
}
} }
pub fn request_embedding( pub fn request_embedding(
@ -114,7 +129,12 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> {
if self.inputs.is_empty() { if self.inputs.is_empty() {
return Ok(()); return Ok(());
} }
let res = match I::embed_ref(self.inputs.as_slice(), self.embedder, self.threads) { let res = match I::embed_ref(
self.inputs.as_slice(),
self.embedder,
self.threads,
self.embedder_stats,
) {
Ok(embeddings) => { Ok(embeddings) => {
for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) { for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) {
self.on_embed.process_embedding_response(EmbeddingResponse { self.on_embed.process_embedding_response(EmbeddingResponse {