mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 04:17:10 +02:00
Implement in new document indexer
This commit is contained in:
parent
22d363c05a
commit
f8232976ed
10 changed files with 886 additions and 391 deletions
|
@ -138,6 +138,7 @@ pub enum ReceiverAction {
|
|||
WakeUp,
|
||||
LargeEntry(LargeEntry),
|
||||
LargeVectors(LargeVectors),
|
||||
LargeVector(LargeVector),
|
||||
}
|
||||
|
||||
/// An entry that cannot fit in the BBQueue buffers has been
|
||||
|
@ -174,6 +175,24 @@ impl LargeVectors {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct LargeVector {
|
||||
/// The document id associated to the large embedding.
|
||||
pub docid: DocumentId,
|
||||
/// The embedder id in which to insert the large embedding.
|
||||
pub embedder_id: u8,
|
||||
/// The extractor id in which to insert the large embedding.
|
||||
pub extractor_id: u8,
|
||||
/// The large embedding that must be written.
|
||||
pub embedding: Mmap,
|
||||
}
|
||||
|
||||
impl LargeVector {
|
||||
pub fn read_embedding(&self, dimensions: usize) -> &[f32] {
|
||||
self.embedding.chunks_exact(dimensions).map(bytemuck::cast_slice).next().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> WriterBbqueueReceiver<'a> {
|
||||
/// Tries to receive an action to do until the timeout occurs
|
||||
/// and if it does, consider it as a spurious wake up.
|
||||
|
@ -238,6 +257,7 @@ pub enum EntryHeader {
|
|||
DbOperation(DbOperation),
|
||||
ArroyDeleteVector(ArroyDeleteVector),
|
||||
ArroySetVectors(ArroySetVectors),
|
||||
ArroySetVector(ArroySetVector),
|
||||
}
|
||||
|
||||
impl EntryHeader {
|
||||
|
@ -250,6 +270,7 @@ impl EntryHeader {
|
|||
EntryHeader::DbOperation(_) => 0,
|
||||
EntryHeader::ArroyDeleteVector(_) => 1,
|
||||
EntryHeader::ArroySetVectors(_) => 2,
|
||||
EntryHeader::ArroySetVector(_) => 3,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -274,11 +295,17 @@ impl EntryHeader {
|
|||
Self::variant_size() + mem::size_of::<ArroySetVectors>() + embedding_size * count
|
||||
}
|
||||
|
||||
fn total_set_vector_size(dimensions: usize) -> usize {
|
||||
let embedding_size = dimensions * mem::size_of::<f32>();
|
||||
Self::variant_size() + mem::size_of::<ArroySetVector>() + embedding_size
|
||||
}
|
||||
|
||||
fn header_size(&self) -> usize {
|
||||
let payload_size = match self {
|
||||
EntryHeader::DbOperation(op) => mem::size_of_val(op),
|
||||
EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv),
|
||||
EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs),
|
||||
EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv),
|
||||
};
|
||||
Self::variant_size() + payload_size
|
||||
}
|
||||
|
@ -301,6 +328,11 @@ impl EntryHeader {
|
|||
let header = checked::pod_read_unaligned(header_bytes);
|
||||
EntryHeader::ArroySetVectors(header)
|
||||
}
|
||||
3 => {
|
||||
let header_bytes = &remaining[..mem::size_of::<ArroySetVector>()];
|
||||
let header = checked::pod_read_unaligned(header_bytes);
|
||||
EntryHeader::ArroySetVector(header)
|
||||
}
|
||||
id => panic!("invalid variant id: {id}"),
|
||||
}
|
||||
}
|
||||
|
@ -311,6 +343,7 @@ impl EntryHeader {
|
|||
EntryHeader::DbOperation(op) => bytemuck::bytes_of(op),
|
||||
EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv),
|
||||
EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs),
|
||||
EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv),
|
||||
};
|
||||
*first = self.variant_id();
|
||||
remaining.copy_from_slice(payload_bytes);
|
||||
|
@ -379,6 +412,37 @@ impl ArroySetVectors {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
|
||||
#[repr(C)]
|
||||
/// The embeddings are in the remaining space and represents
|
||||
/// non-aligned [f32] each with dimensions f32s.
|
||||
pub struct ArroySetVector {
|
||||
pub docid: DocumentId,
|
||||
pub embedder_id: u8,
|
||||
pub extractor_id: u8,
|
||||
_padding: [u8; 2],
|
||||
}
|
||||
|
||||
impl ArroySetVector {
|
||||
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
|
||||
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
|
||||
&frame[skip..]
|
||||
}
|
||||
|
||||
/// Read the embedding and write it into an aligned `f32` Vec.
|
||||
pub fn read_all_embeddings_into_vec<'v>(
|
||||
&self,
|
||||
frame: &FrameGrantR<'_>,
|
||||
vec: &'v mut Vec<f32>,
|
||||
) -> &'v [f32] {
|
||||
let embeddings_bytes = Self::embeddings_bytes(frame);
|
||||
let embeddings_count = embeddings_bytes.len() / mem::size_of::<f32>();
|
||||
vec.resize(embeddings_count, 0.0);
|
||||
bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes);
|
||||
&vec[..]
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
|
||||
#[repr(u16)]
|
||||
pub enum Database {
|
||||
|
@ -398,6 +462,7 @@ pub enum Database {
|
|||
FacetIdStringDocids,
|
||||
FieldIdDocidFacetStrings,
|
||||
FieldIdDocidFacetF64s,
|
||||
VectorEmbedderCategoryId,
|
||||
}
|
||||
|
||||
impl Database {
|
||||
|
@ -419,6 +484,7 @@ impl Database {
|
|||
Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
|
||||
Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(),
|
||||
Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(),
|
||||
Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -440,6 +506,7 @@ impl Database {
|
|||
Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS,
|
||||
Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS,
|
||||
Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S,
|
||||
Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -568,6 +635,82 @@ impl<'b> ExtractorBbqueueSender<'b> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn set_vector_for_extractor(
|
||||
&self,
|
||||
docid: u32,
|
||||
embedder_id: u8,
|
||||
extractor_id: u8,
|
||||
embedding: Option<Embedding>,
|
||||
) -> crate::Result<()> {
|
||||
let max_grant = self.max_grant;
|
||||
let refcell = self.producers.get().unwrap();
|
||||
let mut producer = refcell.0.borrow_mut_or_yield();
|
||||
|
||||
// If there are no vectors we specify the dimensions
|
||||
// to zero to allocate no extra space at all
|
||||
let dimensions = embedding.as_ref().map_or(0, |emb| emb.len());
|
||||
|
||||
let arroy_set_vector =
|
||||
ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
|
||||
let payload_header = EntryHeader::ArroySetVector(arroy_set_vector);
|
||||
let total_length = EntryHeader::total_set_vector_size(dimensions);
|
||||
if total_length > max_grant {
|
||||
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
|
||||
let embedding = embedding.expect("set_vector without a vector does not fit in RAM");
|
||||
|
||||
let mut embedding_bytes = bytemuck::cast_slice(&embedding);
|
||||
io::copy(&mut embedding_bytes, &mut value_file)?;
|
||||
|
||||
let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?;
|
||||
let embedding = unsafe { Mmap::map(&value_file)? };
|
||||
|
||||
let large_vectors = LargeVector { docid, embedder_id, extractor_id, embedding };
|
||||
self.sender.send(ReceiverAction::LargeVector(large_vectors)).unwrap();
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Spin loop to have a frame the size we requested.
|
||||
reserve_and_write_grant(
|
||||
&mut producer,
|
||||
total_length,
|
||||
&self.sender,
|
||||
&self.sent_messages_attempts,
|
||||
&self.blocking_sent_messages_attempts,
|
||||
|grant| {
|
||||
let header_size = payload_header.header_size();
|
||||
let (header_bytes, remaining) = grant.split_at_mut(header_size);
|
||||
payload_header.serialize_into(header_bytes);
|
||||
|
||||
if dimensions != 0 {
|
||||
let output_iter =
|
||||
remaining.chunks_exact_mut(dimensions * mem::size_of::<f32>());
|
||||
|
||||
for (embedding, output) in embedding.iter().zip(output_iter) {
|
||||
output.copy_from_slice(bytemuck::cast_slice(embedding));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn embedding_status(
|
||||
&self,
|
||||
name: &str,
|
||||
infos: crate::vector::db::EmbedderInfo,
|
||||
) -> crate::Result<()> {
|
||||
let bytes = infos.to_bytes().map_err(|_| {
|
||||
InternalError::Serialization(crate::SerializationError::Encoding {
|
||||
db_name: Some(Database::VectorEmbedderCategoryId.database_name()),
|
||||
})
|
||||
})?;
|
||||
self.write_key_value(Database::VectorEmbedderCategoryId, name.as_bytes(), &bytes)
|
||||
}
|
||||
|
||||
fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> {
|
||||
let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| {
|
||||
InternalError::StorePut {
|
||||
|
@ -942,9 +1085,18 @@ impl EmbeddingSender<'_, '_> {
|
|||
&self,
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embedding: Embedding,
|
||||
extractor_id: u8,
|
||||
embedding: Option<Embedding>,
|
||||
) -> crate::Result<()> {
|
||||
self.0.set_vectors(docid, embedder_id, &[embedding])
|
||||
self.0.set_vector_for_extractor(docid, embedder_id, extractor_id, embedding)
|
||||
}
|
||||
|
||||
pub(crate) fn embedding_status(
|
||||
&self,
|
||||
name: &str,
|
||||
infos: crate::vector::db::EmbedderInfo,
|
||||
) -> crate::Result<()> {
|
||||
self.0.embedding_status(name, infos)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@ use super::vector_document::VectorDocument;
|
|||
use super::{KvReaderFieldId, KvWriterFieldId};
|
||||
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::update::del_add::KvReaderDelAdd;
|
||||
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
|
||||
use crate::update::new::vector_document::VectorDocumentFromDb;
|
||||
use crate::vector::settings::EmbedderAction;
|
||||
|
@ -469,6 +470,110 @@ impl<'doc> Versions<'doc> {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct KvDelAddDocument<'a, Mapper: FieldIdMapper> {
|
||||
document: &'a obkv::KvReaderU16,
|
||||
side: crate::update::del_add::DelAdd,
|
||||
fields_ids_map: &'a Mapper,
|
||||
}
|
||||
|
||||
impl<'a, Mapper: FieldIdMapper> KvDelAddDocument<'a, Mapper> {
|
||||
pub fn new(
|
||||
document: &'a obkv::KvReaderU16,
|
||||
side: crate::update::del_add::DelAdd,
|
||||
fields_ids_map: &'a Mapper,
|
||||
) -> Self {
|
||||
Self { document, side, fields_ids_map }
|
||||
}
|
||||
|
||||
fn get(&self, k: &str) -> Result<Option<&'a RawValue>> {
|
||||
let Some(id) = self.fields_ids_map.id(k) else { return Ok(None) };
|
||||
let Some(value) = self.document.get(id) else { return Ok(None) };
|
||||
let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else { return Ok(None) };
|
||||
|
||||
let value = serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;
|
||||
|
||||
Ok(Some(value))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, Mapper: FieldIdMapper> Document<'a> for KvDelAddDocument<'a, Mapper> {
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'a str, &'a RawValue)>> {
|
||||
let mut it = self.document.iter();
|
||||
|
||||
std::iter::from_fn(move || loop {
|
||||
let (fid, value) = it.next()?;
|
||||
let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else {
|
||||
continue;
|
||||
};
|
||||
let name = match self.fields_ids_map.name(fid).ok_or(
|
||||
InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
|
||||
field_id: fid,
|
||||
process: "getting current document",
|
||||
}),
|
||||
) {
|
||||
Ok(name) => name,
|
||||
Err(error) => return Some(Err(error.into())),
|
||||
};
|
||||
|
||||
if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME {
|
||||
continue;
|
||||
}
|
||||
|
||||
let res = (|| {
|
||||
let value =
|
||||
serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;
|
||||
|
||||
Ok((name, value))
|
||||
})();
|
||||
|
||||
return Some(res);
|
||||
})
|
||||
}
|
||||
|
||||
fn top_level_fields_count(&self) -> usize {
|
||||
let mut it = self.document.iter();
|
||||
|
||||
std::iter::from_fn(move || loop {
|
||||
let (fid, value) = it.next()?;
|
||||
let Some(_) = KvReaderDelAdd::from_slice(value).get(self.side) else {
|
||||
continue;
|
||||
};
|
||||
let name = match self.fields_ids_map.name(fid).ok_or(
|
||||
InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
|
||||
field_id: fid,
|
||||
process: "getting current document",
|
||||
}),
|
||||
) {
|
||||
Ok(name) => name,
|
||||
Err(_) => return Some(()),
|
||||
};
|
||||
|
||||
if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME {
|
||||
continue;
|
||||
}
|
||||
|
||||
return Some(());
|
||||
})
|
||||
.count()
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'a RawValue>> {
|
||||
if k == RESERVED_VECTORS_FIELD_NAME || k == RESERVED_GEO_FIELD_NAME {
|
||||
return Ok(None);
|
||||
}
|
||||
self.get(k)
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'a RawValue>> {
|
||||
self.get(RESERVED_VECTORS_FIELD_NAME)
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'a RawValue>> {
|
||||
self.get(RESERVED_GEO_FIELD_NAME)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentIdentifiers<'doc> {
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
|
|
|
@ -11,7 +11,7 @@ use super::vector_document::{
|
|||
use crate::attribute_patterns::PatternMatch;
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::update::new::document::DocumentIdentifiers;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::vector::RuntimeEmbedders;
|
||||
use crate::{DocumentId, Index, InternalError, Result};
|
||||
|
||||
pub enum DocumentChange<'doc> {
|
||||
|
@ -70,7 +70,7 @@ impl<'doc> Insertion<'doc> {
|
|||
pub fn inserted_vectors(
|
||||
&self,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
embedders: &'doc RuntimeEmbedders,
|
||||
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
|
||||
VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
|
||||
}
|
||||
|
@ -241,7 +241,7 @@ impl<'doc> Update<'doc> {
|
|||
pub fn only_changed_vectors(
|
||||
&self,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
embedders: &'doc RuntimeEmbedders,
|
||||
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
|
||||
VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
|
||||
}
|
||||
|
@ -252,7 +252,7 @@ impl<'doc> Update<'doc> {
|
|||
index: &'doc Index,
|
||||
mapper: &'doc Mapper,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
embedders: &'doc RuntimeEmbedders,
|
||||
) -> Result<Option<MergedVectorDocument<'doc>>> {
|
||||
if self.from_scratch {
|
||||
MergedVectorDocument::without_db(
|
||||
|
|
|
@ -7,8 +7,7 @@ use hashbrown::HashMap;
|
|||
use super::DelAddRoaringBitmap;
|
||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
||||
use crate::update::new::channel::{DocumentsSender, ExtractorBbqueueSender};
|
||||
use crate::update::new::document::{write_to_obkv, Document};
|
||||
use crate::update::new::document::{DocumentContext, DocumentIdentifiers};
|
||||
use crate::update::new::document::{write_to_obkv, Document, DocumentContext, DocumentIdentifiers};
|
||||
use crate::update::new::indexer::document_changes::{Extractor, IndexingContext};
|
||||
use crate::update::new::indexer::settings_changes::{
|
||||
settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor,
|
||||
|
@ -19,16 +18,16 @@ use crate::update::new::vector_document::VectorDocument;
|
|||
use crate::update::new::DocumentChange;
|
||||
use crate::update::settings::SettingsDelta;
|
||||
use crate::vector::settings::EmbedderAction;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::vector::RuntimeEmbedders;
|
||||
use crate::Result;
|
||||
|
||||
pub struct DocumentsExtractor<'a, 'b> {
|
||||
document_sender: DocumentsSender<'a, 'b>,
|
||||
embedders: &'a EmbeddingConfigs,
|
||||
embedders: &'a RuntimeEmbedders,
|
||||
}
|
||||
|
||||
impl<'a, 'b> DocumentsExtractor<'a, 'b> {
|
||||
pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self {
|
||||
pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a RuntimeEmbedders) -> Self {
|
||||
Self { document_sender, embedders }
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -13,21 +13,17 @@ use super::super::thread_local::{FullySend, ThreadLocal};
|
|||
use super::super::FacetFieldIdsDelta;
|
||||
use super::document_changes::{extract, DocumentChanges, IndexingContext};
|
||||
use super::settings_changes::settings_change_extract;
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::documents::PrimaryKey;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::progress::EmbedderStats;
|
||||
use crate::progress::MergingWordCache;
|
||||
use crate::documents::{FieldIdMapper, PrimaryKey};
|
||||
use crate::progress::{EmbedderStats, MergingWordCache};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::new::extract::EmbeddingExtractor;
|
||||
use crate::update::new::indexer::settings_changes::DocumentsIndentifiers;
|
||||
use crate::update::new::merger::merge_and_send_rtree;
|
||||
use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases};
|
||||
use crate::update::settings::SettingsDelta;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::Index;
|
||||
use crate::InternalError;
|
||||
use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
use crate::vector::db::IndexEmbeddingConfig;
|
||||
use crate::vector::RuntimeEmbedders;
|
||||
use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
|
||||
|
@ -35,7 +31,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
|
|||
indexing_context: IndexingContext<MSP>,
|
||||
indexer_span: Span,
|
||||
extractor_sender: ExtractorBbqueueSender,
|
||||
embedders: &EmbeddingConfigs,
|
||||
embedders: &RuntimeEmbedders,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_extraction: &AtomicBool,
|
||||
field_distribution: &mut BTreeMap<String, u64>,
|
||||
|
@ -275,14 +271,19 @@ where
|
|||
let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors");
|
||||
let _entered = span.enter();
|
||||
|
||||
let embedder_configs = index.embedding_configs();
|
||||
for config in &mut index_embeddings {
|
||||
let mut infos = embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap();
|
||||
|
||||
'data: for data in datastore.iter_mut() {
|
||||
let data = &mut data.get_mut().0;
|
||||
let Some(deladd) = data.remove(&config.name) else {
|
||||
let Some(delta) = data.remove(&config.name) else {
|
||||
continue 'data;
|
||||
};
|
||||
deladd.apply_to(&mut config.user_provided, modified_docids);
|
||||
delta.apply_to(&mut infos.embedding_status);
|
||||
}
|
||||
|
||||
extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress};
|
|||
use crate::update::settings::SettingsDelta;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
|
||||
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs};
|
||||
use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders};
|
||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
|
||||
|
||||
pub(crate) mod de;
|
||||
|
@ -54,7 +54,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>(
|
|||
new_fields_ids_map: FieldsIdsMap,
|
||||
new_primary_key: Option<PrimaryKey<'pl>>,
|
||||
document_changes: &DC,
|
||||
embedders: EmbeddingConfigs,
|
||||
embedders: RuntimeEmbedders,
|
||||
must_stop_processing: &'indexer MSP,
|
||||
progress: &'indexer Progress,
|
||||
embedder_stats: &'indexer EmbedderStats,
|
||||
|
@ -93,7 +93,7 @@ where
|
|||
grenad_parameters: &grenad_parameters,
|
||||
};
|
||||
|
||||
let index_embeddings = index.embedding_configs(wtxn)?;
|
||||
let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?;
|
||||
let mut field_distribution = index.field_distribution(wtxn)?;
|
||||
let mut document_ids = index.documents_ids(wtxn)?;
|
||||
let mut modified_docids = roaring::RoaringBitmap::new();
|
||||
|
@ -133,20 +133,21 @@ where
|
|||
let arroy_writers: Result<HashMap<_, _>> = embedders
|
||||
.inner_as_ref()
|
||||
.iter()
|
||||
.map(|(embedder_name, (embedder, _, was_quantized))| {
|
||||
let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry {
|
||||
.map(|(embedder_name, runtime)| {
|
||||
let embedder_index = index
|
||||
.embedding_configs()
|
||||
.embedder_id(wtxn, embedder_name)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: "embedder_category_id",
|
||||
key: None,
|
||||
},
|
||||
)?;
|
||||
})?;
|
||||
|
||||
let dimensions = embedder.dimensions();
|
||||
let writer = ArroyWrapper::new(vector_arroy, embedder_index, *was_quantized);
|
||||
let dimensions = runtime.embedder.dimensions();
|
||||
let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);
|
||||
|
||||
Ok((
|
||||
embedder_index,
|
||||
(embedder_name.as_str(), embedder.as_ref(), writer, dimensions),
|
||||
(embedder_name.as_str(), &*runtime.embedder, writer, dimensions),
|
||||
))
|
||||
})
|
||||
.collect();
|
||||
|
|
|
@ -11,11 +11,11 @@ use super::super::channel::*;
|
|||
use crate::database_stats::DatabaseStats;
|
||||
use crate::documents::PrimaryKey;
|
||||
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::progress::Progress;
|
||||
use crate::update::settings::InnerIndexSettings;
|
||||
use crate::vector::db::IndexEmbeddingConfig;
|
||||
use crate::vector::settings::EmbedderAction;
|
||||
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings};
|
||||
use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders};
|
||||
use crate::{Error, Index, InternalError, Result, UserError};
|
||||
|
||||
pub fn write_to_db(
|
||||
|
@ -64,6 +64,14 @@ pub fn write_to_db(
|
|||
writer.del_items(wtxn, *dimensions, docid)?;
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
}
|
||||
ReceiverAction::LargeVector(
|
||||
large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
|
||||
) => {
|
||||
let (_, _, writer, dimensions) =
|
||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
let embedding = large_vector.read_embedding(*dimensions);
|
||||
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Every time the is a message in the channel we search
|
||||
|
@ -137,7 +145,7 @@ where
|
|||
)?;
|
||||
}
|
||||
|
||||
index.put_embedding_configs(wtxn, index_embeddings)?;
|
||||
index.embedding_configs().put_embedding_configs(wtxn, index_embeddings)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -147,7 +155,7 @@ pub(super) fn update_index(
|
|||
wtxn: &mut RwTxn<'_>,
|
||||
new_fields_ids_map: FieldIdMapWithMetadata,
|
||||
new_primary_key: Option<PrimaryKey<'_>>,
|
||||
embedders: EmbeddingConfigs,
|
||||
embedders: RuntimeEmbedders,
|
||||
field_distribution: std::collections::BTreeMap<String, u64>,
|
||||
document_ids: roaring::RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
|
@ -226,14 +234,36 @@ pub fn write_from_bbqueue(
|
|||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
let mut embeddings = Embeddings::new(*dimensions);
|
||||
let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
|
||||
if embeddings.append(all_embeddings.to_vec()).is_err() {
|
||||
return Err(Error::UserError(UserError::InvalidVectorDimensions {
|
||||
expected: *dimensions,
|
||||
found: all_embeddings.len(),
|
||||
}));
|
||||
}
|
||||
writer.del_items(wtxn, *dimensions, docid)?;
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
if !all_embeddings.is_empty() {
|
||||
if embeddings.append(all_embeddings.to_vec()).is_err() {
|
||||
return Err(Error::UserError(UserError::InvalidVectorDimensions {
|
||||
expected: *dimensions,
|
||||
found: all_embeddings.len(),
|
||||
}));
|
||||
}
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
}
|
||||
}
|
||||
EntryHeader::ArroySetVector(
|
||||
asv @ ArroySetVector { docid, embedder_id, extractor_id, .. },
|
||||
) => {
|
||||
let frame = frame_with_header.frame();
|
||||
let (_, _, writer, dimensions) =
|
||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);
|
||||
|
||||
if embedding.is_empty() {
|
||||
writer.del_item_in_store(wtxn, docid, extractor_id, *dimensions)?;
|
||||
} else {
|
||||
if embedding.len() != *dimensions {
|
||||
return Err(Error::UserError(UserError::InvalidVectorDimensions {
|
||||
expected: *dimensions,
|
||||
found: embedding.len(),
|
||||
}));
|
||||
}
|
||||
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,9 +12,9 @@ use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions};
|
|||
use super::indexer::de::DeserrRawValue;
|
||||
use crate::constants::RESERVED_VECTORS_FIELD_NAME;
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig};
|
||||
use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors};
|
||||
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs};
|
||||
use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders};
|
||||
use crate::{DocumentId, Index, InternalError, Result, UserError};
|
||||
|
||||
#[derive(Serialize)]
|
||||
|
@ -109,7 +109,7 @@ impl<'t> VectorDocumentFromDb<'t> {
|
|||
None => None,
|
||||
};
|
||||
|
||||
let embedding_config = index.embedding_configs(rtxn)?;
|
||||
let embedding_config = index.embedding_configs().embedding_configs(rtxn)?;
|
||||
|
||||
Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc }))
|
||||
}
|
||||
|
@ -118,6 +118,7 @@ impl<'t> VectorDocumentFromDb<'t> {
|
|||
&self,
|
||||
embedder_id: u8,
|
||||
config: &IndexEmbeddingConfig,
|
||||
status: &EmbeddingStatus,
|
||||
) -> Result<VectorEntry<'t>> {
|
||||
let reader =
|
||||
ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized());
|
||||
|
@ -126,7 +127,7 @@ impl<'t> VectorDocumentFromDb<'t> {
|
|||
Ok(VectorEntry {
|
||||
has_configured_embedder: true,
|
||||
embeddings: Some(Embeddings::FromDb(vectors)),
|
||||
regenerate: !config.user_provided.contains(self.docid),
|
||||
regenerate: status.must_regenerate(self.docid),
|
||||
implicit: false,
|
||||
})
|
||||
}
|
||||
|
@ -137,9 +138,9 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
|
|||
self.embedding_config
|
||||
.iter()
|
||||
.map(|config| {
|
||||
let embedder_id =
|
||||
self.index.embedder_category_id.get(self.rtxn, &config.name)?.unwrap();
|
||||
let entry = self.entry_from_db(embedder_id, config)?;
|
||||
let info =
|
||||
self.index.embedding_configs().embedder_info(self.rtxn, &config.name)?.unwrap();
|
||||
let entry = self.entry_from_db(info.embedder_id, config, &info.embedding_status)?;
|
||||
let config_name = self.doc_alloc.alloc_str(config.name.as_str());
|
||||
Ok((&*config_name, entry))
|
||||
})
|
||||
|
@ -156,11 +157,11 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
|
|||
}
|
||||
|
||||
fn vectors_for_key(&self, key: &str) -> Result<Option<VectorEntry<'t>>> {
|
||||
Ok(match self.index.embedder_category_id.get(self.rtxn, key)? {
|
||||
Some(embedder_id) => {
|
||||
Ok(match self.index.embedding_configs().embedder_info(self.rtxn, key)? {
|
||||
Some(info) => {
|
||||
let config =
|
||||
self.embedding_config.iter().find(|config| config.name == key).unwrap();
|
||||
Some(self.entry_from_db(embedder_id, config)?)
|
||||
Some(self.entry_from_db(info.embedder_id, config, &info.embedding_status)?)
|
||||
}
|
||||
None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) {
|
||||
Some(embedding_from_doc) => {
|
||||
|
@ -222,7 +223,7 @@ fn entry_from_raw_value(
|
|||
pub struct VectorDocumentFromVersions<'doc> {
|
||||
external_document_id: &'doc str,
|
||||
vectors: RawMap<'doc, FxBuildHasher>,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
embedders: &'doc RuntimeEmbedders,
|
||||
}
|
||||
|
||||
impl<'doc> VectorDocumentFromVersions<'doc> {
|
||||
|
@ -230,7 +231,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
|
|||
external_document_id: &'doc str,
|
||||
versions: &Versions<'doc>,
|
||||
bump: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
embedders: &'doc RuntimeEmbedders,
|
||||
) -> Result<Option<Self>> {
|
||||
let document = DocumentFromVersions::new(versions);
|
||||
if let Some(vectors_field) = document.vectors_field()? {
|
||||
|
@ -283,7 +284,7 @@ impl<'doc> MergedVectorDocument<'doc> {
|
|||
db_fields_ids_map: &'doc Mapper,
|
||||
versions: &Versions<'doc>,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
embedders: &'doc RuntimeEmbedders,
|
||||
) -> Result<Option<Self>> {
|
||||
let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?;
|
||||
let new_doc =
|
||||
|
@ -295,7 +296,7 @@ impl<'doc> MergedVectorDocument<'doc> {
|
|||
external_document_id: &'doc str,
|
||||
versions: &Versions<'doc>,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
embedders: &'doc RuntimeEmbedders,
|
||||
) -> Result<Option<Self>> {
|
||||
let Some(new_doc) =
|
||||
VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)?
|
||||
|
|
|
@ -3,6 +3,7 @@ use bumpalo::Bump;
|
|||
use serde_json::Value;
|
||||
|
||||
use super::{EmbedError, Embedder, Embedding};
|
||||
use crate::progress::EmbedderStats;
|
||||
use crate::{DocumentId, Result, ThreadPoolNoAbort};
|
||||
type ExtractorId = u8;
|
||||
|
||||
|
@ -43,6 +44,8 @@ pub struct EmbedSession<'doc, C, I> {
|
|||
|
||||
embedder_name: &'doc str,
|
||||
|
||||
embedder_stats: &'doc EmbedderStats,
|
||||
|
||||
on_embed: C,
|
||||
}
|
||||
|
||||
|
@ -51,6 +54,7 @@ pub trait Input: Sized {
|
|||
inputs: &[Self],
|
||||
embedder: &Embedder,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> std::result::Result<Vec<Embedding>, EmbedError>;
|
||||
}
|
||||
|
||||
|
@ -59,8 +63,9 @@ impl Input for &'_ str {
|
|||
inputs: &[Self],
|
||||
embedder: &Embedder,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> std::result::Result<Vec<Embedding>, EmbedError> {
|
||||
embedder.embed_index_ref(inputs, threads)
|
||||
embedder.embed_index_ref(inputs, threads, embedder_stats)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -69,8 +74,9 @@ impl Input for Value {
|
|||
inputs: &[Value],
|
||||
embedder: &Embedder,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> std::result::Result<Vec<Embedding>, EmbedError> {
|
||||
embedder.embed_index_ref_fragments(inputs, threads)
|
||||
embedder.embed_index_ref_fragments(inputs, threads, embedder_stats)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -81,12 +87,21 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> {
|
|||
embedder_name: &'doc str,
|
||||
threads: &'doc ThreadPoolNoAbort,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedder_stats: &'doc EmbedderStats,
|
||||
on_embed: C,
|
||||
) -> Self {
|
||||
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
|
||||
let texts = BVec::with_capacity_in(capacity, doc_alloc);
|
||||
let ids = BVec::with_capacity_in(capacity, doc_alloc);
|
||||
Self { inputs: texts, metadata: ids, embedder, threads, embedder_name, on_embed }
|
||||
Self {
|
||||
inputs: texts,
|
||||
metadata: ids,
|
||||
embedder,
|
||||
threads,
|
||||
embedder_name,
|
||||
embedder_stats,
|
||||
on_embed,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn request_embedding(
|
||||
|
@ -114,7 +129,12 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> {
|
|||
if self.inputs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
let res = match I::embed_ref(self.inputs.as_slice(), self.embedder, self.threads) {
|
||||
let res = match I::embed_ref(
|
||||
self.inputs.as_slice(),
|
||||
self.embedder,
|
||||
self.threads,
|
||||
self.embedder_stats,
|
||||
) {
|
||||
Ok(embeddings) => {
|
||||
for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) {
|
||||
self.on_embed.process_embedding_response(EmbeddingResponse {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue