438 lines
14 KiB
Rust
Raw Normal View History

use std::marker::PhantomData;
use std::sync::atomic::{AtomicUsize, Ordering};
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
2024-09-03 11:02:39 +02:00
use grenad::Merger;
2024-10-29 17:43:36 +01:00
use hashbrown::HashMap;
2024-08-29 15:07:59 +02:00
use heed::types::Bytes;
2024-10-09 14:39:27 +02:00
use roaring::RoaringBitmap;
2024-08-29 15:07:59 +02:00
2024-09-16 09:34:10 +02:00
use super::extract::FacetKind;
2024-08-29 15:07:59 +02:00
use super::StdResult;
use crate::index::main_key::DOCUMENTS_IDS_KEY;
use crate::update::new::KvReaderFieldId;
2024-09-03 11:02:39 +02:00
use crate::update::MergeDeladdCboRoaringBitmaps;
2024-10-28 14:23:02 +01:00
use crate::vector::Embedding;
2024-09-04 14:30:09 +02:00
use crate::{DocumentId, Index};
2024-08-29 15:07:59 +02:00
/// The capacity of the channel is currently in number of messages.
pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) {
2024-08-29 15:07:59 +02:00
let (sender, receiver) = crossbeam_channel::bounded(cap);
(
ExtractorSender {
sender,
send_count: Default::default(),
writer_contentious_count: Default::default(),
extractor_contentious_count: Default::default(),
},
WriterReceiver(receiver),
)
2024-08-29 15:07:59 +02:00
}
/// A key and its value stored back to back in a single boxed buffer.
pub struct KeyValueEntry {
    /// Number of leading bytes of `data` that belong to the key.
    pub key_length: usize,
    /// The key bytes immediately followed by the value bytes.
    pub data: Box<[u8]>,
}
impl KeyValueEntry {
2024-09-04 14:30:09 +02:00
pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self {
let mut data = Vec::with_capacity(key.len() + value.len());
data.extend_from_slice(key);
data.extend_from_slice(value);
KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
2024-09-04 14:30:09 +02:00
}
2024-10-09 14:39:27 +02:00
pub fn from_small_key_bitmap(key: &[u8], bitmap: RoaringBitmap) -> Self {
let mut data = Vec::with_capacity(key.len() + bitmap.serialized_size());
data.extend_from_slice(key);
bitmap.serialize_into(&mut data).unwrap();
KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
}
pub fn key(&self) -> &[u8] {
&self.data[..self.key_length]
}
pub fn value(&self) -> &[u8] {
&self.data[self.key_length..]
2024-08-29 15:07:59 +02:00
}
}
/// An owned copy of a key, stored as a boxed byte slice.
pub struct KeyEntry {
    data: Box<[u8]>,
}

impl KeyEntry {
    /// Copies `key` into a freshly allocated entry.
    pub fn from_key(key: &[u8]) -> Self {
        Self { data: Box::from(key) }
    }

    /// Returns the stored key bytes.
    pub fn entry(&self) -> &[u8] {
        &self.data
    }
}
/// An operation on a single entry: either a deletion carrying a [`KeyEntry`]
/// or a write carrying a [`KeyValueEntry`].
pub enum EntryOperation {
/// Delete operation; the payload holds the key.
Delete(KeyEntry),
/// Write operation; the payload holds the key and the value.
Write(KeyValueEntry),
}
2024-10-29 17:43:36 +01:00
/// The message type carried by the channel wrapped by `ExtractorSender` and
/// `WriterReceiver`.
pub enum WriterOperation {
/// An operation targeting one of the `Database` variants.
DbOperation(DbOperation),
/// An operation on vectors/embeddings (see `ArroyOperation`).
ArroyOperation(ArroyOperation),
}
/// Vector-related operations sent through the channel as
/// `WriterOperation::ArroyOperation`.
// NOTE(review): presumably consumed by the arroy vector store on the writer
// side; the consumer is not visible in this file — confirm.
pub enum ArroyOperation {
/// TODO: call when deleting regular documents
DeleteVectors {
docid: DocumentId,
},
/// Carries several embeddings for one document and one embedder.
SetVectors {
docid: DocumentId,
embedder_id: u8,
embeddings: Vec<Embedding>,
},
/// Carries a single embedding for one document and one embedder.
SetVector {
docid: DocumentId,
embedder_id: u8,
embedding: Embedding,
},
/// Final message, sent by `EmbeddingSender::finish`.
// NOTE(review): `user_provided` presumably maps an embedder name to the
// documents whose vectors were supplied by the user — confirm.
Finish {
user_provided: HashMap<String, RoaringBitmap>,
},
}
/// An `EntryOperation` paired with the `Database` it targets.
pub struct DbOperation {
// The database the operation targets.
database: Database,
// The delete or write to perform.
entry: EntryOperation,
}
/// Identifies one of the index databases an operation can target.
///
/// Each variant is resolved to the matching database of an `Index` by
/// `Database::database`.
#[derive(Debug)]
pub enum Database {
    /// `index.documents`
    Documents,
    /// `index.external_documents_ids`
    ExternalDocumentsIds,
    /// `index.exact_word_docids`
    ExactWordDocids,
    /// `index.field_id_word_count_docids`
    FidWordCountDocids,
    /// `index.main`
    Main,
    /// `index.word_docids`
    WordDocids,
    /// `index.word_fid_docids`
    WordFidDocids,
    /// `index.word_pair_proximity_docids`
    WordPairProximityDocids,
    /// `index.word_position_docids`
    WordPositionDocids,
    /// `index.facet_id_is_null_docids`
    FacetIdIsNullDocids,
    /// `index.facet_id_is_empty_docids`
    FacetIdIsEmptyDocids,
    /// `index.facet_id_exists_docids`
    FacetIdExistsDocids,
    /// `index.facet_id_f64_docids`
    FacetIdF64NumberDocids,
    /// `index.facet_id_string_docids`
    FacetIdStringDocids,
}
2024-09-16 09:34:10 +02:00
impl Database {
2024-08-29 15:07:59 +02:00
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
2024-09-16 09:34:10 +02:00
match self {
Database::Documents => index.documents.remap_types(),
Database::ExternalDocumentsIds => index.external_documents_ids.remap_types(),
2024-09-04 12:17:13 +02:00
Database::ExactWordDocids => index.exact_word_docids.remap_types(),
Database::Main => index.main.remap_types(),
Database::WordDocids => index.word_docids.remap_types(),
Database::WordFidDocids => index.word_fid_docids.remap_types(),
2024-09-04 12:17:13 +02:00
Database::WordPositionDocids => index.word_position_docids.remap_types(),
Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(),
Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(),
2024-09-16 09:34:10 +02:00
Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(),
Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(),
Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(),
Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(),
Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
2024-08-29 15:07:59 +02:00
}
}
2024-09-16 09:34:10 +02:00
}
impl From<FacetKind> for Database {
    /// Maps a facet kind to the facet database that stores it.
    fn from(kind: FacetKind) -> Self {
        match kind {
            FacetKind::Null => Self::FacetIdIsNullDocids,
            FacetKind::Empty => Self::FacetIdIsEmptyDocids,
            FacetKind::Exists => Self::FacetIdExistsDocids,
            FacetKind::Number => Self::FacetIdF64NumberDocids,
            FacetKind::String => Self::FacetIdStringDocids,
        }
    }
}
2024-10-29 17:43:36 +01:00
impl DbOperation {
2024-09-16 09:34:10 +02:00
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
self.database.database(index)
}
pub fn entry(self) -> EntryOperation {
self.entry
}
2024-08-29 15:07:59 +02:00
}
/// Receiving half of the channel: a thin wrapper around a
/// `Receiver<WriterOperation>` that is consumed through `IntoIterator`.
pub struct WriterReceiver(Receiver<WriterOperation>);
impl IntoIterator for WriterReceiver {
type Item = WriterOperation;
type IntoIter = IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
2024-08-29 15:07:59 +02:00
}
}
/// Sending half of the channel, wrapping a `Sender<WriterOperation>` together
/// with usage counters that are printed to stderr when the sender is dropped.
pub struct ExtractorSender {
// The underlying channel sender.
sender: Sender<WriterOperation>,
/// The total number of messages we sent in the channel.
send_count: AtomicUsize,
/// The number of times we sent something in a channel that was full.
writer_contentious_count: AtomicUsize,
/// The number of times we sent something in a channel that was empty.
extractor_contentious_count: AtomicUsize,
}
impl Drop for ExtractorSender {
fn drop(&mut self) {
let send_count = *self.send_count.get_mut();
let writer_contentious_count = *self.writer_contentious_count.get_mut();
let extractor_contentious_count = *self.extractor_contentious_count.get_mut();
2024-09-24 18:21:58 +02:00
eprintln!(
"Extractor channel stats: {send_count} sends, \
{writer_contentious_count} writer contentions ({}%), \
{extractor_contentious_count} extractor contentions ({}%)",
(writer_contentious_count as f32 / send_count as f32) * 100.0,
(extractor_contentious_count as f32 / send_count as f32) * 100.0
)
}
}
2024-08-29 15:07:59 +02:00
impl ExtractorSender {
2024-09-16 09:34:10 +02:00
pub fn docids<D: DatabaseType>(&self) -> WordDocidsSender<'_, D> {
WordDocidsSender { sender: self, _marker: PhantomData }
2024-09-16 09:34:10 +02:00
}
pub fn facet_docids(&self) -> FacetDocidsSender<'_> {
FacetDocidsSender { sender: self }
}
pub fn documents(&self) -> DocumentsSender<'_> {
DocumentsSender(self)
}
2024-10-09 14:39:27 +02:00
pub fn send_documents_ids(&self, documents_ids: RoaringBitmap) -> StdResult<(), SendError<()>> {
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_bitmap(
DOCUMENTS_IDS_KEY.as_bytes(),
2024-10-09 14:39:27 +02:00
documents_ids,
));
2024-10-29 17:43:36 +01:00
match self.send_db_operation(DbOperation { database: Database::Main, entry }) {
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
2024-10-29 17:43:36 +01:00
fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> {
if self.sender.is_full() {
self.writer_contentious_count.fetch_add(1, Ordering::SeqCst);
}
if self.sender.is_empty() {
self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst);
}
self.send_count.fetch_add(1, Ordering::SeqCst);
match self.sender.send(WriterOperation::DbOperation(op)) {
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
}
2024-09-04 12:17:13 +02:00
pub enum ExactWordDocids {}
pub enum FidWordCountDocids {}
pub enum WordDocids {}
pub enum WordFidDocids {}
pub enum WordPairProximityDocids {}
2024-09-04 12:17:13 +02:00
pub enum WordPositionDocids {}
pub trait DatabaseType {
2024-09-04 12:17:13 +02:00
const DATABASE: Database;
2024-09-16 09:34:10 +02:00
}
2024-09-04 12:17:13 +02:00
impl DatabaseType for ExactWordDocids {
const DATABASE: Database = Database::ExactWordDocids;
2024-09-16 09:34:10 +02:00
}
2024-09-04 12:17:13 +02:00
impl DatabaseType for FidWordCountDocids {
const DATABASE: Database = Database::FidWordCountDocids;
2024-09-16 09:34:10 +02:00
}
2024-09-04 12:17:13 +02:00
impl DatabaseType for WordDocids {
const DATABASE: Database = Database::WordDocids;
2024-09-16 09:34:10 +02:00
}
impl DatabaseType for WordFidDocids {
2024-09-04 12:17:13 +02:00
const DATABASE: Database = Database::WordFidDocids;
2024-09-16 09:34:10 +02:00
}
2024-09-04 12:17:13 +02:00
impl DatabaseType for WordPairProximityDocids {
const DATABASE: Database = Database::WordPairProximityDocids;
2024-09-16 09:34:10 +02:00
}
2024-09-04 12:17:13 +02:00
impl DatabaseType for WordPositionDocids {
const DATABASE: Database = Database::WordPositionDocids;
2024-09-16 09:34:10 +02:00
}
2024-09-04 12:17:13 +02:00
2024-09-16 09:34:10 +02:00
/// Common interface of the senders that write or delete entries by key.
pub trait DocidsSender {
/// Sends a write of `value` under `key`.
/// The implementations in this file return `SendError(())` when the channel send fails.
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>;
/// Sends a deletion of `key`.
/// The implementations in this file return `SendError(())` when the channel send fails.
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>;
}
/// A sender bound to the single database selected by the type parameter `D`
/// (through `DatabaseType::DATABASE`).
pub struct WordDocidsSender<'a, D> {
// The channel sender the operations are forwarded to.
sender: &'a ExtractorSender,
// Carries `D` at the type level only; no value of `D` is stored.
_marker: PhantomData<D>,
}
2024-09-16 09:34:10 +02:00
impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
2024-09-04 14:30:09 +02:00
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
2024-10-29 17:43:36 +01:00
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
2024-09-16 09:34:10 +02:00
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
2024-10-29 17:43:36 +01:00
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
}
2024-09-16 09:34:10 +02:00
pub struct FacetDocidsSender<'a> {
sender: &'a ExtractorSender,
2024-09-16 09:34:10 +02:00
}
impl DocidsSender for FacetDocidsSender<'_> {
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
let (facet_kind, key) = FacetKind::extract_from_key(key);
let database = Database::from(facet_kind);
// let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
let entry = match facet_kind {
// skip level group size
FacetKind::String | FacetKind::Number => {
// add facet group size
let value = [&[1], value].concat();
EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value))
}
_ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)),
};
2024-10-29 17:43:36 +01:00
match self.sender.send_db_operation(DbOperation { database, entry }) {
2024-09-16 09:34:10 +02:00
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
let (facet_kind, key) = FacetKind::extract_from_key(key);
let database = Database::from(facet_kind);
2024-09-16 09:34:10 +02:00
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
2024-10-29 17:43:36 +01:00
match self.sender.send_db_operation(DbOperation { database, entry }) {
2024-09-16 09:34:10 +02:00
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
}
/// A sender for document writes and deletions, borrowing the `ExtractorSender`
/// it forwards its operations to.
pub struct DocumentsSender<'a>(&'a ExtractorSender);
impl DocumentsSender<'_> {
/// TODO do that efficiently
pub fn uncompressed(
&self,
docid: DocumentId,
external_id: String,
document: &KvReaderFieldId,
) -> StdResult<(), SendError<()>> {
2024-09-04 14:30:09 +02:00
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
&docid.to_be_bytes(),
document.as_bytes(),
));
2024-10-29 17:43:36 +01:00
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}?;
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
external_id.as_bytes(),
&docid.to_be_bytes(),
));
2024-10-29 17:43:36 +01:00
match self
.0
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
{
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}
}
2024-08-29 15:07:59 +02:00
pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> {
let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes()));
2024-10-29 17:43:36 +01:00
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
2024-08-29 15:07:59 +02:00
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
}?;
let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes()));
2024-10-29 17:43:36 +01:00
match self
.0
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
{
Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())),
2024-08-29 15:07:59 +02:00
}
}
}
2024-10-29 17:43:36 +01:00
/// A sender for `ArroyOperation` messages. It borrows the raw
/// `Sender<WriterOperation>` directly rather than an `ExtractorSender`.
// NOTE(review): nothing in the visible part of this file constructs this
// type — confirm where it is created.
pub struct EmbeddingSender<'a>(&'a Sender<WriterOperation>);
2024-10-21 10:35:56 +02:00
impl EmbeddingSender<'_> {
2024-10-28 14:23:02 +01:00
pub fn set_vectors(
&self,
docid: DocumentId,
embedder_id: u8,
embeddings: Vec<Embedding>,
) -> StdResult<(), SendError<()>> {
2024-10-29 17:43:36 +01:00
self.0
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors {
docid,
embedder_id,
embeddings,
}))
.map_err(|_| SendError(()))
2024-10-28 14:23:02 +01:00
}
pub fn set_vector(
&self,
docid: DocumentId,
embedder_id: u8,
embedding: Embedding,
) -> StdResult<(), SendError<()>> {
2024-10-29 17:43:36 +01:00
self.0
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVector {
docid,
embedder_id,
embedding,
}))
.map_err(|_| SendError(()))
}
/// Marks all embedders as "to be built"
pub fn finish(
self,
user_provided: HashMap<String, RoaringBitmap>,
2024-10-21 10:35:56 +02:00
) -> StdResult<(), SendError<()>> {
2024-10-29 17:43:36 +01:00
self.0
.send(WriterOperation::ArroyOperation(ArroyOperation::Finish { user_provided }))
.map_err(|_| SendError(()))
2024-10-21 10:35:56 +02:00
}
}