mirror of
synced 2025-03-06 16:11:33 +01:00
completely file backed updates
This commit is contained in:
@ -62,7 +62,7 @@ impl Data {
let path = options.db_path.clone();
//let indexer_opts = options.indexer_options.clone();
let index_controller = ActorIndexController::new();
let index_controller = ActorIndexController::new(&path);
let index_controller = Arc::new(index_controller);
let mut api_keys = ApiKeys {
@ -3,9 +3,6 @@
use milli::update::{IndexDocumentsMethod, UpdateFormat};
//use tokio::io::AsyncWriteExt;
use actix_web::web::Payload;
use tokio::fs::File;
use tokio::io::{AsyncWriteExt, AsyncSeekExt};
use futures::prelude::stream::StreamExt;
use crate::index_controller::UpdateStatus;
use crate::index_controller::{Settings, IndexMetadata};
@ -17,18 +14,11 @@ impl Data {
index: impl AsRef<str> + Send + Sync + 'static,
method: IndexDocumentsMethod,
format: UpdateFormat,
mut stream: Payload,
stream: Payload,
primary_key: Option<String>,
) -> anyhow::Result<UpdateStatus>
let file = tempfile::tempfile_in(".")?;
let mut file = File::from_std(file);
while let Some(item) = stream.next().await {
let update_status = self.index_controller.add_documents(index.as_ref().to_string(), method, format, file, primary_key).await?;
let update_status = self.index_controller.add_documents(index.as_ref().to_string(), method, format, stream, primary_key).await?;
@ -4,25 +4,29 @@ mod uuid_resolver;
mod update_store;
mod update_handler;
use tokio::sync::oneshot;
use std::path::Path;
use tokio::sync::{mpsc, oneshot};
use super::IndexController;
use uuid::Uuid;
use super::IndexMetadata;
use tokio::fs::File;
use futures::stream::StreamExt;
use actix_web::web::Payload;
use super::UpdateMeta;
use crate::data::{SearchResult, SearchQuery};
use actix_web::web::Bytes;
pub struct ActorIndexController {
uuid_resolver: uuid_resolver::UuidResolverHandle,
index_handle: index_actor::IndexActorHandle,
update_handle: update_actor::UpdateActorHandle,
update_handle: update_actor::UpdateActorHandle<Bytes>,
impl ActorIndexController {
pub fn new() -> Self {
pub fn new(path: impl AsRef<Path>) -> Self {
let uuid_resolver = uuid_resolver::UuidResolverHandle::new();
let index_actor = index_actor::IndexActorHandle::new();
let update_handle = update_actor::UpdateActorHandle::new(index_actor.clone());
let update_handle = update_actor::UpdateActorHandle::new(index_actor.clone(), &path);
Self { uuid_resolver, index_handle: index_actor, update_handle }
@ -43,12 +47,22 @@ impl IndexController for ActorIndexController {
index: String,
method: milli::update::IndexDocumentsMethod,
format: milli::update::UpdateFormat,
data: File,
mut payload: Payload,
primary_key: Option<String>,
) -> anyhow::Result<super::UpdateStatus> {
let uuid = self.uuid_resolver.get_or_create(index).await?;
let meta = UpdateMeta::DocumentsAddition { method, format, primary_key };
let status = self.update_handle.update(meta, Some(data), uuid).await?;
let (sender, receiver) = mpsc::channel(10);
// It is necessary to spawn a local task to send the payload to the update handle, to
// prevent deadlocking between the update_handle::update that waits for the update to be
// registered and the update_actor that waits for the payload to be sent to it.
tokio::task::spawn_local(async move {
while let Some(bytes) = payload.next().await {
let status = self.update_handle.update(meta, receiver, uuid).await?;
@ -1,22 +1,24 @@
use super::index_actor::IndexActorHandle;
use uuid::Uuid;
use tokio::sync::{mpsc, oneshot};
use crate::index_controller::{UpdateMeta, UpdateStatus, UpdateResult};
use thiserror::Error;
use tokio::io::AsyncReadExt;
use log::info;
use tokio::fs::File;
use std::path::PathBuf;
use std::fs::create_dir_all;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use log::info;
use super::index_actor::IndexActorHandle;
use thiserror::Error;
use tokio::sync::{mpsc, oneshot};
use uuid::Uuid;
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
use crate::index_controller::{UpdateMeta, UpdateStatus, UpdateResult, updates::Pending};
pub type Result<T> = std::result::Result<T, UpdateError>;
type UpdateStore = super::update_store::UpdateStore<UpdateMeta, UpdateResult, String>;
#[derive(Debug, Error)]
pub enum UpdateError {}
enum UpdateMsg {
enum UpdateMsg<D> {
uuid: Uuid,
ret: oneshot::Sender<Result<()>>,
@ -24,20 +26,30 @@ enum UpdateMsg {
Update {
uuid: Uuid,
meta: UpdateMeta,
payload: Option<File>,
data: mpsc::Receiver<D>,
ret: oneshot::Sender<Result<UpdateStatus>>
struct UpdateActor {
struct UpdateActor<D> {
path: PathBuf,
store: Arc<UpdateStore>,
inbox: mpsc::Receiver<UpdateMsg>,
inbox: mpsc::Receiver<UpdateMsg<D>>,
index_handle: IndexActorHandle,
impl UpdateActor {
fn new(store: Arc<UpdateStore>, inbox: mpsc::Receiver<UpdateMsg>, index_handle: IndexActorHandle) -> Self {
Self { store, inbox, index_handle }
impl<D> UpdateActor<D>
where D: AsRef<[u8]> + Sized + 'static,
fn new(
store: Arc<UpdateStore>,
inbox: mpsc::Receiver<UpdateMsg<D>>,
index_handle: IndexActorHandle,
path: impl AsRef<Path>,
) -> Self {
let path = path.as_ref().to_owned().join("update_files");
Self { store, inbox, index_handle, path }
async fn run(mut self) {
@ -45,29 +57,43 @@ impl UpdateActor {
loop {
match self.inbox.recv().await {
Some(UpdateMsg::Update { uuid, meta, payload, ret }) => self.handle_update(uuid, meta, payload, ret).await,
Some(UpdateMsg::Update { uuid, meta, data, ret }) => self.handle_update(uuid, meta, data, ret).await,
Some(_) => {}
None => {}
async fn handle_update(&self, uuid: Uuid, meta: UpdateMeta, payload: Option<File>, ret: oneshot::Sender<Result<UpdateStatus>>) {
let mut buf = Vec::new();
let mut payload = payload.unwrap();
payload.read_to_end(&mut buf).await.unwrap();
let result = self.store.register_update(meta, &buf, uuid).unwrap();
async fn handle_update(&self, uuid: Uuid, meta: UpdateMeta, mut payload: mpsc::Receiver<D>, ret: oneshot::Sender<Result<UpdateStatus>>) {
let store = self.store.clone();
let update_file_id = uuid::Uuid::new_v4();
let path = self.path.join(format!("update_{}", update_file_id));
let mut file = File::create(&path).await.unwrap();
while let Some(bytes) = payload.recv().await {
let file = file.into_std().await;
let result = tokio::task::spawn_blocking(move || -> anyhow::Result<Pending<UpdateMeta>> {
Ok(store.register_update(meta, path, uuid)?)
let _ = ret.send(Ok(UpdateStatus::Pending(result)));
pub struct UpdateActorHandle {
sender: mpsc::Sender<UpdateMsg>,
pub struct UpdateActorHandle<D> {
sender: mpsc::Sender<UpdateMsg<D>>,
impl UpdateActorHandle {
pub fn new(index_handle: IndexActorHandle) -> Self {
impl<D> UpdateActorHandle<D>
where D: AsRef<[u8]> + Sized + 'static,
pub fn new(index_handle: IndexActorHandle, path: impl AsRef<Path>) -> Self {
let (sender, receiver) = mpsc::channel(100);
let mut options = heed::EnvOpenOptions::new();
options.map_size(4096 * 100_000);
@ -79,16 +105,16 @@ impl UpdateActorHandle {
let store = UpdateStore::open(options, &path, move |meta, file| {
futures::executor::block_on(index_handle_clone.update(meta, file))
let actor = UpdateActor::new(store, receiver, index_handle);
let actor = UpdateActor::new(store, receiver, index_handle, path);
Self { sender }
pub async fn update(&self, meta: UpdateMeta, payload: Option<File>, uuid: Uuid) -> Result<UpdateStatus> {
pub async fn update(&self, meta: UpdateMeta, data: mpsc::Receiver<D>, uuid: Uuid) -> Result<UpdateStatus> {
let (ret, receiver) = oneshot::channel();
let msg = UpdateMsg::Update {
@ -1,9 +1,9 @@
use std::path::Path;
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};
use std::io::{Cursor, SeekFrom, Seek, Write};
use std::fs::remove_file;
use crossbeam_channel::Sender;
use heed::types::{OwnedType, DecodeIgnore, SerdeJson, ByteSlice};
use heed::types::{OwnedType, DecodeIgnore, SerdeJson};
use heed::{EnvOpenOptions, Env, Database};
use serde::{Serialize, Deserialize};
use std::fs::File;
@ -17,7 +17,7 @@ type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
pub struct UpdateStore<M, N, E> {
env: Env,
pending_meta: Database<OwnedType<BEU64>, SerdeJson<Pending<M>>>,
pending: Database<OwnedType<BEU64>, ByteSlice>,
pending: Database<OwnedType<BEU64>, SerdeJson<PathBuf>>,
processed_meta: Database<OwnedType<BEU64>, SerdeJson<Processed<M, N>>>,
failed_meta: Database<OwnedType<BEU64>, SerdeJson<Failed<M, E>>>,
aborted_meta: Database<OwnedType<BEU64>, SerdeJson<Aborted<M>>>,
@ -140,7 +140,7 @@ where
pub fn register_update(
meta: M,
content: &[u8],
content: impl AsRef<Path>,
index_uuid: Uuid,
) -> heed::Result<Pending<M>> {
let mut wtxn = self.env.write_txn()?;
@ -154,7 +154,7 @@ where
let meta = Pending::new(meta, update_id, index_uuid);
self.pending_meta.put(&mut wtxn, &update_key, &meta)?;
self.pending.put(&mut wtxn, &update_key, content)?;
self.pending.put(&mut wtxn, &update_key, &content.as_ref().to_owned())?;
@ -178,7 +178,7 @@ where
// a reader while processing it, not a writer.
match first_meta {
Some((first_id, pending)) => {
let first_content = self.pending
let content_path = self.pending
.get(&rtxn, &first_id)?
.expect("associated update content");
@ -190,12 +190,7 @@ where
let mut cursor = Cursor::new(first_content);
let mut file = tempfile::tempfile()?;
let n = std::io::copy(&mut cursor, &mut file)?;
println!("copied count: {}", n);
let file = File::open(&content_path)?;
// Process the pending update using the provided user function.
let result = handler.handle_update(processing, file);
@ -209,6 +204,7 @@ where
self.pending_meta.delete(&mut wtxn, &first_id)?;
self.pending.delete(&mut wtxn, &first_id)?;
match result {
Ok(processed) => self.processed_meta.put(&mut wtxn, &first_id, &processed)?,
@ -12,7 +12,7 @@ use milli::Index;
use milli::update::{IndexDocumentsMethod, UpdateFormat, DocumentAdditionResult};
use serde::{Serialize, Deserialize, de::Deserializer};
use uuid::Uuid;
use tokio::fs::File;
use actix_web::web::Payload;
use crate::data::SearchResult;
use crate::data::SearchQuery;
@ -133,7 +133,7 @@ pub trait IndexController {
index: String,
method: IndexDocumentsMethod,
format: UpdateFormat,
data: File,
data: Payload,
primary_key: Option<String>,
) -> anyhow::Result<UpdateStatus>;
Reference in New Issue
Block a user