feat(lib): auto-batching

mpostma, 2022-01-19 11:21:19 +01:00 (committed by ad hoc)
parent 622c15e825
commit c9a236b0af
28 changed files with 1181 additions and 777 deletions
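
The change in a nutshell: task handling is split in two. TaskStore::new(meta_env) now only persists tasks, while the new Scheduler (built with Scheduler::new(task_store, performer, config), where the index resolver acts as the task performer) decides when tasks run, so consecutive tasks can be grouped into batches. Every component that used to hold a TaskStore, the dump actor, the snapshot service and the index controller itself, now shares the scheduler as Arc<RwLock<Scheduler>>. A minimal sketch of that sharing pattern, with toy types rather than the real ones from crate::tasks:

use std::sync::Arc;
use tokio::sync::RwLock;

struct Scheduler {
    pending: Vec<String>, // stand-in for the real task queue
}

impl Scheduler {
    fn schedule_job(&mut self, job: String) {
        self.pending.push(job);
    }

    fn list_tasks(&self) -> Vec<String> {
        self.pending.clone()
    }
}

#[tokio::main]
async fn main() {
    let scheduler = Arc::new(RwLock::new(Scheduler { pending: Vec::new() }));

    // Mutating calls such as schedule_job go through the write lock...
    scheduler.write().await.schedule_job("dump".into());

    // ...while queries like the task-listing endpoints only take the read
    // lock, so they can run concurrently with each other.
    let tasks = scheduler.read().await.list_tasks();
    println!("{:?}", tasks);
}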


@@ -10,7 +10,7 @@ use tokio::sync::{mpsc, oneshot, RwLock};
use super::error::{DumpActorError, Result};
use super::{DumpInfo, DumpJob, DumpMsg, DumpStatus};
use crate::tasks::TaskStore;
use crate::tasks::Scheduler;
use crate::update_file_store::UpdateFileStore;
pub const CONCURRENT_DUMP_MSG: usize = 10;
@@ -18,7 +18,7 @@ pub const CONCURRENT_DUMP_MSG: usize = 10;
pub struct DumpActor {
inbox: Option<mpsc::Receiver<DumpMsg>>,
update_file_store: UpdateFileStore,
task_store: TaskStore,
scheduler: Arc<RwLock<Scheduler>>,
dump_path: PathBuf,
analytics_path: PathBuf,
lock: Arc<Mutex<()>>,
@@ -36,7 +36,7 @@ impl DumpActor {
pub fn new(
inbox: mpsc::Receiver<DumpMsg>,
update_file_store: UpdateFileStore,
task_store: TaskStore,
scheduler: Arc<RwLock<Scheduler>>,
dump_path: impl AsRef<Path>,
analytics_path: impl AsRef<Path>,
index_db_size: usize,
@@ -46,7 +46,7 @@ impl DumpActor {
let lock = Arc::new(Mutex::new(()));
Self {
inbox: Some(inbox),
task_store,
scheduler,
update_file_store,
dump_path: dump_path.as_ref().into(),
analytics_path: analytics_path.as_ref().into(),
@@ -118,13 +118,13 @@ impl DumpActor {
dump_path: self.dump_path.clone(),
db_path: self.analytics_path.clone(),
update_file_store: self.update_file_store.clone(),
task_store: self.task_store.clone(),
scheduler: self.scheduler.clone(),
uid: uid.clone(),
update_db_size: self.update_db_size,
index_db_size: self.index_db_size,
};
let task_result = tokio::task::spawn(task.run()).await;
let task_result = tokio::task::spawn_local(task.run()).await;
let mut dump_infos = self.dump_infos.write().await;
let dump_infos = dump_infos
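
Note the switch from tokio::task::spawn to tokio::task::spawn_local, here and for the dump actor and snapshot service in the builder further down. Presumably the spawned futures stop being Send once they capture the scheduler handle, so they must stay on their spawning thread. spawn_local only works inside a tokio LocalSet, which actix-rt (the runtime Meilisearch runs on) provides; a standalone sketch of the mechanism:

use std::rc::Rc; // Rc is !Send, so this future could not go through tokio::spawn

#[tokio::main]
async fn main() {
    let local = tokio::task::LocalSet::new();
    local
        .run_until(async {
            let not_send = Rc::new(42);
            // spawn_local panics if called outside a LocalSet context.
            let handle = tokio::task::spawn_local(async move {
                println!("running on the current thread: {}", not_send);
            });
            handle.await.unwrap();
        })
        .await;
}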


@@ -1,5 +1,6 @@
use std::fs::File;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use anyhow::bail;
use chrono::{DateTime, Utc};
@@ -12,7 +13,7 @@ use meilisearch_auth::AuthController;
pub use message::DumpMsg;
use tempfile::TempDir;
use tokio::fs::create_dir_all;
use tokio::sync::oneshot;
use tokio::sync::{oneshot, RwLock};
use crate::analytics;
use crate::compression::{from_tar_gz, to_tar_gz};
@@ -20,7 +21,7 @@ use crate::index_controller::dump_actor::error::DumpActorError;
use crate::index_controller::dump_actor::loaders::{v2, v3, v4};
use crate::options::IndexerOpts;
use crate::tasks::task::Job;
use crate::tasks::TaskStore;
use crate::tasks::Scheduler;
use crate::update_file_store::UpdateFileStore;
use error::Result;
@@ -319,7 +320,7 @@ struct DumpJob {
dump_path: PathBuf,
db_path: PathBuf,
update_file_store: UpdateFileStore,
task_store: TaskStore,
scheduler: Arc<RwLock<Scheduler>>,
uid: String,
update_db_size: usize,
index_db_size: usize,
@@ -344,21 +345,28 @@ impl DumpJob {
let (sender, receiver) = oneshot::channel();
self.task_store
.register_job(Job::Dump {
self.scheduler
.write()
.await
.schedule_job(Job::Dump {
ret: sender,
path: temp_dump_path.clone(),
})
.await;
receiver.await??;
self.task_store
.dump(&temp_dump_path, self.update_file_store.clone())
.await?;
// wait until the job has started performing before finishing the dump process
let sender = receiver.await??;
AuthController::dump(&self.db_path, &temp_dump_path)?;
//TODO(marin): this is not right, the scheduler should dump itself, not do it here...
self.scheduler
.read()
.await
.dump(&temp_dump_path, self.update_file_store.clone())
.await?;
let dump_path = tokio::task::spawn_blocking(move || -> Result<PathBuf> {
let _ = &self;
// for now we simply copy the updates/updates_files
// FIXME: We may copy more files than necessary, if new files are added while we are
// performing the dump. We need a way to filter them out.
@@ -374,6 +382,9 @@ impl DumpJob {
})
.await??;
// notify the update loop that we are finished performing the dump.
let _ = sender.send(());
info!("Created dump in {:?}.", dump_path);
Ok(())
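
The new control flow is a two-step oneshot handshake: the dump job hands the scheduler a Job::Dump carrying the ret sender; receiver.await?? (a double ? because the payload is itself a Result) resolves once the scheduler has actually started the job, yielding a second sender; and sender.send(()) at the end of the dump releases the update loop so batching can resume. A self-contained sketch of the pattern, with the error plumbing stripped out:

use tokio::sync::oneshot;

#[tokio::main]
async fn main() {
    // `ret` travels to the scheduler inside the dump job.
    let (ret, receiver) = oneshot::channel::<oneshot::Sender<()>>();

    // Scheduler side: acknowledge the start, then wait for the dump to end.
    let scheduler = tokio::spawn(async move {
        let (done, finished) = oneshot::channel::<()>();
        let _ = ret.send(done); // "the job has started performing"
        let _ = finished.await; // hold off further batches until the dump ends
        println!("update loop resumed");
    });

    // Dump side: wait for the start acknowledgement, do the work, notify.
    let sender = receiver.await.unwrap();
    // ... create the dump here ...
    let _ = sender.send(());

    scheduler.await.unwrap();
}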
@@ -382,19 +393,15 @@ impl DumpJob {
#[cfg(test)]
mod test {
use std::collections::HashSet;
use futures::future::{err, ok};
use nelson::Mocker;
use once_cell::sync::Lazy;
use uuid::Uuid;
use super::*;
use crate::index::error::Result as IndexResult;
use crate::index::Index;
use crate::index_resolver::error::IndexResolverError;
use crate::index_resolver::index_store::MockIndexStore;
use crate::index_resolver::meta_store::MockIndexMetaStore;
use crate::options::SchedulerConfig;
use crate::tasks::error::Result as TaskResult;
use crate::tasks::task::{Task, TaskId};
use crate::tasks::{MockTaskPerformer, TaskFilter, TaskStore};
use crate::update_file_store::UpdateFileStore;
fn setup() {
@@ -411,86 +418,91 @@ mod test {
}
#[actix_rt::test]
#[ignore]
async fn test_dump_normal() {
setup();
let tmp = tempfile::tempdir().unwrap();
let uuids = std::iter::repeat_with(Uuid::new_v4)
.take(4)
.collect::<HashSet<_>>();
let mut uuid_store = MockIndexMetaStore::new();
uuid_store
.expect_dump()
.once()
.returning(move |_| Box::pin(ok(())));
let mut index_store = MockIndexStore::new();
index_store.expect_get().times(4).returning(move |uuid| {
let mocker = Mocker::default();
let uuids_clone = uuids.clone();
mocker.when::<(), Uuid>("uuid").once().then(move |_| {
assert!(uuids_clone.contains(&uuid));
uuid
});
mocker
.when::<&Path, IndexResult<()>>("dump")
.once()
.then(move |_| Ok(()));
Box::pin(ok(Some(Index::mock(mocker))))
});
let mocker = Mocker::default();
let update_file_store = UpdateFileStore::mock(mocker);
//let update_sender =
// create_update_handler(index_resolver.clone(), tmp.path(), 4096 * 100).unwrap();
//TODO: fix dump tests
let mut performer = MockTaskPerformer::new();
performer
.expect_process_job()
.once()
.returning(|j| match j {
Job::Dump { ret, .. } => {
let (sender, _receiver) = oneshot::channel();
ret.send(Ok(sender)).unwrap();
}
_ => unreachable!(),
});
let performer = Arc::new(performer);
let mocker = Mocker::default();
let task_store = TaskStore::mock(mocker);
mocker
.when::<(&Path, UpdateFileStore), TaskResult<()>>("dump")
.then(|_| Ok(()));
mocker
.when::<(Option<TaskId>, Option<TaskFilter>, Option<usize>), TaskResult<Vec<Task>>>(
"list_tasks",
)
.then(|_| Ok(Vec::new()));
let store = TaskStore::mock(mocker);
let config = SchedulerConfig::default();
let scheduler = Scheduler::new(store, performer, config).unwrap();
let task = DumpJob {
dump_path: tmp.path().into(),
// this should do nothing
update_file_store,
db_path: tmp.path().into(),
task_store,
uid: String::from("test"),
update_db_size: 4096 * 10,
index_db_size: 4096 * 10,
scheduler,
};
task.run().await.unwrap();
}
#[actix_rt::test]
#[ignore]
async fn error_performing_dump() {
let tmp = tempfile::tempdir().unwrap();
let mut uuid_store = MockIndexMetaStore::new();
uuid_store
.expect_dump()
.once()
.returning(move |_| Box::pin(err(IndexResolverError::ExistingPrimaryKey)));
let mocker = Mocker::default();
let file_store = UpdateFileStore::mock(mocker);
let mocker = Mocker::default();
mocker
.when::<(Option<TaskId>, Option<TaskFilter>, Option<usize>), TaskResult<Vec<Task>>>(
"list_tasks",
)
.then(|_| Ok(Vec::new()));
let task_store = TaskStore::mock(mocker);
let mut performer = MockTaskPerformer::new();
performer
.expect_process_job()
.once()
.returning(|job| match job {
Job::Dump { ret, .. } => drop(ret.send(Err(IndexResolverError::BadlyFormatted(
"blabla".to_string(),
)))),
_ => unreachable!(),
});
let performer = Arc::new(performer);
let scheduler = Scheduler::new(task_store, performer, SchedulerConfig::default()).unwrap();
let task = DumpJob {
dump_path: tmp.path().into(),
// this should do nothing
db_path: tmp.path().into(),
update_file_store: file_store,
task_store,
uid: String::from("test"),
update_db_size: 4096 * 10,
index_db_size: 4096 * 10,
scheduler,
};
assert!(task.run().await.is_err());


@@ -13,7 +13,7 @@ use futures::Stream;
use futures::StreamExt;
use milli::update::IndexDocumentsMethod;
use serde::{Deserialize, Serialize};
use tokio::sync::mpsc;
use tokio::sync::{mpsc, RwLock};
use tokio::task::spawn_blocking;
use tokio::time::sleep;
use uuid::Uuid;
@@ -23,12 +23,11 @@ use crate::index::{
Checked, Document, IndexMeta, IndexStats, SearchQuery, SearchResult, Settings, Unchecked,
};
use crate::index_controller::dump_actor::{load_dump, DumpActor, DumpActorHandleImpl};
use crate::options::IndexerOpts;
use crate::options::{IndexerOpts, SchedulerConfig};
use crate::snapshot::{load_snapshot, SnapshotService};
use crate::tasks::create_task_store;
use crate::tasks::error::TaskError;
use crate::tasks::task::{DocumentDeletion, Task, TaskContent, TaskId};
use crate::tasks::{TaskFilter, TaskStore};
use crate::tasks::{Scheduler, TaskFilter, TaskStore};
use error::Result;
use self::dump_actor::{DumpActorHandle, DumpInfo};
@@ -68,6 +67,7 @@ pub struct IndexSettings {
pub struct IndexController<U, I> {
index_resolver: Arc<IndexResolver<U, I>>,
scheduler: Arc<RwLock<Scheduler>>,
task_store: TaskStore,
dump_handle: dump_actor::DumpActorHandleImpl,
update_file_store: UpdateFileStore,
@@ -78,9 +78,10 @@ impl<U, I> Clone for IndexController<U, I> {
fn clone(&self) -> Self {
Self {
index_resolver: self.index_resolver.clone(),
task_store: self.task_store.clone(),
scheduler: self.scheduler.clone(),
dump_handle: self.dump_handle.clone(),
update_file_store: self.update_file_store.clone(),
task_store: self.task_store.clone(),
}
}
}
@@ -160,6 +161,7 @@ impl IndexControllerBuilder {
self,
db_path: impl AsRef<Path>,
indexer_options: IndexerOpts,
scheduler_config: SchedulerConfig,
) -> anyhow::Result<MeiliSearch> {
let index_size = self
.max_index_size
@@ -217,8 +219,9 @@ impl IndexControllerBuilder {
update_file_store.clone(),
)?);
let task_store =
create_task_store(meta_env, index_resolver.clone()).map_err(|e| anyhow::anyhow!(e))?;
let task_store = TaskStore::new(meta_env)?;
let scheduler =
Scheduler::new(task_store.clone(), index_resolver.clone(), scheduler_config)?;
let dump_path = self
.dump_dst
@@ -229,14 +232,14 @@ impl IndexControllerBuilder {
let actor = DumpActor::new(
receiver,
update_file_store.clone(),
task_store.clone(),
scheduler.clone(),
dump_path,
analytics_path,
index_size,
task_store_size,
);
tokio::task::spawn(actor.run());
tokio::task::spawn_local(actor.run());
DumpActorHandleImpl { sender }
};
@@ -255,17 +258,18 @@ impl IndexControllerBuilder {
snapshot_path,
index_size,
meta_env_size: task_store_size,
task_store: task_store.clone(),
scheduler: scheduler.clone(),
};
tokio::task::spawn(snapshot_service.run());
tokio::task::spawn_local(snapshot_service.run());
}
Ok(IndexController {
index_resolver,
task_store,
scheduler,
dump_handle,
update_file_store,
task_store,
})
}
@@ -415,12 +419,13 @@ where
};
let task = self.task_store.register(uid, content).await?;
self.scheduler.read().await.notify();
Ok(task)
}
pub async fn get_task(&self, id: TaskId, filter: Option<TaskFilter>) -> Result<Task> {
let task = self.task_store.get_task(id, filter).await?;
let task = self.scheduler.read().await.get_task(id, filter).await?;
Ok(task)
}
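
After registering a task, the controller merely nudges the scheduler. Since notify() is reached through a read lock, it is presumably a cheap, internally synchronized wakeup signal rather than a mutation of scheduler state. One plausible shape for it, using tokio::sync::Notify (the scheduler's internals are not part of this diff):

use std::sync::Arc;
use tokio::sync::Notify;

struct Scheduler {
    wakeup: Arc<Notify>,
}

impl Scheduler {
    fn notify(&self) {
        // &self is enough here: Notify is internally synchronized.
        self.wakeup.notify_one();
    }
}

#[tokio::main]
async fn main() {
    let scheduler = Scheduler { wakeup: Arc::new(Notify::new()) };
    let wakeup = scheduler.wakeup.clone();

    // Batching loop: sleep until a task is registered, then build a batch.
    let batching_loop = tokio::spawn(async move {
        wakeup.notified().await;
        println!("woken up: collect pending tasks into a batch");
    });

    scheduler.notify(); // called right after TaskStore::register above
    batching_loop.await.unwrap();
}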
@@ -435,7 +440,12 @@ where
let mut filter = TaskFilter::default();
filter.filter_index(index_uid);
let task = self.task_store.get_task(task_id, Some(filter)).await?;
let task = self
.scheduler
.read()
.await
.get_task(task_id, Some(filter))
.await?;
Ok(task)
}
@@ -446,7 +456,12 @@ where
limit: Option<usize>,
offset: Option<TaskId>,
) -> Result<Vec<Task>> {
let tasks = self.task_store.list_tasks(offset, filter, limit).await?;
let tasks = self
.scheduler
.read()
.await
.list_tasks(offset, filter, limit)
.await?;
Ok(tasks)
}
@@ -466,7 +481,9 @@ where
filter.filter_index(index_uid);
let tasks = self
.task_store
.scheduler
.read()
.await
.list_tasks(
Some(offset.unwrap_or_default() + task_id),
Some(filter),
@@ -547,10 +564,11 @@ where
}
pub async fn get_index_stats(&self, uid: String) -> Result<IndexStats> {
let last_task = self.task_store.get_processing_task().await?;
let processing_tasks = self.scheduler.read().await.get_processing_tasks().await?;
// Check if the currently indexing update is from our index.
let is_indexing = last_task
.map(|task| task.index_uid.into_inner() == uid)
let is_indexing = processing_tasks
.first()
.map(|task| task.index_uid.as_str() == uid)
.unwrap_or_default();
let index = self.index_resolver.get_index(uid).await?;
@@ -564,7 +582,7 @@ where
let mut last_task: Option<DateTime<_>> = None;
let mut indexes = BTreeMap::new();
let mut database_size = 0;
let processing_task = self.task_store.get_processing_task().await?;
let processing_tasks = self.scheduler.read().await.get_processing_tasks().await?;
for (index_uid, index) in self.index_resolver.list().await? {
if !search_rules.is_index_authorized(&index_uid) {
@@ -584,8 +602,8 @@ where
});
// Check if the currently indexing update is from our index.
stats.is_indexing = processing_task
.as_ref()
stats.is_indexing = processing_tasks
.first()
.map(|p| p.index_uid.as_str() == index_uid)
.or(Some(false));
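
Because a batch can contain several tasks, get_processing_task becomes get_processing_tasks and both stats paths now inspect the batch. Checking only .first() suggests the assumption that every task in one batch targets the same index. A toy version of the check:

struct Task {
    index_uid: String,
}

// Assumes all tasks in the processing batch target the same index,
// which is what checking only the first element implies.
fn is_indexing(processing_tasks: &[Task], uid: &str) -> bool {
    processing_tasks
        .first()
        .map(|task| task.index_uid == uid)
        .unwrap_or(false)
}

fn main() {
    let batch = vec![Task { index_uid: "movies".into() }];
    assert!(is_indexing(&batch, "movies"));
    assert!(!is_indexing(&[], "movies"));
}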
@@ -637,16 +655,18 @@ mod test {
impl IndexController<MockIndexMetaStore, MockIndexStore> {
pub fn mock(
index_resolver: IndexResolver<MockIndexMetaStore, MockIndexStore>,
index_resolver: Arc<IndexResolver<MockIndexMetaStore, MockIndexStore>>,
task_store: TaskStore,
update_file_store: UpdateFileStore,
dump_handle: DumpActorHandleImpl,
scheduler: Arc<RwLock<Scheduler>>,
) -> Self {
IndexController {
index_resolver: Arc::new(index_resolver),
index_resolver,
task_store,
dump_handle,
update_file_store,
scheduler,
}
}
}
@@ -719,13 +739,27 @@ mod test {
let task_store_mocker = nelson::Mocker::default();
let mocker = Mocker::default();
let update_file_store = UpdateFileStore::mock(mocker);
let index_resolver = IndexResolver::new(uuid_store, index_store, update_file_store.clone());
let index_resolver = Arc::new(IndexResolver::new(
uuid_store,
index_store,
update_file_store.clone(),
));
let task_store = TaskStore::mock(task_store_mocker);
// let dump_actor = MockDumpActorHandle::new();
let scheduler = Scheduler::new(
task_store.clone(),
index_resolver.clone(),
SchedulerConfig::default(),
)
.unwrap();
let (sender, _) = mpsc::channel(1);
let dump_handle = DumpActorHandleImpl { sender };
let index_controller =
IndexController::mock(index_resolver, task_store, update_file_store, dump_handle);
let index_controller = IndexController::mock(
index_resolver,
task_store,
update_file_store,
dump_handle,
scheduler,
);
let r = index_controller
.search(index_uid.to_owned(), query.clone())