write a test ensuring the index-scheduler is effectively down when the upgrade task fail and try to process it when it restarts. There is a bug when deleting this task

This commit is contained in:
Tamo 2025-01-21 15:00:48 +01:00 committed by Louis Dureuil
parent 20ac59c946
commit 5458850d21
No known key found for this signature in database
11 changed files with 550 additions and 17 deletions

View file

@ -1,10 +1,18 @@
use std::io::{BufWriter, Write};
use std::sync::Arc;
use std::time::Duration;
use big_s::S;
use crossbeam_channel::RecvTimeoutError;
use file_store::File;
use meilisearch_types::document_formats::DocumentFormatError;
use meilisearch_types::milli::update::IndexDocumentsMethod::ReplaceDocuments;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::tasks::KindWithContent;
use meilisearch_types::VERSION_FILE_NAME;
use tempfile::{NamedTempFile, TempDir};
use uuid::Uuid;
use Breakpoint::*;
use crate::insta_snapshot::snapshot_index_scheduler;
use crate::{Error, IndexScheduler, IndexSchedulerOptions};
@ -28,20 +36,13 @@ pub(crate) enum FailureLocation {
InsideCreateBatch,
InsideProcessBatch,
PanicInsideProcessBatch,
ProcessUpgrade,
AcquiringWtxn,
UpdatingTaskAfterProcessBatchSuccess { task_uid: u32 },
UpdatingTaskAfterProcessBatchFailure,
CommittingWtxn,
}
use big_s::S;
use crossbeam_channel::RecvTimeoutError;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::tasks::KindWithContent;
use meilisearch_types::VERSION_FILE_NAME;
use tempfile::{NamedTempFile, TempDir};
use Breakpoint::*;
impl IndexScheduler {
/// Blocks the thread until the test handle asks to progress to/through this breakpoint.
///
@ -55,7 +56,6 @@ impl IndexScheduler {
/// As soon as we find it, the index scheduler is unblocked but then wait again on the call to
/// `test_breakpoint_sdr.send(b, true)`. This message will only be able to send once the
/// test asks to progress to the next `(b2, false)`.
#[cfg(test)]
pub(crate) fn breakpoint(&self, b: Breakpoint) {
// We send two messages. The first one will sync with the call
// to `handle.wait_until(b)`. The second one will block until the
@ -225,6 +225,46 @@ pub struct IndexSchedulerHandle {
}
impl IndexSchedulerHandle {
/// Restarts the index-scheduler on the same database.
/// To use this function you must give back the index-scheduler that was given to you when
/// creating the handle the first time.
/// If the index-scheduler has been cloned in the meantime you must drop all copy otherwise
/// the function will panic.
pub(crate) fn restart(
self,
index_scheduler: IndexScheduler,
autobatching_enabled: bool,
planned_failures: Vec<(usize, FailureLocation)>,
) -> (IndexScheduler, Self) {
drop(index_scheduler);
let Self { _tempdir: tempdir, index_scheduler, test_breakpoint_rcv, last_breakpoint: _ } =
self;
drop(index_scheduler);
// We must ensure that the `run` function has stopped running before restarting the index scheduler
loop {
match test_breakpoint_rcv.recv_timeout(Duration::from_secs(5)) {
Ok(_) => continue,
Err(RecvTimeoutError::Timeout) => panic!("The indexing loop is stuck somewhere"),
Err(RecvTimeoutError::Disconnected) => break,
}
}
let (scheduler, mut handle) =
IndexScheduler::test_with_custom_config(planned_failures, |config| {
config.autobatching_enabled = autobatching_enabled;
config.version_file_path = tempdir.path().join(VERSION_FILE_NAME);
config.auth_path = tempdir.path().join("auth");
config.tasks_path = tempdir.path().join("db_path");
config.update_file_path = tempdir.path().join("file_store");
config.indexes_path = tempdir.path().join("indexes");
config.snapshots_path = tempdir.path().join("snapshots");
config.dumps_path = tempdir.path().join("dumps");
});
handle._tempdir = tempdir;
(scheduler, handle)
}
/// Advance the scheduler to the next tick.
/// Panic
/// * If the scheduler is waiting for a task to be registered.
@ -350,4 +390,18 @@ impl IndexSchedulerHandle {
}
self.advance_till([AfterProcessing]);
}
// Wait for one failed batch.
#[track_caller]
pub(crate) fn scheduler_is_down(&mut self) {
loop {
match self
.test_breakpoint_rcv
.recv_timeout(std::time::Duration::from_secs(1)) {
Ok((_, true)) => continue,
Ok((b, false)) => panic!("The scheduler was supposed to be down but successfully moved to the next breakpoint: {b:?}"),
Err(RecvTimeoutError::Timeout | RecvTimeoutError::Disconnected) => break,
}
}
}
}