From 2f1afc1296621624f34aa6a9db03ef29c741ba6c Mon Sep 17 00:00:00 2001 From: NIIBE Yutaka Date: Sat, 19 Mar 2022 14:47:59 +0900 Subject: [PATCH] common: Fix another race condition, and address the other one. * common/dotlock.c (dotlock_take_unix): Do same when same PID process detects stale lockfile. Add comment. -- GnuPG-bug-id: 5884 Signed-off-by: NIIBE Yutaka --- common/dotlock.c | 46 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/common/dotlock.c b/common/dotlock.c index 3d948f718..80d80deec 100644 --- a/common/dotlock.c +++ b/common/dotlock.c @@ -1118,30 +1118,52 @@ dotlock_take_unix (dotlock_t h, long timeout) my_info_0 ("lockfile disappeared\n"); goto again; } - else if ( pid == getpid() && same_node ) - { - my_info_0 ("Oops: lock already held by us\n"); - h->locked = 1; - - close (fd); - return 0; /* okay */ - } - else if ( same_node && kill (pid, 0) && errno == ESRCH ) + else if ( (pid == getpid() && same_node) + || (same_node && kill (pid, 0) && errno == ESRCH) ) + /* Stale lockfile is detected. */ { struct stat sb; /* Check if it's unlocked during examining the lockfile. */ if (fstat (fd, &sb) || sb.st_nlink == 0) { - /* It's gone already. */ + /* It's gone already by another process. */ close (fd); goto again; } - close (fd); + /* + * Here, although it's quite _rare_, we have a race condition. + * + * When multiple processes race on a stale lockfile, detecting + * AND removing should be done atomically. That is, to work + * correctly, the file to be removed should be the one which is + * examined for detection. + * + * But, when it's not atomic, consider the case for us where it + * takes some time between the detection and the removal of the + * lockfile. + * + * In this situation, it is possible that the file which was + * detected as stale is already removed by another process and + * then new lockfile is created (by that process or other one). + * + * And it is newly created valid lockfile which is going to be + * removed by us. + * + * Consider this long comment as it expresses possible (long) + * time between fstat above and unlink below; Meanwhile, the + * lockfile in question may be removed and there may be new + * valid one. + * + * In short, when you see the message of removing stale lockfile + * when there are multiple processes for the work, there is + * (very) little possibility something went wrong. + */ - my_info_1 (_("removing stale lockfile (created by %d)\n"), pid); unlink (h->lockname); + my_info_1 (_("removing stale lockfile (created by %d)\n"), pid); + close (fd); goto again; }