summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulius Goryavsky <julius.goryavsky@mariadb.com>2021-06-09 03:41:37 +0200
committerJulius Goryavsky <julius.goryavsky@mariadb.com>2021-06-15 14:27:22 +0200
commit18d5be5b54b1a05e6107a1c5828d9eed9cf18636 (patch)
treea8332d06a06906e09c1e59fbd9b2c6998d8edc11
parent1c35a3f6fd92704d7a135a81c7752f5058aaede5 (diff)
downloadmariadb-git-18d5be5b54b1a05e6107a1c5828d9eed9cf18636.tar.gz
MDEV-25880: rsync may be mistakenly killed when overlapping SST
This commit fixes a bug that was originally discovered during the galera_nbo_sst_slave mtr test for the 10.6 branch. However, it is relevant for all versions and can lead to intermittent SST crashes via rsync on very fast server restarts - when a new SST process (for example, after starting a new server instance) overlaps the old SST process started by the previous, already terminated server. This overlap can result in the new rsync being killed instead of the old rsync, or the pid file from the new rsync being removed, which then leads to problems.
-rw-r--r--scripts/wsrep_sst_common.sh2
-rw-r--r--scripts/wsrep_sst_rsync.sh28
2 files changed, 25 insertions, 5 deletions
diff --git a/scripts/wsrep_sst_common.sh b/scripts/wsrep_sst_common.sh
index 05944ef6035..c2f31b2818d 100644
--- a/scripts/wsrep_sst_common.sh
+++ b/scripts/wsrep_sst_common.sh
@@ -1190,7 +1190,6 @@ trim_string()
check_pid()
{
local pid_file="$1"
- local remove=${2:-0}
if [ -r "$pid_file" ]; then
local pid=$(cat "$pid_file" 2>/dev/null)
if [ -n "$pid" ]; then
@@ -1201,6 +1200,7 @@ check_pid()
fi
fi
fi
+ local remove=${2:-0}
if [ $remove -eq 1 ]; then
rm -f "$pid_file"
fi
diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh
index 19a4d19fded..a602af79af0 100644
--- a/scripts/wsrep_sst_rsync.sh
+++ b/scripts/wsrep_sst_rsync.sh
@@ -68,6 +68,8 @@ cleanup_joiner()
if [ "$WSREP_SST_OPT_ROLE" = 'joiner' ]; then
wsrep_cleanup_progress_file
fi
+
+ [ -f "$SST_PID" ] && rm -f "$SST_PID"
}
check_pid_and_port()
@@ -281,6 +283,7 @@ then
*)
wsrep_log_error "Unrecognized ssl-mode option: '$SSLMODE'"
exit 22 # EINVAL
+ ;;
esac
if [ -z "$CAFILE_OPT" ]; then
wsrep_log_error "Can't have ssl-mode='$SSLMODE' without CA file"
@@ -499,6 +502,21 @@ elif [ "$WSREP_SST_OPT_ROLE" = 'joiner' ]
then
check_sockets_utils
+ SST_PID="$WSREP_SST_OPT_DATA/wsrep_rsync_sst.pid"
+
+ # give some time for the previous SST script to complete
+ check_round=0
+ while check_pid "$SST_PID" 0
+ do
+ wsrep_log_info "previous SST not completed, waiting for it to exit"
+ check_round=$(( check_round + 1 ))
+ if [ $check_round -eq 10 ]; then
+ wsrep_log_error "SST script already running."
+ exit 114 # EALREADY
+ fi
+ sleep 1
+ done
+
# give some time for lingering stunnel from previous SST to complete
check_round=0
while check_pid "$STUNNEL_PID" 1
@@ -583,12 +601,14 @@ EOF
RSYNC_ADDR="*"
fi
+ echo $$ > "$SST_PID"
+
if [ -z "$STUNNEL" ]
then
rsync --daemon --no-detach --port "$RSYNC_PORT" --config "$RSYNC_CONF" $RSYNC_EXTRA_ARGS &
RSYNC_REAL_PID=$!
- TRANSFER_REAL_PID="$RSYNC_REAL_PID"
- TRANSFER_PID=$RSYNC_PID
+ TRANSFER_REAL_PID=$RSYNC_REAL_PID
+ TRANSFER_PID="$RSYNC_PID"
else
# Let's check if the path to the config file contains a space?
if [ "${RSYNC_CONF#* }" = "$RSYNC_CONF" ]; then
@@ -631,8 +651,8 @@ EOF
fi
stunnel "$STUNNEL_CONF" &
STUNNEL_REAL_PID=$!
- TRANSFER_REAL_PID="$STUNNEL_REAL_PID"
- TRANSFER_PID=$STUNNEL_PID
+ TRANSFER_REAL_PID=$STUNNEL_REAL_PID
+ TRANSFER_PID="$STUNNEL_PID"
fi
if [ "${SSLMODE#VERIFY}" != "$SSLMODE" ]