From 18d5be5b54b1a05e6107a1c5828d9eed9cf18636 Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Wed, 9 Jun 2021 03:41:37 +0200 Subject: MDEV-25880: rsync may be mistakenly killed when overlapping SST This commit fixes a bug was originally discovered during the galera_nbo_sst_slave mtr test for 10.6 branch. However it is relevant for all versions and can lead to intermittent SST crashes via rsync on very fast server restarts - when a new SST process (for example, after starting a new server instance) overlaps the old SST process started by the previous, already terminated server. This overlap can result in the new rsync being killed instead of the old rsync, or the pid file from the new rsync being killed, which then lead to problems. --- scripts/wsrep_sst_common.sh | 2 +- scripts/wsrep_sst_rsync.sh | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/scripts/wsrep_sst_common.sh b/scripts/wsrep_sst_common.sh index 05944ef6035..c2f31b2818d 100644 --- a/scripts/wsrep_sst_common.sh +++ b/scripts/wsrep_sst_common.sh @@ -1190,7 +1190,6 @@ trim_string() check_pid() { local pid_file="$1" - local remove=${2:-0} if [ -r "$pid_file" ]; then local pid=$(cat "$pid_file" 2>/dev/null) if [ -n "$pid" ]; then @@ -1201,6 +1200,7 @@ check_pid() fi fi fi + local remove=${2:-0} if [ $remove -eq 1 ]; then rm -f "$pid_file" fi diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index 19a4d19fded..a602af79af0 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -68,6 +68,8 @@ cleanup_joiner() if [ "$WSREP_SST_OPT_ROLE" = 'joiner' ]; then wsrep_cleanup_progress_file fi + + [ -f "$SST_PID" ] && rm -f "$SST_PID" } check_pid_and_port() @@ -281,6 +283,7 @@ then *) wsrep_log_error "Unrecognized ssl-mode option: '$SSLMODE'" exit 22 # EINVAL + ;; esac if [ -z "$CAFILE_OPT" ]; then wsrep_log_error "Can't have ssl-mode='$SSLMODE' without CA file" @@ -499,6 +502,21 @@ elif [ "$WSREP_SST_OPT_ROLE" = 'joiner' ] then check_sockets_utils + SST_PID="$WSREP_SST_OPT_DATA/wsrep_rsync_sst.pid" + + # give some time for lingering stunnel from previous SST to complete + check_round=0 + while check_pid "$SST_PID" 0 + do + wsrep_log_info "previous SST not completed, waiting for it to exit" + check_round=$(( check_round + 1 )) + if [ $check_round -eq 10 ]; then + wsrep_log_error "SST script already running." + exit 114 # EALREADY + fi + sleep 1 + done + # give some time for lingering stunnel from previous SST to complete check_round=0 while check_pid "$STUNNEL_PID" 1 @@ -583,12 +601,14 @@ EOF RSYNC_ADDR="*" fi + echo $$ > "$SST_PID" + if [ -z "$STUNNEL" ] then rsync --daemon --no-detach --port "$RSYNC_PORT" --config "$RSYNC_CONF" $RSYNC_EXTRA_ARGS & RSYNC_REAL_PID=$! - TRANSFER_REAL_PID="$RSYNC_REAL_PID" - TRANSFER_PID=$RSYNC_PID + TRANSFER_REAL_PID=$RSYNC_REAL_PID + TRANSFER_PID="$RSYNC_PID" else # Let's check if the path to the config file contains a space? if [ "${RSYNC_CONF#* }" = "$RSYNC_CONF" ]; then @@ -631,8 +651,8 @@ EOF fi stunnel "$STUNNEL_CONF" & STUNNEL_REAL_PID=$! - TRANSFER_REAL_PID="$STUNNEL_REAL_PID" - TRANSFER_PID=$STUNNEL_PID + TRANSFER_REAL_PID=$STUNNEL_REAL_PID + TRANSFER_PID="$STUNNEL_PID" fi if [ "${SSLMODE#VERIFY}" != "$SSLMODE" ] -- cgit v1.2.1