From fa6d710b85bb5d689661ce4cd69e09d9464f397a Mon Sep 17 00:00:00 2001 From: sjaakola Date: Wed, 18 Nov 2020 17:47:39 +0200 Subject: MDEV-24097 node restart overlaps with earlier still ongoing SST process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In galera_3nodes.galera_safe_to_bootstrap node restart can happen too soon, when earlier SST joiner process is still active in the node. Similar issue may hurt other mtr tests as well. This is the second variant of the fix for this issue. Here we only change rsync SST script to wait a little bit if lingering SST rsync is observed to be in execution. We assume that the previous mysqld and SST processes have been already signaled to abort during earlier startup attempt. If other SST methods (than rsync) suffer from similar overlapping SST execution, they should be sorted out separately within each SST method's handler script. Reviewed-by: Jan Lindström --- scripts/wsrep_sst_rsync.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/wsrep_sst_rsync.sh b/scripts/wsrep_sst_rsync.sh index f50f94d6560..7e3f7b73301 100644 --- a/scripts/wsrep_sst_rsync.sh +++ b/scripts/wsrep_sst_rsync.sh @@ -398,6 +398,14 @@ then MODULE="rsync_sst" RSYNC_PID="$WSREP_SST_OPT_DATA/$MODULE.pid" + # give some time for lingering rsync from previous SST to complete + check_round=0 + while check_pid $RSYNC_PID && [ $check_round -lt 10 ] + do + wsrep_log_info "lingering rsync daemon found at startup, waiting for it to exit" + check_round=$(( check_round + 1 )) + sleep 1 + done if check_pid $RSYNC_PID then -- cgit v1.2.1