From c8b39f7ee2a9910fae2f03563b3ea337c45132c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 21 Oct 2021 13:48:59 +0300 Subject: MDEV-25114: Crash: WSREP: invalid state ROLLED_BACK (FATAL) Revert "MDEV-23328 Server hang due to Galera lock conflict resolution" This reverts commit 29bbcac0ee841faaa68eeb09c86ff825eabbe6b6. --- sql/wsrep_mysqld.cc | 2 + storage/innobase/handler/ha_innodb.cc | 184 +++++++++++++--------------------- 2 files changed, 74 insertions(+), 112 deletions(-) diff --git a/sql/wsrep_mysqld.cc b/sql/wsrep_mysqld.cc index f22d8bf0f5a..e60100e2e90 100644 --- a/sql/wsrep_mysqld.cc +++ b/sql/wsrep_mysqld.cc @@ -2762,7 +2762,9 @@ extern "C" void wsrep_thd_awake(THD *thd, my_bool signal) { if (signal) { + mysql_mutex_lock(&thd->LOCK_thd_data); thd->awake(KILL_QUERY); + mysql_mutex_unlock(&thd->LOCK_thd_data); } else { diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 37748cb497a..57e7ec236c0 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -60,7 +60,6 @@ this program; if not, write to the Free Software Foundation, Inc., #include #include -#include /* Include necessary InnoDB headers */ #include "btr0btr.h" @@ -19513,68 +19512,61 @@ wsrep_abort_slave_trx( (long long)bf_seqno, (long long)victim_seqno); abort(); } - -struct bg_wsrep_kill_trx_arg { - my_thread_id thd_id; - trx_id_t trx_id; - int64_t bf_seqno; - ibool signal; -}; - -static void bg_wsrep_kill_trx( - void *void_arg) +/*******************************************************************//** +This function is used to kill one transaction in BF. */ +UNIV_INTERN +void +wsrep_innobase_kill_one_trx( +/*========================*/ + MYSQL_THD const bf_thd, + const trx_t * const bf_trx, + trx_t *victim_trx, + ibool signal) { - bg_wsrep_kill_trx_arg *arg = (bg_wsrep_kill_trx_arg*)void_arg; - THD *thd = find_thread_by_id(arg->thd_id, false); - trx_t *victim_trx = NULL; - bool awake = false; - DBUG_ENTER("bg_wsrep_kill_trx"); + ut_ad(bf_thd); + ut_ad(victim_trx); + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(victim_trx)); - if (thd) { - victim_trx= thd_to_trx(thd); - /* Victim trx might not exist e.g. on MDL-conflict. */ - if (victim_trx) { - lock_mutex_enter(); - trx_mutex_enter(victim_trx); - if (victim_trx->id != arg->trx_id || - victim_trx->state == TRX_STATE_COMMITTED_IN_MEMORY) - { - /* Victim was meanwhile rolled back or - committed */ - trx_mutex_exit(victim_trx); - lock_mutex_exit(); - wsrep_thd_UNLOCK(thd); - victim_trx= NULL; - } - } else { - /* find_thread_by_id locked - THD::LOCK_thd_data */ - wsrep_thd_UNLOCK(thd); - } - } + DBUG_ENTER("wsrep_innobase_kill_one_trx"); + THD *thd = (THD *) victim_trx->mysql_thd; + int64_t bf_seqno = wsrep_thd_trx_seqno(bf_thd); - if (!victim_trx) { - /* Victim trx might not exist (MDL-conflict) or victim - was meanwhile rolled back or committed because of - a KILL statement or a disconnect. */ - goto ret; + if (!thd) { + DBUG_PRINT("wsrep", ("no thd for conflicting lock")); + WSREP_WARN("no THD for trx: " TRX_ID_FMT, victim_trx->id); + DBUG_VOID_RETURN; } + WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); + WSREP_DEBUG("BF kill (" ULINTPF ", seqno: " INT64PF "), victim: (%lu) trx: " TRX_ID_FMT, - arg->signal, arg->bf_seqno, + signal, bf_seqno, thd_get_thread_id(thd), victim_trx->id); WSREP_DEBUG("Aborting query: %s conf %d trx: %" PRId64, - (wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void", + (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void", wsrep_thd_conflict_state(thd, FALSE), wsrep_thd_ws_handle(thd)->trx_id); + wsrep_thd_LOCK(thd); + DBUG_EXECUTE_IF("sync.wsrep_after_BF_victim_lock", + { + const char act[]= + "now " + "wait_for signal.wsrep_after_BF_victim_lock"; + DBUG_ASSERT(!debug_sync_set_action(bf_thd, + STRING_WITH_LEN(act))); + };); + + if (wsrep_thd_query_state(thd) == QUERY_EXITING) { WSREP_DEBUG("kill trx EXITING for " TRX_ID_FMT, victim_trx->id); - goto ret_unlock; + wsrep_thd_UNLOCK(thd); + DBUG_VOID_RETURN; } if (wsrep_thd_exec_mode(thd) != LOCAL_STATE) { @@ -19590,13 +19582,18 @@ static void bg_wsrep_kill_trx( case MUST_ABORT: WSREP_DEBUG("victim " TRX_ID_FMT " in MUST ABORT state", victim_trx->id); - goto ret_awake; + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + DBUG_VOID_RETURN; + break; case ABORTED: case ABORTING: // fall through default: WSREP_DEBUG("victim " TRX_ID_FMT " in state %d", victim_trx->id, wsrep_thd_get_conflict_state(thd)); - goto ret_unlock; + wsrep_thd_UNLOCK(thd); + DBUG_VOID_RETURN; + break; } switch (wsrep_thd_query_state(thd)) { @@ -19609,12 +19606,12 @@ static void bg_wsrep_kill_trx( victim_trx->id); if (wsrep_thd_exec_mode(thd) == REPL_RECV) { - wsrep_abort_slave_trx(arg->bf_seqno, + wsrep_abort_slave_trx(bf_seqno, wsrep_thd_trx_seqno(thd)); } else { wsrep_t *wsrep= get_wsrep(); rcode = wsrep->abort_pre_commit( - wsrep, arg->bf_seqno, + wsrep, bf_seqno, (wsrep_trx_id_t)wsrep_thd_ws_handle(thd)->trx_id ); @@ -19623,7 +19620,10 @@ static void bg_wsrep_kill_trx( WSREP_DEBUG("cancel commit warning: " TRX_ID_FMT, victim_trx->id); - goto ret_awake; + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + DBUG_VOID_RETURN; + break; case WSREP_OK: break; default: @@ -19636,9 +19636,12 @@ static void bg_wsrep_kill_trx( * kill the lock holder first. */ abort(); + break; } } - goto ret_awake; + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + break; case QUERY_EXEC: /* it is possible that victim trx is itself waiting for some * other lock. We need to cancel this waiting @@ -19659,20 +19662,26 @@ static void bg_wsrep_kill_trx( lock_cancel_waiting_and_release(wait_lock); } + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); } else { /* abort currently executing query */ DBUG_PRINT("wsrep",("sending KILL_QUERY to: %lu", thd_get_thread_id(thd))); WSREP_DEBUG("kill query for: %ld", thd_get_thread_id(thd)); + /* Note that innobase_kill_query will take lock_mutex + and trx_mutex */ + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); /* for BF thd, we need to prevent him from committing */ if (wsrep_thd_exec_mode(thd) == REPL_RECV) { - wsrep_abort_slave_trx(arg->bf_seqno, + wsrep_abort_slave_trx(bf_seqno, wsrep_thd_trx_seqno(thd)); } } - goto ret_awake; + break; case QUERY_IDLE: { WSREP_DEBUG("kill IDLE for " TRX_ID_FMT, victim_trx->id); @@ -19680,9 +19689,10 @@ static void bg_wsrep_kill_trx( if (wsrep_thd_exec_mode(thd) == REPL_RECV) { WSREP_DEBUG("kill BF IDLE, seqno: %lld", (long long)wsrep_thd_trx_seqno(thd)); - wsrep_abort_slave_trx(arg->bf_seqno, + wsrep_thd_UNLOCK(thd); + wsrep_abort_slave_trx(bf_seqno, wsrep_thd_trx_seqno(thd)); - goto ret_unlock; + DBUG_VOID_RETURN; } /* This will lock thd from proceeding after net_read() */ wsrep_thd_set_conflict_state(thd, ABORTING); @@ -19703,67 +19713,17 @@ static void bg_wsrep_kill_trx( DBUG_PRINT("wsrep",("signalling wsrep rollbacker")); WSREP_DEBUG("signaling aborter"); wsrep_unlock_rollback(); - goto ret_unlock; + wsrep_thd_UNLOCK(thd); + + break; } default: WSREP_WARN("bad wsrep query state: %d", wsrep_thd_query_state(thd)); - goto ret_unlock; + wsrep_thd_UNLOCK(thd); + break; } -ret_awake: - awake= true; - -ret_unlock: - trx_mutex_exit(victim_trx); - lock_mutex_exit(); - if (awake) - wsrep_thd_awake(thd, arg->signal); - wsrep_thd_UNLOCK(thd); - -ret: - free(arg); - DBUG_VOID_RETURN; - -} - -/*******************************************************************//** -This function is used to kill one transaction in BF. */ -UNIV_INTERN -void -wsrep_innobase_kill_one_trx( -/*========================*/ - MYSQL_THD const bf_thd, - const trx_t * const bf_trx, - trx_t *victim_trx, - ibool signal) -{ - ut_ad(bf_thd); - ut_ad(victim_trx); - ut_ad(lock_mutex_own()); - ut_ad(trx_mutex_own(victim_trx)); - - bg_wsrep_kill_trx_arg *arg = (bg_wsrep_kill_trx_arg*)malloc(sizeof(*arg)); - arg->thd_id = thd_get_thread_id(victim_trx->mysql_thd); - arg->trx_id = victim_trx->id; - arg->bf_seqno = wsrep_thd_trx_seqno((THD*)bf_thd); - arg->signal = signal; - - DBUG_ENTER("wsrep_innobase_kill_one_trx"); - - WSREP_LOG_CONFLICT(bf_thd, victim_trx->mysql_thd, TRUE); - - DBUG_EXECUTE_IF("sync.wsrep_after_BF_victim_lock", - { - const char act[]= - "now " - "wait_for signal.wsrep_after_BF_victim_lock"; - DBUG_ASSERT(!debug_sync_set_action(bf_thd, - STRING_WITH_LEN(act))); - };); - - - mysql_manager_submit(bg_wsrep_kill_trx, arg); DBUG_VOID_RETURN; } @@ -19798,8 +19758,8 @@ wsrep_abort_transaction( WSREP_DEBUG("victim does not have transaction"); wsrep_thd_LOCK(victim_thd); wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT); - wsrep_thd_awake(victim_thd, signal); wsrep_thd_UNLOCK(victim_thd); + wsrep_thd_awake(victim_thd, signal); } DBUG_VOID_RETURN; -- cgit v1.2.1