diff options
author | sjaakola <seppo.jaakola@iki.fi> | 2021-09-15 09:16:44 +0300 |
---|---|---|
committer | Jan Lindström <jan.lindstrom@mariadb.com> | 2021-09-24 09:47:31 +0300 |
commit | 88a4be75a5f3b8d59ac8f6347ff2c197813c05dc (patch) | |
tree | cc7d6614f8522f263663423ecd7c66e18717e6ac /storage/innobase/handler | |
parent | 9d97f92febc89941784d17d59c60275e21140ce0 (diff) | |
download | mariadb-git-88a4be75a5f3b8d59ac8f6347ff2c197813c05dc.tar.gz |
MDEV-25114 Crash: WSREP: invalid state ROLLED_BACK (FATAL)
This patch is the plan D variant for fixing potential mutex locking
order violations exercised by BF aborting and KILL command execution.
In this approach, KILL command is replicated as TOI operation.
This guarantees total isolation for the KILL command execution
in the first node: there is no concurrent replication applying
and no concurrent DDL executing. Therefore there is no risk of
BF aborting happening in parallel with KILL command execution
either. Potential mutex deadlocks between the different mutex
access paths with KILL command execution and BF aborting cannot
therefore happen.
TOI replication is used, in this approach, purely as a means
to provide isolated KILL command execution in the first node.
KILL command should not (and must not) be applied in secondary
nodes. In this patch, we ensure this by skipping KILL
execution in secondary nodes, in the applying phase, where we
bail out if an applier thread is trying to execute a KILL command.
This is effective, but skipping the applying of KILL command
could happen much earlier as well.
This patch also fixes mutex locking order and unprotected
THD member accesses in the BF aborting case. We try to hold
THD::LOCK_thd_data during BF aborting. The only case where this
is not possible is in wsrep_abort_transaction, before the
call to wsrep_innobase_kill_one_trx, where we take InnoDB
mutexes first and then THD::LOCK_thd_data.
This will also fix a possible race condition during
close_connection and while wsrep is disconnecting
connections.
Added wsrep_bf_kill_debug test case
Reviewed-by: Jan Lindström <jan.lindstrom@mariadb.com>
Diffstat (limited to 'storage/innobase/handler')
-rw-r--r-- | storage/innobase/handler/ha_innodb.cc | 188 |
1 files changed, 113 insertions, 75 deletions
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index d4142481a62..efcdd201fa6 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -5233,17 +5233,21 @@ UNIV_INTERN void lock_cancel_waiting_and_release(lock_t* lock); @sa THD::awake() @sa ha_kill_query() */ static void innobase_kill_query(handlerton*, THD* thd, enum thd_kill_levels) { - DBUG_ENTER("innobase_kill_query"); + DBUG_ENTER("innobase_kill_query"); #ifdef WITH_WSREP - if (wsrep_thd_get_conflict_state(thd) != NO_CONFLICT) { - /* if victim has been signaled by BF thread and/or aborting - is already progressing, following query aborting is not necessary - any more. - Also, BF thread should own trx mutex for the victim, which would - conflict with trx_mutex_enter() below - */ - DBUG_VOID_RETURN; - } + if (wsrep_thd_get_conflict_state(thd) != NO_CONFLICT) + { + /* if victim has been signaled by BF thread and/or aborting + is already progressing, following query aborting is not necessary + any more. E.g. wsrep_innobase_kill_one_trx(). + Also, BF thread should own trx mutex for the victim, which would + conflict with trx_mutex_enter() below + */ + WSREP_DEBUG("Victim thread %ld bail out conflict_state %s query %s", + thd_get_thread_id(thd), + wsrep_thd_conflict_state_str(thd), wsrep_thd_query(thd)); + DBUG_VOID_RETURN; + } #endif /* WITH_WSREP */ if (trx_t* trx= thd_to_trx(thd)) @@ -19520,9 +19524,9 @@ static struct st_mysql_storage_engine innobase_storage_engine= { MYSQL_HANDLERTON_INTERFACE_VERSION }; #ifdef WITH_WSREP +static void wsrep_abort_slave_trx( -/*==================*/ wsrep_seqno_t bf_seqno, wsrep_seqno_t victim_seqno) { @@ -19532,19 +19536,17 @@ wsrep_abort_slave_trx( "2) a bug in the code.\n\t" "3) a database corruption.\n Node consistency compromized, " "need to abort. 
Restart the node to resync with cluster.", - (long long)bf_seqno, (long long)victim_seqno); + bf_seqno, victim_seqno); abort(); } /*******************************************************************//** This function is used to kill one transaction in BF. */ -UNIV_INTERN void wsrep_innobase_kill_one_trx( -/*========================*/ MYSQL_THD const bf_thd, const trx_t * const bf_trx, trx_t *victim_trx, - ibool signal) + my_bool signal) { ut_ad(bf_thd); ut_ad(victim_trx); @@ -19552,38 +19554,41 @@ wsrep_innobase_kill_one_trx( ut_ad(trx_mutex_own(victim_trx)); DBUG_ENTER("wsrep_innobase_kill_one_trx"); - THD *thd = (THD *) victim_trx->mysql_thd; - int64_t bf_seqno = wsrep_thd_trx_seqno(bf_thd); + THD *thd= (THD *) victim_trx->mysql_thd; + int64_t bf_seqno= wsrep_thd_trx_seqno(bf_thd); if (!thd) { - DBUG_PRINT("wsrep", ("no thd for conflicting lock")); WSREP_WARN("no THD for trx: " TRX_ID_FMT, victim_trx->id); DBUG_VOID_RETURN; } - WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); - - WSREP_DEBUG("BF kill (" ULINTPF ", seqno: " INT64PF - "), victim: (%lu) trx: " TRX_ID_FMT, - signal, bf_seqno, - thd_get_thread_id(thd), - victim_trx->id); - - WSREP_DEBUG("Aborting query: %s conf %d trx: %" PRId64, - (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void", - wsrep_thd_conflict_state(thd, FALSE), - wsrep_thd_ws_handle(thd)->trx_id); - + /* Here we need to lock THD::LOCK_thd_data to protect from + concurrent usage or disconnect or delete. */ + DEBUG_SYNC(bf_thd, "wsrep_before_BF_victim_lock"); wsrep_thd_LOCK(thd); - DBUG_EXECUTE_IF("sync.wsrep_after_BF_victim_lock", - { - const char act[]= - "now " - "wait_for signal.wsrep_after_BF_victim_lock"; - DBUG_ASSERT(!debug_sync_set_action(bf_thd, - STRING_WITH_LEN(act))); - };); + DEBUG_SYNC(bf_thd, "wsrep_after_BF_victim_lock"); + + WSREP_DEBUG("Aborter %s trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld query_state: %s conflict_state: %s query: %s", + wsrep_thd_is_BF(bf_thd, false) ? "BF" : "normal", + bf_trx ? 
bf_trx->id : TRX_ID_MAX, + thd_get_thread_id(bf_thd), + bf_seqno, + wsrep_thd_query_state_str(bf_thd), + wsrep_thd_conflict_state_str(bf_thd), + wsrep_thd_query(bf_thd)); + + WSREP_DEBUG("Victim %s trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld query_state: %s conflict_state: %s query: %s", + wsrep_thd_is_BF(thd, false) ? "BF" : "normal", + victim_trx->id, + thd_get_thread_id(thd), + wsrep_thd_trx_seqno(thd), + wsrep_thd_query_state_str(thd), + wsrep_thd_conflict_state_str(thd), + wsrep_thd_query(thd)); + WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); if (wsrep_thd_query_state(thd) == QUERY_EXITING) { WSREP_DEBUG("kill trx EXITING for " TRX_ID_FMT, @@ -19593,27 +19598,32 @@ wsrep_innobase_kill_one_trx( } if (wsrep_thd_exec_mode(thd) != LOCAL_STATE) { - WSREP_DEBUG("withdraw for BF trx: " TRX_ID_FMT ", state: %d", + WSREP_DEBUG("withdraw for BF trx: " TRX_ID_FMT + ", state: %s exec %s", victim_trx->id, - wsrep_thd_get_conflict_state(thd)); + wsrep_thd_conflict_state_str(thd), + wsrep_thd_exec_mode_str(thd)); } switch (wsrep_thd_get_conflict_state(thd)) { case NO_CONFLICT: + /* This will cause any call to innobase_kill_query() + for this thd to bail out. 
*/ wsrep_thd_set_conflict_state(thd, MUST_ABORT); break; case MUST_ABORT: WSREP_DEBUG("victim " TRX_ID_FMT " in MUST ABORT state", victim_trx->id); - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); DBUG_VOID_RETURN; break; case ABORTED: case ABORTING: // fall through default: - WSREP_DEBUG("victim " TRX_ID_FMT " in state %d", - victim_trx->id, wsrep_thd_get_conflict_state(thd)); + WSREP_DEBUG("victim " TRX_ID_FMT " in state %s", + victim_trx->id, + wsrep_thd_conflict_state_str(thd)); wsrep_thd_UNLOCK(thd); DBUG_VOID_RETURN; break; @@ -19643,8 +19653,8 @@ wsrep_innobase_kill_one_trx( WSREP_DEBUG("cancel commit warning: " TRX_ID_FMT, victim_trx->id); - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); DBUG_VOID_RETURN; break; case WSREP_OK: @@ -19662,8 +19672,8 @@ wsrep_innobase_kill_one_trx( break; } } - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); break; case QUERY_EXEC: /* it is possible that victim trx is itself waiting for some @@ -19685,23 +19695,19 @@ wsrep_innobase_kill_one_trx( lock_cancel_waiting_and_release(wait_lock); } - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); } else { /* abort currently executing query */ - DBUG_PRINT("wsrep",("sending KILL_QUERY to: %lu", - thd_get_thread_id(thd))); WSREP_DEBUG("kill query for: %ld", thd_get_thread_id(thd)); - /* Note that innobase_kill_query will take lock_mutex - and trx_mutex */ - wsrep_thd_UNLOCK(thd); wsrep_thd_awake(thd, signal); + wsrep_thd_UNLOCK(thd); /* for BF thd, we need to prevent him from committing */ if (wsrep_thd_exec_mode(thd) == REPL_RECV) { wsrep_abort_slave_trx(bf_seqno, - wsrep_thd_trx_seqno(thd)); + wsrep_thd_trx_seqno(thd)); } } break; @@ -19711,29 +19717,27 @@ wsrep_innobase_kill_one_trx( if (wsrep_thd_exec_mode(thd) == REPL_RECV) { WSREP_DEBUG("kill BF IDLE, seqno: %lld", - (long long)wsrep_thd_trx_seqno(thd)); + wsrep_thd_trx_seqno(thd)); wsrep_thd_UNLOCK(thd); 
wsrep_abort_slave_trx(bf_seqno, wsrep_thd_trx_seqno(thd)); DBUG_VOID_RETURN; } - /* This will lock thd from proceeding after net_read() */ + /* This will lock thd from proceeding after net_read() + and innobase_kill_query to bail out for this thd. */ wsrep_thd_set_conflict_state(thd, ABORTING); wsrep_lock_rollback(); if (wsrep_aborting_thd_contains(thd)) { WSREP_WARN("duplicate thd aborter %lu", - (ulong) thd_get_thread_id(thd)); + thd_get_thread_id(thd)); } else { wsrep_aborting_thd_enqueue(thd); - DBUG_PRINT("wsrep",("enqueuing trx abort for %lu", - thd_get_thread_id(thd))); WSREP_DEBUG("enqueuing trx abort for (%lu)", - thd_get_thread_id(thd)); + thd_get_thread_id(thd)); } - DBUG_PRINT("wsrep",("signalling wsrep rollbacker")); WSREP_DEBUG("signaling aborter"); wsrep_unlock_rollback(); wsrep_thd_UNLOCK(thd); @@ -19741,10 +19745,7 @@ wsrep_innobase_kill_one_trx( break; } default: - WSREP_WARN("bad wsrep query state: %d", - wsrep_thd_query_state(thd)); - wsrep_thd_UNLOCK(thd); - break; + ut_error; } DBUG_VOID_RETURN; @@ -19753,7 +19754,6 @@ wsrep_innobase_kill_one_trx( static void wsrep_abort_transaction( -/*====================*/ handlerton* hton, THD *bf_thd, THD *victim_thd, @@ -19761,27 +19761,65 @@ wsrep_abort_transaction( { DBUG_ENTER("wsrep_abort_transaction"); - trx_t* victim_trx = thd_to_trx(victim_thd); - trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL; - - WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %d", - wsrep_thd_query(bf_thd), - wsrep_thd_query(victim_thd), - wsrep_thd_conflict_state(victim_thd, FALSE)); + ut_ad(bf_thd); + ut_ad(victim_thd); + trx_t* victim_trx= thd_to_trx(victim_thd); + trx_t* bf_trx= thd_to_trx(bf_thd); + + /* Here we should hold THD::LOCK_thd_data to protect + victim from concurrent usage or disconnect or delete. 
*/ + WSREP_DEBUG("wsrep_abort_transaction: BF:" + " thread %ld query_state %s conflict_state %s" + " exec %s query %s trx " TRX_ID_FMT, + thd_get_thread_id(bf_thd), + wsrep_thd_query_state_str(bf_thd), + wsrep_thd_conflict_state_str(bf_thd), + wsrep_thd_exec_mode_str(bf_thd), + wsrep_thd_query(bf_thd), + bf_trx ? bf_trx->id : 0); + + WSREP_DEBUG("wsrep_abort_transaction: victim:" + " thread %ld query_state %s conflict_state %s" + " exec %s query %s trx " TRX_ID_FMT, + thd_get_thread_id(victim_thd), + wsrep_thd_query_state_str(victim_thd), + wsrep_thd_conflict_state_str(victim_thd), + wsrep_thd_exec_mode_str(victim_thd), + wsrep_thd_query(victim_thd), + victim_trx ? victim_trx->id : 0); if (victim_trx) { + WSREP_DEBUG("wsrep_abort_transaction: Victim thread %ld " + "transaction " TRX_ID_FMT " trx_state %d", + thd_get_thread_id(victim_thd), + victim_trx->id, + victim_trx->state); + /* This is necessary as correct mutexing order is + lock_sys -> trx -> THD::LOCK_thd_data and below + function assumes we have lock_sys and trx locked + and takes THD::LOCK_thd_data for THD state check. */ + wsrep_thd_UNLOCK(victim_thd); + DEBUG_SYNC(bf_thd, "wsrep_abort_victim_unlocked"); + DBUG_EXECUTE_IF("wsrep_abort_replicated_sleep", + WSREP_DEBUG("wsrep_abort_transaction: sleeping " + "for thread %ld ", + thd_get_thread_id(victim_thd)); + my_sleep(100000);); lock_mutex_enter(); trx_mutex_enter(victim_trx); wsrep_innobase_kill_one_trx(bf_thd, bf_trx, victim_trx, signal); lock_mutex_exit(); trx_mutex_exit(victim_trx); wsrep_srv_conc_cancel_wait(victim_trx); + wsrep_thd_LOCK(victim_thd); DBUG_VOID_RETURN; } else { - WSREP_DEBUG("victim does not have transaction"); - wsrep_thd_LOCK(victim_thd); + WSREP_DEBUG("wsrep_abort_transaction: Victim thread %ld " + "no transaction", + thd_get_thread_id(victim_thd)); + /* This will cause any call to innobase_kill_query() + for this thd to bail out. 
*/ wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT); - wsrep_thd_UNLOCK(victim_thd); wsrep_thd_awake(victim_thd, signal); } |