diff options
author | Sergei Golubchik <serg@mariadb.org> | 2021-02-07 17:48:58 +0100 |
---|---|---|
committer | Sergei Golubchik <serg@mariadb.org> | 2021-02-12 18:17:06 +0100 |
commit | eac8341df4c3c7b98360f4e9498acf393dc055e3 (patch) | |
tree | 6e68141971047c34568b42451f1510ba906d4497 | |
parent | 9703cffa8cb57e2fe29719f4aae3282bfae82878 (diff) | |
download | mariadb-git-eac8341df4c3c7b98360f4e9498acf393dc055e3.tar.gz |
MDEV-23328 Server hang due to Galera lock conflict resolution
adaptation of 29bbcac0ee8 for 10.4
-rw-r--r-- | mysql-test/suite/galera/t/galera_bf_kill_debug.test | 2 | ||||
-rw-r--r-- | sql/log_event.cc | 22 | ||||
-rw-r--r-- | sql/service_wsrep.cc | 16 | ||||
-rw-r--r-- | sql/slave.cc | 2 | ||||
-rw-r--r-- | sql/sql_class.cc | 11 | ||||
-rw-r--r-- | sql/sql_class.h | 4 | ||||
-rw-r--r-- | sql/sql_parse.cc | 4 | ||||
-rw-r--r-- | sql/sql_repl.cc | 2 | ||||
-rw-r--r-- | sql/wsrep_client_service.cc | 7 | ||||
-rw-r--r-- | sql/wsrep_server_service.cc | 9 | ||||
-rw-r--r-- | storage/innobase/handler/ha_innodb.cc | 208 |
11 files changed, 185 insertions, 102 deletions
diff --git a/mysql-test/suite/galera/t/galera_bf_kill_debug.test b/mysql-test/suite/galera/t/galera_bf_kill_debug.test index b687a5a6a67..c322f283757 100644 --- a/mysql-test/suite/galera/t/galera_bf_kill_debug.test +++ b/mysql-test/suite/galera/t/galera_bf_kill_debug.test @@ -84,7 +84,7 @@ SET DEBUG_SYNC = "now SIGNAL continue_kill"; --reap --connection node_2a ---error 0,1213 +--error 0,1213,2013 select * from t1; --connection node_2 diff --git a/sql/log_event.cc b/sql/log_event.cc index 89ffebf2659..337de3508ed 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -8983,8 +8983,20 @@ err: } #endif /* MYSQL_CLIENT */ - #if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT) +static bool wsrep_must_replay(THD *thd) +{ +#ifdef WITH_WSREP + mysql_mutex_lock(&thd->LOCK_thd_data); + bool res= WSREP(thd) && thd->wsrep_trx().state() == wsrep::transaction::s_must_replay; + mysql_mutex_unlock(&thd->LOCK_thd_data); + return res; +#else + return false; +#endif +} + + int Xid_log_event::do_apply_event(rpl_group_info *rgi) { bool res; @@ -9049,14 +9061,8 @@ int Xid_log_event::do_apply_event(rpl_group_info *rgi) res= trans_commit(thd); /* Automatically rolls back on error. */ thd->release_transactional_locks(); - mysql_mutex_lock(&thd->LOCK_thd_data); -#ifdef WITH_WSREP - if (sub_id && (!res || (WSREP(thd) && thd->wsrep_trx().state() == wsrep::transaction::s_must_replay))) -#else - if (sub_id && !res) -#endif /* WITH_WSREP */ + if (sub_id && (!res || wsrep_must_replay(thd))) rpl_global_gtid_slave_state->update_state_hash(sub_id, >id, hton, rgi); - mysql_mutex_unlock(&thd->LOCK_thd_data); /* Increment the global status commit count variable */ diff --git a/sql/service_wsrep.cc b/sql/service_wsrep.cc index f0a4cf81c02..80f164855b2 100644 --- a/sql/service_wsrep.cc +++ b/sql/service_wsrep.cc @@ -210,16 +210,8 @@ extern "C" void wsrep_handle_SR_rollback(THD *bf_thd, extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd, my_bool signal) { - DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", - { - const char act[]= - "now " - "SIGNAL sync.before_wsrep_thd_abort_reached " - "WAIT_FOR signal.before_wsrep_thd_abort"; - DBUG_ASSERT(!debug_sync_set_action(bf_thd, - STRING_WITH_LEN(act))); - };); - + mysql_mutex_assert_owner(&victim_thd->LOCK_thd_kill); + mysql_mutex_assert_not_owner(&victim_thd->LOCK_thd_data); my_bool ret= wsrep_bf_abort(bf_thd, victim_thd); /* Send awake signal if victim was BF aborted or does not @@ -228,8 +220,6 @@ extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd, */ if ((ret || !wsrep_on(victim_thd)) && signal) { - mysql_mutex_assert_not_owner(&victim_thd->LOCK_thd_data); - mysql_mutex_assert_not_owner(&victim_thd->LOCK_thd_kill); mysql_mutex_lock(&victim_thd->LOCK_thd_data); if (victim_thd->wsrep_aborter && victim_thd->wsrep_aborter != bf_thd->thread_id) @@ -240,10 +230,8 @@ extern "C" my_bool wsrep_thd_bf_abort(THD *bf_thd, THD *victim_thd, return false; } - mysql_mutex_lock(&victim_thd->LOCK_thd_kill); victim_thd->wsrep_aborter= bf_thd->thread_id; victim_thd->awake_no_mutex(KILL_QUERY); - mysql_mutex_unlock(&victim_thd->LOCK_thd_kill); mysql_mutex_unlock(&victim_thd->LOCK_thd_data); } else { WSREP_DEBUG("wsrep_thd_bf_abort skipped awake"); diff --git a/sql/slave.cc b/sql/slave.cc index 372e46acd1d..31bd9372a14 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -1069,8 +1069,8 @@ terminate_slave_thread(THD *thd, int error __attribute__((unused)); DBUG_PRINT("loop", ("killing slave thread")); - mysql_mutex_lock(&thd->LOCK_thd_data); mysql_mutex_lock(&thd->LOCK_thd_kill); + mysql_mutex_lock(&thd->LOCK_thd_data); #ifndef DONT_USE_THR_ALARM /* Error codes from pthread_kill are: diff --git a/sql/sql_class.cc b/sql/sql_class.cc index dc3903661e1..81718595fec 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -799,6 +799,7 @@ THD::THD(my_thread_id id, bool is_wsrep_applier) mysql_mutex_init(key_LOCK_wakeup_ready, &LOCK_wakeup_ready, MY_MUTEX_INIT_FAST); mysql_mutex_init(key_LOCK_thd_kill, &LOCK_thd_kill, MY_MUTEX_INIT_FAST); mysql_cond_init(key_COND_wakeup_ready, &COND_wakeup_ready, 0); + mysql_mutex_record_order(&LOCK_thd_kill, &LOCK_thd_data); /* Variables with default values */ proc_info="login"; @@ -5058,11 +5059,13 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd) #ifdef WITH_WSREP /* wsrep applier, replayer and TOI processing threads are ordered by replication provider, relaxed GAP locking protocol can be used - between high priority wsrep threads + between high priority wsrep threads. + Note that wsrep_thd_is_BF() doesn't take LOCK_thd_data for either thd, + the caller should guarantee that the BF state won't change. + (e.g. InnoDB does it by keeping lock_sys.mutex locked) */ - if (WSREP_ON && - wsrep_thd_is_BF(const_cast<THD *>(thd), false) && - wsrep_thd_is_BF(const_cast<THD *>(other_thd), true)) + if (WSREP_ON && wsrep_thd_is_BF(thd, false) && + wsrep_thd_is_BF(other_thd, false)) return 0; #endif /* WITH_WSREP */ rgi= thd->rgi_slave; diff --git a/sql/sql_class.h b/sql/sql_class.h index 39d6ec1027f..4eabd3da450 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -3309,11 +3309,11 @@ public: void awake_no_mutex(killed_state state_to_set); void awake(killed_state state_to_set) { - mysql_mutex_lock(&LOCK_thd_data); mysql_mutex_lock(&LOCK_thd_kill); + mysql_mutex_lock(&LOCK_thd_data); awake_no_mutex(state_to_set); - mysql_mutex_unlock(&LOCK_thd_kill); mysql_mutex_unlock(&LOCK_thd_data); + mysql_mutex_unlock(&LOCK_thd_kill); } void abort_current_cond_wait(bool force); diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index dd0e5cfa34e..d71d29bc85a 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -9102,8 +9102,8 @@ static my_bool find_thread_with_thd_data_lock_callback(THD *thd, find_thread_cal { if (arg->id == (arg->query_id ? thd->query_id : (longlong) thd->thread_id)) { - mysql_mutex_lock(&thd->LOCK_thd_data); mysql_mutex_lock(&thd->LOCK_thd_kill); // Lock from delete + mysql_mutex_lock(&thd->LOCK_thd_data); // XXX DELME arg->thd= thd; return 1; } @@ -9238,8 +9238,8 @@ static my_bool kill_threads_callback(THD *thd, kill_threads_callback_arg *arg) return 1; if (!arg->threads_to_kill.push_back(thd, arg->thd->mem_root)) { - mysql_mutex_lock(&thd->LOCK_thd_data); mysql_mutex_lock(&thd->LOCK_thd_kill); // Lock from delete + mysql_mutex_lock(&thd->LOCK_thd_data); } } } diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index a0d8b4ca6d1..6a6cfb2aa5f 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -3474,8 +3474,8 @@ static my_bool kill_callback(THD *thd, kill_callback_arg *arg) thd->variables.server_id == arg->slave_server_id) { arg->thd= thd; - mysql_mutex_lock(&thd->LOCK_thd_data); mysql_mutex_lock(&thd->LOCK_thd_kill); // Lock from delete + mysql_mutex_lock(&thd->LOCK_thd_data); return 1; } return 0; diff --git a/sql/wsrep_client_service.cc b/sql/wsrep_client_service.cc index 245fc1487ca..89621619a23 100644 --- a/sql/wsrep_client_service.cc +++ b/sql/wsrep_client_service.cc @@ -69,20 +69,13 @@ bool Wsrep_client_service::interrupted( wsrep::unique_lock<wsrep::mutex>& lock WSREP_UNUSED) const { DBUG_ASSERT(m_thd == current_thd); - /* Underlying mutex in lock object points to LOCK_thd_data, which - protects m_thd->wsrep_trx(), LOCK_thd_kill protects m_thd->killed. - Locking order is: - 1) LOCK_thd_data - 2) LOCK_thd_kill */ mysql_mutex_assert_owner(static_cast<mysql_mutex_t*>(lock.mutex()->native())); - mysql_mutex_lock(&m_thd->LOCK_thd_kill); bool ret= (m_thd->killed != NOT_KILLED); if (ret) { WSREP_DEBUG("wsrep state is interrupted, THD::killed %d trx state %d", m_thd->killed, m_thd->wsrep_trx().state()); } - mysql_mutex_unlock(&m_thd->LOCK_thd_kill); return ret; } diff --git a/sql/wsrep_server_service.cc b/sql/wsrep_server_service.cc index cd432ab3eae..19259a43925 100644 --- a/sql/wsrep_server_service.cc +++ b/sql/wsrep_server_service.cc @@ -40,6 +40,7 @@ static void init_service_thd(THD* thd, char* thread_stack) thd->prior_thr_create_utime= thd->start_utime= microsecond_interval_timer(); thd->set_command(COM_SLEEP); thd->reset_for_next_command(true); + server_threads.insert(thd); // as wsrep_innobase_kill_one_trx() uses find_thread_by_id() } Wsrep_storage_service* @@ -79,6 +80,7 @@ void Wsrep_server_service::release_storage_service( static_cast<Wsrep_storage_service*>(storage_service); THD* thd= ss->m_thd; wsrep_reset_threadvars(thd); + server_threads.erase(thd); delete ss; delete thd; } @@ -92,7 +94,8 @@ wsrep_create_streaming_applier(THD *orig_thd, const char *ctx) streaming transaction is BF aborted and streaming applier is created from BF aborter context. */ Wsrep_threadvars saved_threadvars(wsrep_save_threadvars()); - wsrep_reset_threadvars(saved_threadvars.cur_thd); + if (saved_threadvars.cur_thd) + wsrep_reset_threadvars(saved_threadvars.cur_thd); THD *thd= 0; Wsrep_applier_service *ret= 0; if (!wsrep_create_threadvars() && @@ -109,7 +112,8 @@ wsrep_create_streaming_applier(THD *orig_thd, const char *ctx) } /* Restore original thread local storage state before returning. */ wsrep_restore_threadvars(saved_threadvars); - wsrep_store_threadvars(saved_threadvars.cur_thd); + if (saved_threadvars.cur_thd) + wsrep_store_threadvars(saved_threadvars.cur_thd); return ret; } @@ -138,6 +142,7 @@ void Wsrep_server_service::release_high_priority_service(wsrep::high_priority_se THD* thd= hps->m_thd; delete hps; wsrep_store_threadvars(thd); + server_threads.erase(thd); delete thd; wsrep_delete_threadvars(); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index cc9fa427168..8d55e6901a5 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -62,6 +62,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include <my_service_manager.h> #include <key.h> +#include <sql_manager.h> /* Include necessary InnoDB headers */ #include "btr0btr.h" @@ -5093,6 +5094,8 @@ static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels) if (lock_t *lock= trx->lock.wait_lock) { trx_mutex_enter(trx); + if (trx->is_wsrep() && wsrep_thd_is_aborting(thd)) + trx->lock.was_chosen_as_deadlock_victim= TRUE; lock_cancel_waiting_and_release(lock); trx_mutex_exit(trx); } @@ -18556,54 +18559,52 @@ static struct st_mysql_storage_engine innobase_storage_engine= #ifdef WITH_WSREP -/** This function is used to kill one transaction. - -This transaction was open on this node (not-yet-committed), and a -conflicting writeset from some other node that was being applied -caused a locking conflict. First committed (from other node) -wins, thus open transaction is rolled back. BF stands for -brute-force: any transaction can get aborted by galera any time -it is necessary. +struct bg_wsrep_kill_trx_arg { + my_thread_id thd_id, bf_thd_id; + trx_id_t trx_id, bf_trx_id; + bool signal; +}; -This conflict can happen only when the replicated writeset (from -other node) is being applied, not when it’s waiting in the queue. -If our local transaction reached its COMMIT and this conflicting -writeset was in the queue, then it should fail the local -certification test instead. +/** Kill one transaction from a background manager thread -A brute force abort is only triggered by a locking conflict -between a writeset being applied by an applier thread (slave thread) -and an open transaction on the node, not by a Galera writeset -comparison as in the local certification failure. +wsrep_innobase_kill_one_trx() is invoked when lock_sys.mutex and trx mutex +are taken, wsrep_thd_bf_abort() cannot be used there as it takes THD mutexes +that must be taken before lock_sys.mutex and trx mutex. That's why +wsrep_innobase_kill_one_trx only posts the killing task to the manager thread +and the actual killing happens asynchronously here. -@param[in] bf_thd Brute force (BF) thread -@param[in,out] victim_trx Vimtim trx to be killed -@param[in] signal Should victim be signaled */ -UNIV_INTERN -void -wsrep_innobase_kill_one_trx( - THD* bf_thd, - trx_t *victim_trx, - bool signal) +As no mutexes were held we don't know whether THD or trx pointers are still +valid, so we need to pass thread/trx ids and perform a lookup. +*/ +static void bg_wsrep_kill_trx(void *void_arg) { - ut_ad(bf_thd); - ut_ad(victim_trx); - ut_ad(lock_mutex_own()); - ut_ad(trx_mutex_own(victim_trx)); + bg_wsrep_kill_trx_arg *arg= (bg_wsrep_kill_trx_arg *)void_arg; + THD *thd, *bf_thd; + trx_t *victim_trx; + bool aborting= false; - DBUG_ENTER("wsrep_innobase_kill_one_trx"); + bf_thd= find_thread_by_id_with_thd_data_lock(arg->bf_thd_id); + thd= find_thread_by_id_with_thd_data_lock(arg->thd_id); - THD *thd= (THD *) victim_trx->mysql_thd; - ut_ad(thd); - /* Note that bf_trx might not exist here e.g. on MDL conflict - case (test: galera_concurrent_ctas). Similarly, BF thread - could be also acquiring MDL-lock causing victim to be - aborted. However, we have not yet called innobase_trx_init() - for BF transaction (test: galera_many_columns)*/ - trx_t* bf_trx= thd_to_trx(bf_thd); - DBUG_ASSERT(wsrep_on(bf_thd)); + if (!thd || !bf_thd || !(victim_trx= thd_to_trx(thd))) + goto ret0; - wsrep_thd_LOCK(thd); + lock_mutex_enter(); + trx_mutex_enter(victim_trx); + if (victim_trx->id != arg->trx_id) + { + /* apparently victim trx was meanwhile rolled back. + tell bf thd not to wait, in case it already started to */ + trx_t *trx= thd_to_trx(bf_thd); + if (lock_t *lock= trx->lock.wait_lock) { + trx_mutex_enter(trx); + lock_cancel_waiting_and_release(lock); + trx_mutex_exit(trx); + } + goto ret1; + } + + DBUG_ASSERT(wsrep_on(bf_thd)); WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); @@ -18611,7 +18612,7 @@ wsrep_innobase_kill_one_trx( "seqno: %lld client_state: %s client_mode: %s transaction_mode: %s " "query: %s", wsrep_thd_is_BF(bf_thd, false) ? "BF" : "normal", - bf_trx ? bf_trx->id : TRX_ID_MAX, + arg->bf_trx_id, thd_get_thread_id(bf_thd), wsrep_thd_trx_seqno(bf_thd), wsrep_thd_client_state_str(bf_thd), @@ -18636,28 +18637,84 @@ wsrep_innobase_kill_one_trx( if (wsrep_thd_set_wsrep_aborter(bf_thd, thd)) { WSREP_DEBUG("innodb kill transaction skipped due to wsrep_aborter set"); - wsrep_thd_UNLOCK(thd); - DBUG_VOID_RETURN; + goto ret1; } - /* Note that we need to release this as it will be acquired - below in wsrep-lib */ - wsrep_thd_UNLOCK(thd); - DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort"); + aborting= true; - if (wsrep_thd_bf_abort(bf_thd, thd, signal)) - { - lock_t* wait_lock = victim_trx->lock.wait_lock; - if (wait_lock) { - DBUG_ASSERT(victim_trx->is_wsrep()); - WSREP_DEBUG("victim has wait flag: %lu", - thd_get_thread_id(thd)); - - WSREP_DEBUG("canceling wait lock"); - victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; - lock_cancel_waiting_and_release(wait_lock); +ret1: + trx_mutex_exit(victim_trx); + lock_mutex_exit(); +ret0: + if (thd) { + wsrep_thd_UNLOCK(thd); + if (aborting) { + DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort"); + wsrep_thd_bf_abort(bf_thd, thd, arg->signal); } + wsrep_thd_kill_UNLOCK(thd); } + if (bf_thd) { + wsrep_thd_UNLOCK(bf_thd); + wsrep_thd_kill_UNLOCK(bf_thd); + } + free(arg); +} + +/** This function is used to kill one transaction. + +This transaction was open on this node (not-yet-committed), and a +conflicting writeset from some other node that was being applied +caused a locking conflict. First committed (from other node) +wins, thus open transaction is rolled back. BF stands for +brute-force: any transaction can get aborted by galera any time +it is necessary. + +This conflict can happen only when the replicated writeset (from +other node) is being applied, not when it’s waiting in the queue. +If our local transaction reached its COMMIT and this conflicting +writeset was in the queue, then it should fail the local +certification test instead. + +A brute force abort is only triggered by a locking conflict +between a writeset being applied by an applier thread (slave thread) +and an open transaction on the node, not by a Galera writeset +comparison as in the local certification failure. + +@param[in] bf_thd Brute force (BF) thread +@param[in,out] victim_trx Vimtim trx to be killed +@param[in] signal Should victim be signaled */ +void +wsrep_innobase_kill_one_trx( + THD* bf_thd, + trx_t *victim_trx, + bool signal) +{ + ut_ad(bf_thd); + ut_ad(victim_trx); + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(victim_trx)); + + DBUG_ENTER("wsrep_innobase_kill_one_trx"); + + DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", + { + const char act[]= + "now " + "SIGNAL sync.before_wsrep_thd_abort_reached " + "WAIT_FOR signal.before_wsrep_thd_abort"; + DBUG_ASSERT(!debug_sync_set_action(bf_thd, + STRING_WITH_LEN(act))); + };); + + trx_t* bf_trx= thd_to_trx(bf_thd); + bg_wsrep_kill_trx_arg *arg = (bg_wsrep_kill_trx_arg*)malloc(sizeof(*arg)); + arg->thd_id = thd_get_thread_id(victim_trx->mysql_thd); + arg->trx_id = victim_trx->id; + arg->bf_thd_id = thd_get_thread_id(bf_thd); + arg->bf_trx_id = bf_trx ? bf_trx->id : TRX_ID_MAX; + arg->signal = signal; + mysql_manager_submit(bg_wsrep_kill_trx, arg); DBUG_VOID_RETURN; } @@ -18693,13 +18750,44 @@ wsrep_abort_transaction( if (victim_trx) { lock_mutex_enter(); trx_mutex_enter(victim_trx); - wsrep_innobase_kill_one_trx(bf_thd, victim_trx, signal); + victim_trx->lock.was_chosen_as_wsrep_victim= true; trx_mutex_exit(victim_trx); lock_mutex_exit(); + + wsrep_thd_kill_LOCK(victim_thd); + wsrep_thd_LOCK(victim_thd); + bool aborting= !wsrep_thd_set_wsrep_aborter(bf_thd, victim_thd); + wsrep_thd_UNLOCK(victim_thd); + if (aborting) { + DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort"); + DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", + { + const char act[]= + "now " + "SIGNAL sync.before_wsrep_thd_abort_reached " + "WAIT_FOR signal.before_wsrep_thd_abort"; + DBUG_ASSERT(!debug_sync_set_action(bf_thd, + STRING_WITH_LEN(act))); + };); + wsrep_thd_bf_abort(bf_thd, victim_thd, signal); + } + wsrep_thd_kill_UNLOCK(victim_thd); + wsrep_srv_conc_cancel_wait(victim_trx); DBUG_VOID_RETURN; } else { + DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", + { + const char act[]= + "now " + "SIGNAL sync.before_wsrep_thd_abort_reached " + "WAIT_FOR signal.before_wsrep_thd_abort"; + DBUG_ASSERT(!debug_sync_set_action(bf_thd, + STRING_WITH_LEN(act))); + };); + wsrep_thd_kill_LOCK(victim_thd); wsrep_thd_bf_abort(bf_thd, victim_thd, signal); + wsrep_thd_kill_UNLOCK(victim_thd); } DBUG_VOID_RETURN; |