From 1af6e92f0b106c0588f89c51b749c573262e82d1 Mon Sep 17 00:00:00 2001 From: sjaakola Date: Mon, 25 May 2020 14:23:42 +0300 Subject: MDEV-22666 galera.MW-328A hang The hang can happen between a lock connection issuing KILL CONNECTION for a victim, which is in committing phase. A two-resource deadlock happens, where the killer is holding the victim's LOCK_thd_data and requires the trx mutex of the victim. The victim, otoh, holds its own trx mutex, but requires LOCK_thd_data in wsrep_commit_ordered(). Hence a classic two thread deadlock happens. The fix in this commit changes innodb commit so that wsrep_commit_ordered() is not called while holding trx mutex. With this, wsrep patch commit time mutex locking does not violate the locking protocol of KILL command (i.e. LOCK_thd_data -> trx mutex) Also, a new test case has been added in galera.galera_bf_kill.test for a scenario where a client connection is killed in committing phase. --- mysql-test/suite/galera/r/galera_bf_kill.result | 17 ++++++++++ mysql-test/suite/galera/t/galera_bf_kill.test | 44 +++++++++++++++++++++++++ sql/service_wsrep.cc | 1 + storage/innobase/trx/trx0trx.cc | 11 +++---- 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/mysql-test/suite/galera/r/galera_bf_kill.result b/mysql-test/suite/galera/r/galera_bf_kill.result index 8b620323e35..2a7bc9eac29 100644 --- a/mysql-test/suite/galera/r/galera_bf_kill.result +++ b/mysql-test/suite/galera/r/galera_bf_kill.result @@ -70,3 +70,20 @@ a b 2 1 disconnect node_2a; drop table t1; +connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2; +connection node_2a; +CREATE TABLE t1 (i int primary key); +SET DEBUG_SYNC = "before_wsrep_ordered_commit SIGNAL bwoc_reached WAIT_FOR bwoc_continue"; +INSERT INTO t1 VALUES (1); +connection node_2; +SET DEBUG_SYNC = "now WAIT_FOR bwoc_reached"; +SET DEBUG_SYNC = "now SIGNAL bwoc_continue"; +SET DEBUG_SYNC='RESET'; +connection node_2a; +connection node_2; +select * from t1; +i +1 +disconnect node_2a; +connection node_2; 
+drop table t1; diff --git a/mysql-test/suite/galera/t/galera_bf_kill.test b/mysql-test/suite/galera/t/galera_bf_kill.test index 0748b732ead..ce8d27c281b 100644 --- a/mysql-test/suite/galera/t/galera_bf_kill.test +++ b/mysql-test/suite/galera/t/galera_bf_kill.test @@ -140,4 +140,48 @@ select * from t1; drop table t1; +# +# Test case 7: +# run a transaction in node 2, and set a sync point to pause the transaction +# in commit phase. +# Through another connection to node 2, kill the committing transaction by +# KILL QUERY command +# + +--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2 +--connection node_2a +--let $connection_id = `SELECT CONNECTION_ID()` + +CREATE TABLE t1 (i int primary key); + +# Set up sync point +SET DEBUG_SYNC = "before_wsrep_ordered_commit SIGNAL bwoc_reached WAIT_FOR bwoc_continue"; + +# Send insert which will block in the sync point above +--send INSERT INTO t1 VALUES (1) + +--connection node_2 +SET DEBUG_SYNC = "now WAIT_FOR bwoc_reached"; + +--disable_query_log +--disable_result_log +# victim has passed the point of no return, kill is not possible anymore +--eval KILL QUERY $connection_id +--enable_result_log +--enable_query_log + +SET DEBUG_SYNC = "now SIGNAL bwoc_continue"; +SET DEBUG_SYNC='RESET'; +--connection node_2a +--error 0,1213 +--reap + +--connection node_2 +# victim was able to complete the INSERT +select * from t1; + +--disconnect node_2a + +--connection node_2 +drop table t1; diff --git a/sql/service_wsrep.cc b/sql/service_wsrep.cc index 7cac2bf741b..ada0bde803f 100644 --- a/sql/service_wsrep.cc +++ b/sql/service_wsrep.cc @@ -299,6 +299,7 @@ extern "C" void wsrep_commit_ordered(THD *thd) thd->wsrep_trx().state() == wsrep::transaction::s_committing && !wsrep_commit_will_write_binlog(thd)) { + DEBUG_SYNC(thd, "before_wsrep_ordered_commit"); thd->wsrep_cs().ordered_commit(); } } diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 276a78d00bf..f926e661be4 100644 --- 
a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -1493,12 +1493,6 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr) if (fts_trx) trx_finalize_for_fts(this, undo_no != 0); - trx_mutex_enter(this); - dict_operation= TRX_DICT_OP_NONE; - - DBUG_LOG("trx", "Commit in memory: " << this); - state= TRX_STATE_NOT_STARTED; - #ifdef WITH_WSREP /* Serialization history has been written and the transaction is committed in memory, which makes this commit ordered. Release commit @@ -1510,6 +1504,11 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr) } lock.was_chosen_as_wsrep_victim= false; #endif /* WITH_WSREP */ + trx_mutex_enter(this); + dict_operation= TRX_DICT_OP_NONE; + + DBUG_LOG("trx", "Commit in memory: " << this); + state= TRX_STATE_NOT_STARTED; assert_freed(); trx_init(this); -- cgit v1.2.1 From e04999c4600cbfc83ff238028b820cf300487b18 Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Tue, 26 May 2020 14:01:13 +0200 Subject: Forgotten include files were added to check the necessary conditions for running the test --- mysql-test/suite/galera/t/galera_bf_kill.test | 3 +++ mysql-test/suite/galera/t/galera_bf_lock_wait.test | 1 + 2 files changed, 4 insertions(+) diff --git a/mysql-test/suite/galera/t/galera_bf_kill.test b/mysql-test/suite/galera/t/galera_bf_kill.test index ce8d27c281b..3eb3ddc32b5 100644 --- a/mysql-test/suite/galera/t/galera_bf_kill.test +++ b/mysql-test/suite/galera/t/galera_bf_kill.test @@ -1,4 +1,7 @@ --source include/galera_cluster.inc +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc # # Test case 1: Start a transaction on node_2a and kill it diff --git a/mysql-test/suite/galera/t/galera_bf_lock_wait.test b/mysql-test/suite/galera/t/galera_bf_lock_wait.test index a3903fd10c0..97d3b8e0710 100644 --- a/mysql-test/suite/galera/t/galera_bf_lock_wait.test +++ b/mysql-test/suite/galera/t/galera_bf_lock_wait.test @@ -1,4 +1,5 @@ --source 
include/galera_cluster.inc +--source include/have_innodb.inc --source include/big_test.inc --connection node_1 -- cgit v1.2.1