diff options
author | Brandon Nesterenko <brandon.nesterenko@mariadb.com> | 2021-10-20 20:13:45 -0600 |
---|---|---|
committer | Brandon Nesterenko <brandon.nesterenko@mariadb.com> | 2022-01-05 13:41:58 -0700 |
commit | 515dc593ab47371875434e1de0f77125ce3dc6f4 (patch) | |
tree | 4daf7500ca090923ceab6d72a5b5e591e58e7851 | |
parent | c18896f9c1ce6e4b9a8519a2d5155698d82ae45a (diff) | |
download | mariadb-git-bb-10.4-MDEV-11853.tar.gz |
MDEV-11853: semisync thread can be killed after sync binlog but before ACK in the sync statebb-10.4-MDEV-11853
Problem:
========
If a primary is shutdown during an active semi-sync connection
during the period when the primary is awaiting an ACK, the primary
hard kills the active communication thread and does not ensure the
transaction was received by a replica. This can lead to an
inconsistent replication state.
Solution:
========
During shutdown, the primary should wait for an ACK or timeout
before hard killing a thread which is awaiting a communication. We
extend the `SHUTDOWN WAIT FOR SLAVES` logic to identify and ignore
any threads waiting for a semi-sync ACK in phase 1. Then, before
stopping the ack receiver thread, the shutdown is delayed until all
waiting semi-sync connections receive an ACK or time out. The
connections are then killed in phase 2.
Reviewed By:
============
TODO
-rw-r--r-- | mysql-test/suite/rpl/r/rpl_semi_sync_shutdown_await_ack.result | 516 | ||||
-rw-r--r-- | mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.cnf | 14 | ||||
-rw-r--r-- | mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.inc | 163 | ||||
-rw-r--r-- | mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.test | 213 | ||||
-rw-r--r-- | sql/mysqld.cc | 113 | ||||
-rw-r--r-- | sql/semisync_master.cc | 60 | ||||
-rw-r--r-- | sql/semisync_master.h | 13 | ||||
-rw-r--r-- | sql/semisync_slave.cc | 39 | ||||
-rw-r--r-- | sql/semisync_slave.h | 2 | ||||
-rw-r--r-- | sql/slave.cc | 20 | ||||
-rw-r--r-- | sql/sql_class.cc | 3 | ||||
-rw-r--r-- | sql/sql_class.h | 19 |
12 files changed, 1131 insertions, 44 deletions
diff --git a/mysql-test/suite/rpl/r/rpl_semi_sync_shutdown_await_ack.result b/mysql-test/suite/rpl/r/rpl_semi_sync_shutdown_await_ack.result new file mode 100644 index 00000000000..f8bbcfc10c5 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_semi_sync_shutdown_await_ack.result @@ -0,0 +1,516 @@ +############################# +# Common setup for all tests +############################# +# Note: Simulated slave delay is hardcoded to 800 milliseconds +# Note: Simulated master shutdown delay is hardcoded to 500 milliseconds +include/rpl_init.inc [topology=1->2, 1->3] +connection server_1; +# Slaves which simulate an error will produce a timeout on the primary +call mtr.add_suppression("Timeout waiting"); +# Suppress slave errors related to the simulated error +connection server_2; +call mtr.add_suppression("reply failed"); +call mtr.add_suppression("Replication event checksum verification"); +call mtr.add_suppression("Relay log write failure"); +call mtr.add_suppression("Failed to kill the active semi-sync connection"); +connection server_3; +call mtr.add_suppression("reply failed"); +call mtr.add_suppression("Replication event checksum verification"); +call mtr.add_suppression("Relay log write failure"); +call mtr.add_suppression("Failed to kill the active semi-sync connection"); +connection server_1; +CREATE TABLE t1 (a int); +connection server_2; +connection server_3; +connect server_1_con2, localhost, root,,; +############################# +# Test cases +############################# +# +# Test Case 1) If both replicas simulate a delay that is within the +# allowed timeout, the primary should delay killing the suspended thread +# until an ACK is received (Rpl_semi_sync_master_yes_tx should be 1). +# +connection server_1; +#-- +#-- Semi-sync Setup +connection server_1; +#-- Enable semi-sync on slaves +let slave_last= 3 +connection server_2; +set global rpl_semi_sync_slave_enabled = 1; +include/stop_slave.inc +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status ON +connection server_3; +set global rpl_semi_sync_slave_enabled = 1; +include/stop_slave.inc +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status ON +#-- Enable semi-sync on master +connection server_1; +SET @@GLOBAL.rpl_semi_sync_master_enabled = 1; +set @@global.rpl_semi_sync_master_timeout= 1600; +#-- Wait for master to recognize semi-sync slaves +connection server_1; +#-- Master should have semi-sync enabled with 2 connections +show status like 'Rpl_semi_sync_master_status'; +Variable_name Value +Rpl_semi_sync_master_status ON +show status like 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 2 +#-- Prepare servers to simulate delay or error +connection server_1; +SET @@GLOBAL.debug_dbug= ""; +connection server_2; +SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply"; +connection server_3; +SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply"; +#-- +#-- Test begins +connection server_1; +#-- Begin semi-sync transaction +INSERT INTO t1 VALUES (1); +connection server_1_con2; +#-- Wait until master recognizes a connection is awaiting semi-sync ACK +show status like 'Rpl_semi_sync_master_wait_sessions'; +Variable_name Value +Rpl_semi_sync_master_wait_sessions 1 +#-- Give enough time after timeout/ack received to query yes_tx/no_tx +SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait"; +#-- Begin master shutdown +SHUTDOWN WAIT FOR ALL SLAVES; +connection server_1; +#-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1) +show status like 'Rpl_semi_sync_master_yes_tx'; +Variable_name Value +Rpl_semi_sync_master_yes_tx 1 +show status like 'Rpl_semi_sync_master_no_tx'; +Variable_name Value +Rpl_semi_sync_master_no_tx 0 +connection server_1_con2; +# Check logs to ensure shutdown was delayed +FOUND 1 /Delaying shutdown to await semi-sync ACK/ in mysqld.1.err +# Validate slave data is in correct state +connection server_2; +select count(*)=1 from t1; +count(*)=1 +1 +connection server_3; +select count(*)=1 from t1; +count(*)=1 +1 +# +#-- Re-synchronize slaves with master and disable semi-sync +#-- Stop slaves +connection server_2; +SET @@GLOBAL.debug_dbug=""; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/stop_slave.inc +connection server_3; +SET @@GLOBAL.debug_dbug=""; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/stop_slave.inc +#-- Bring the master back up +connection server_1_con2; +connection default; +connection server_1; +SET @@GLOBAL.debug_dbug= ""; +SET @@GLOBAL.rpl_semi_sync_master_enabled = 0; +show status like 'Rpl_semi_sync_master_status'; +Variable_name Value +Rpl_semi_sync_master_status OFF +TRUNCATE TABLE t1; +#-- Bring slaves back up +connection server_2; +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status OFF +SELECT COUNT(*)=0 from t1; +COUNT(*)=0 +1 +connection server_3; +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status OFF +SELECT COUNT(*)=0 from t1; +COUNT(*)=0 +1 +# +# Test Case 2) If both replicas simulate an error before sending an ACK, +# the primary should delay killing the suspended thread until the +# timeout is reached (Rpl_semi_sync_master_no_tx should be 1). +# +connection server_1; +#-- +#-- Semi-sync Setup +connection server_1; +#-- Enable semi-sync on slaves +let slave_last= 3 +connection server_2; +set global rpl_semi_sync_slave_enabled = 1; +include/stop_slave.inc +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status ON +connection server_3; +set global rpl_semi_sync_slave_enabled = 1; +include/stop_slave.inc +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status ON +#-- Enable semi-sync on master +connection server_1; +SET @@GLOBAL.rpl_semi_sync_master_enabled = 1; +set @@global.rpl_semi_sync_master_timeout= 500; +#-- Wait for master to recognize semi-sync slaves +connection server_1; +#-- Master should have semi-sync enabled with 2 connections +show status like 'Rpl_semi_sync_master_status'; +Variable_name Value +Rpl_semi_sync_master_status ON +show status like 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 2 +#-- Prepare servers to simulate delay or error +connection server_1; +SET @@GLOBAL.debug_dbug= ""; +connection server_2; +SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event"; +connection server_3; +SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event"; +#-- +#-- Test begins +connection server_1; +#-- Begin semi-sync transaction +INSERT INTO t1 VALUES (1); +connection server_1_con2; +#-- Wait until master recognizes a connection is awaiting semi-sync ACK +show status like 'Rpl_semi_sync_master_wait_sessions'; +Variable_name Value +Rpl_semi_sync_master_wait_sessions 1 +#-- Give enough time after timeout/ack received to query yes_tx/no_tx +SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait"; +#-- Begin master shutdown +SHUTDOWN WAIT FOR ALL SLAVES; +connection server_1; +#-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1) +show status like 'Rpl_semi_sync_master_yes_tx'; +Variable_name Value +Rpl_semi_sync_master_yes_tx 0 +show status like 'Rpl_semi_sync_master_no_tx'; +Variable_name Value +Rpl_semi_sync_master_no_tx 1 +connection server_1_con2; +# Check logs to ensure shutdown was delayed +FOUND 2 /Delaying shutdown to await semi-sync ACK/ in mysqld.1.err +# Validate slave data is in correct state +connection server_2; +select count(*)=0 from t1; +count(*)=0 +1 +connection server_3; +select count(*)=0 from t1; +count(*)=0 +1 +# +#-- Re-synchronize slaves with master and disable semi-sync +#-- Stop slaves +connection server_2; +SET @@GLOBAL.debug_dbug=""; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/stop_slave.inc +connection server_3; +SET @@GLOBAL.debug_dbug=""; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/stop_slave.inc +#-- Bring the master back up +connection server_1_con2; +connection default; +connection server_1; +SET @@GLOBAL.debug_dbug= ""; +SET @@GLOBAL.rpl_semi_sync_master_enabled = 0; +show status like 'Rpl_semi_sync_master_status'; +Variable_name Value +Rpl_semi_sync_master_status OFF +TRUNCATE TABLE t1; +#-- Bring slaves back up +connection server_2; +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status OFF +SELECT COUNT(*)=0 from t1; +COUNT(*)=0 +1 +connection server_3; +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status OFF +SELECT COUNT(*)=0 from t1; +COUNT(*)=0 +1 +# +# Test Case 3) If one replica simulates a delay within the allowed +# timeout and the other simulates an error before sending an ACK, the +# primary should delay killing the suspended thread until it receives an +# ACK from the delayed slave (Rpl_semi_sync_master_yes_tx should be 1). +# +connection server_1; +#-- +#-- Semi-sync Setup +connection server_1; +#-- Enable semi-sync on slaves +let slave_last= 3 +connection server_2; +set global rpl_semi_sync_slave_enabled = 1; +include/stop_slave.inc +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status ON +connection server_3; +set global rpl_semi_sync_slave_enabled = 1; +include/stop_slave.inc +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status ON +#-- Enable semi-sync on master +connection server_1; +SET @@GLOBAL.rpl_semi_sync_master_enabled = 1; +set @@global.rpl_semi_sync_master_timeout= 1600; +#-- Wait for master to recognize semi-sync slaves +connection server_1; +#-- Master should have semi-sync enabled with 2 connections +show status like 'Rpl_semi_sync_master_status'; +Variable_name Value +Rpl_semi_sync_master_status ON +show status like 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 2 +#-- Prepare servers to simulate delay or error +connection server_1; +SET @@GLOBAL.debug_dbug= ""; +connection server_2; +SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event"; +connection server_3; +SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply"; +#-- +#-- Test begins +connection server_1; +#-- Begin semi-sync transaction +INSERT INTO t1 VALUES (1); +connection server_1_con2; +#-- Wait until master recognizes a connection is awaiting semi-sync ACK +show status like 'Rpl_semi_sync_master_wait_sessions'; +Variable_name Value +Rpl_semi_sync_master_wait_sessions 1 +#-- Give enough time after timeout/ack received to query yes_tx/no_tx +SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait"; +#-- Begin master shutdown +SHUTDOWN WAIT FOR ALL SLAVES; +connection server_1; +#-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1) +show status like 'Rpl_semi_sync_master_yes_tx'; +Variable_name Value +Rpl_semi_sync_master_yes_tx 1 +show status like 'Rpl_semi_sync_master_no_tx'; +Variable_name Value +Rpl_semi_sync_master_no_tx 0 +connection server_1_con2; +# Check logs to ensure shutdown was delayed +FOUND 3 /Delaying shutdown to await semi-sync ACK/ in mysqld.1.err +# Validate slave data is in correct state +connection server_2; +select count(*)=0 from t1; +count(*)=0 +1 +connection server_3; +select count(*)=1 from t1; +count(*)=1 +1 +# +#-- Re-synchronize slaves with master and disable semi-sync +#-- Stop slaves +connection server_2; +SET @@GLOBAL.debug_dbug=""; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/stop_slave.inc +connection server_3; +SET @@GLOBAL.debug_dbug=""; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/stop_slave.inc +#-- Bring the master back up +connection server_1_con2; +connection default; +connection server_1; +SET @@GLOBAL.debug_dbug= ""; +SET @@GLOBAL.rpl_semi_sync_master_enabled = 0; +show status like 'Rpl_semi_sync_master_status'; +Variable_name Value +Rpl_semi_sync_master_status OFF +TRUNCATE TABLE t1; +#-- Bring slaves back up +connection server_2; +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status OFF +SELECT COUNT(*)=0 from t1; +COUNT(*)=0 +1 +connection server_3; +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status OFF +SELECT COUNT(*)=0 from t1; +COUNT(*)=0 +1 +# +# Test Case 4) If a replica errors before sending an ACK, it will cause +# the IO thread to stop and handle the error. During error handling, if +# semi-sync is active, the replica will form a new connection with the +# primary to kill the active connection. However, if the primary is +# shutting down, it may kill the new connection, thereby leaving the +# active semi-sync connection in-tact. The slave should notice this, and +# not issue a `QUIT` command to the primary, which would otherwise be +# sent to kill an active connection. This test case validates that the +# slave does not send a `QUIT` in this case (Rpl_semi_sync_master_yes_tx +# should be 1 because server_3 will send the ACK within a valid timeout). +# +connection server_1; +#-- +#-- Semi-sync Setup +connection server_1; +#-- Enable semi-sync on slaves +let slave_last= 3 +connection server_2; +set global rpl_semi_sync_slave_enabled = 1; +include/stop_slave.inc +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status ON +connection server_3; +set global rpl_semi_sync_slave_enabled = 1; +include/stop_slave.inc +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status ON +#-- Enable semi-sync on master +connection server_1; +SET @@GLOBAL.rpl_semi_sync_master_enabled = 1; +set @@global.rpl_semi_sync_master_timeout= 1600; +#-- Wait for master to recognize semi-sync slaves +connection server_1; +#-- Master should have semi-sync enabled with 2 connections +show status like 'Rpl_semi_sync_master_status'; +Variable_name Value +Rpl_semi_sync_master_status ON +show status like 'Rpl_semi_sync_master_clients'; +Variable_name Value +Rpl_semi_sync_master_clients 2 +#-- Prepare servers to simulate delay or error +connection server_1; +SET @@GLOBAL.debug_dbug= "+d,mysqld_delay_kill_threads_phase_1"; +connection server_2; +SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event,slave_delay_killing_semisync_connection"; +connection server_3; +SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply"; +#-- +#-- Test begins +connection server_1; +#-- Begin semi-sync transaction +INSERT INTO t1 VALUES (1); +connection server_1_con2; +#-- Wait until master recognizes a connection is awaiting semi-sync ACK +show status like 'Rpl_semi_sync_master_wait_sessions'; +Variable_name Value +Rpl_semi_sync_master_wait_sessions 1 +#-- Give enough time after timeout/ack received to query yes_tx/no_tx +SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait"; +#-- Begin master shutdown +SHUTDOWN WAIT FOR ALL SLAVES; +connection server_1; +#-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1) +show status like 'Rpl_semi_sync_master_yes_tx'; +Variable_name Value +Rpl_semi_sync_master_yes_tx 1 +show status like 'Rpl_semi_sync_master_no_tx'; +Variable_name Value +Rpl_semi_sync_master_no_tx 0 +connection server_1_con2; +# Check logs to ensure shutdown was delayed +FOUND 4 /Delaying shutdown to await semi-sync ACK/ in mysqld.1.err +# Validate slave data is in correct state +connection server_2; +select count(*)=0 from t1; +count(*)=0 +1 +connection server_3; +select count(*)=1 from t1; +count(*)=1 +1 +# +#-- Re-synchronize slaves with master and disable semi-sync +#-- Stop slaves +connection server_2; +SET @@GLOBAL.debug_dbug=""; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/stop_slave.inc +connection server_3; +SET @@GLOBAL.debug_dbug=""; +SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0; +include/stop_slave.inc +#-- Bring the master back up +connection server_1_con2; +connection default; +connection server_1; +SET @@GLOBAL.debug_dbug= ""; +SET @@GLOBAL.rpl_semi_sync_master_enabled = 0; +show status like 'Rpl_semi_sync_master_status'; +Variable_name Value +Rpl_semi_sync_master_status OFF +TRUNCATE TABLE t1; +#-- Bring slaves back up +connection server_2; +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status OFF +SELECT COUNT(*)=0 from t1; +COUNT(*)=0 +1 +connection server_3; +include/start_slave.inc +show status like 'Rpl_semi_sync_slave_status'; +Variable_name Value +Rpl_semi_sync_slave_status OFF +SELECT COUNT(*)=0 from t1; +COUNT(*)=0 +1 +############################# +# Cleanup +############################# +connection server_2; +include/stop_slave.inc +include/start_slave.inc +connection server_3; +include/stop_slave.inc +include/start_slave.inc +connection server_1; +drop table t1; +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.cnf b/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.cnf new file mode 100644 index 00000000000..2cf1b1786bd --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.cnf @@ -0,0 +1,14 @@ +!include ../my.cnf + +[mysqld.1] +log_warnings=9 + +[mysqld.2] +log_warnings=9 + +[mysqld.3] +log_warnings=9 + +[ENV] +SERVER_MYPORT_3= @mysqld.3.port +SERVER_MYSOCK_3= @mysqld.3.socket diff --git a/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.inc b/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.inc new file mode 100644 index 00000000000..f4fed2c34ee --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.inc @@ -0,0 +1,163 @@ +# +# Helper file to ensure that a primary waits for all ACKS (or timeout) from its +# replicas before shutting down. +# +# Parameters: +# server_1_dbug (string) Debug setting for primary (server 1) +# server_2_dbug (string) Debug setting to simulate delay or error on +# the first replica (server 2) +# server_3_dbug (string) Debug setting to simulate delay or error on +# the second replica (server 3) +# semisync_timeout (int) Rpl_semi_sync_master_timeout to use +# server_2_expect_row_count (int) The number of rows expected on the first +# replica after the shutdown +# server_3_expect_row_count (int) The number of rows expected on the second +# replica after the shutdown +# + +--connection server_1 +let $log_error_file= `SELECT @@GLOBAL.log_error`; + +--echo #-- +--echo #-- Semi-sync Setup + +--connection server_1 +--save_master_pos + +echo #-- Enable semi-sync on slaves +let slave_last= 3; +--let i= 2 +while (`SELECT $i <= $slave_last`) +{ + --connection server_$i + --sync_with_master + + set global rpl_semi_sync_slave_enabled = 1; + source include/stop_slave.inc; + source include/start_slave.inc; + show status like 'Rpl_semi_sync_slave_status'; + + --inc $i +} + +--echo #-- Enable semi-sync on master +--connection server_1 +SET @@GLOBAL.rpl_semi_sync_master_enabled = 1; +--eval set @@global.rpl_semi_sync_master_timeout= $semisync_timeout + +--echo #-- Wait for master to recognize semi-sync slaves +--connection server_1 +let $status_var= Rpl_semi_sync_master_clients; +let $status_var_value= 2; +source include/wait_for_status_var.inc; + +--echo #-- Master should have semi-sync enabled with 2 connections +show status like 'Rpl_semi_sync_master_status'; +show status like 'Rpl_semi_sync_master_clients'; + +--echo #-- Prepare servers to simulate delay or error +--connection server_1 +--eval SET @@GLOBAL.debug_dbug= $server_1_dbug +--connection server_2 +--eval SET @@GLOBAL.debug_dbug= $server_2_dbug +--connection server_3 +--eval SET @@GLOBAL.debug_dbug= $server_3_dbug + +--echo #-- +--echo #-- Test begins + +--connection server_1 +--echo #-- Begin semi-sync transaction +--send INSERT INTO t1 VALUES (1) + +--connection server_1_con2 +--echo #-- Wait until master recognizes a connection is awaiting semi-sync ACK +let $status_var= Rpl_semi_sync_master_wait_sessions; +let $status_var_value= 1; +source include/wait_for_status_var.inc; +show status like 'Rpl_semi_sync_master_wait_sessions'; + +--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +wait +EOF + +--echo #-- Give enough time after timeout/ack received to query yes_tx/no_tx +SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait"; + +--echo #-- Begin master shutdown +--send SHUTDOWN WAIT FOR ALL SLAVES + +--connection server_1 +--reap +--echo #-- Ensure either ACK was received (yes_tx=1) or timeout (no_tx=1) +show status like 'Rpl_semi_sync_master_yes_tx'; +show status like 'Rpl_semi_sync_master_no_tx'; + +--connection server_1_con2 +--reap +--source include/wait_until_disconnected.inc + +--echo # Check logs to ensure shutdown was delayed +--let SEARCH_FILE=$log_error_file +--let SEARCH_PATTERN=Delaying shutdown to await semi-sync ACK +--source include/search_pattern_in_file.inc + +--echo # Validate slave data is in correct state +--connection server_2 +--eval select count(*)=$server_2_expect_row_count from t1 +--connection server_3 +--eval select count(*)=$server_3_expect_row_count from t1 + +--echo # +--echo #-- Re-synchronize slaves with master and disable semi-sync + +--echo #-- Stop slaves +--let i= 2 +while (`SELECT $i <= $slave_last`) +{ + --connection server_$i + SET @@GLOBAL.debug_dbug=""; + --eval SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0 + source include/stop_slave.inc; + + --inc $i +} + +--echo #-- Bring the master back up +--connection server_1_con2 +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +restart +EOF + +--enable_reconnect +--source include/wait_until_connected_again.inc + +--connection default +--enable_reconnect +--source include/wait_until_connected_again.inc + +--connection server_1 +--enable_reconnect +--source include/wait_until_connected_again.inc + +SET @@GLOBAL.debug_dbug= ""; +let $status_var= Rpl_semi_sync_master_clients; +let $status_var_value= 0; +source include/wait_for_status_var.inc; +--eval SET @@GLOBAL.rpl_semi_sync_master_enabled = 0 +show status like 'Rpl_semi_sync_master_status'; + +TRUNCATE TABLE t1; +--save_master_pos + +--echo #-- Bring slaves back up +--let i= 2 +while (`SELECT $i <= $slave_last`) +{ + --connection server_$i + source include/start_slave.inc; + show status like 'Rpl_semi_sync_slave_status'; + --sync_with_master + SELECT COUNT(*)=0 from t1; + --inc $i +} diff --git a/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.test b/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.test new file mode 100644 index 00000000000..1e03fe3fa0e --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_semi_sync_shutdown_await_ack.test @@ -0,0 +1,213 @@ +# +# Purpose: +# This test validates that data is consistent between a primary and replica +# in semi-sync mode when the primary is issued `SHUTDOWN WAIT FOR SLAVES` +# during an active communication. More specifically, the primary should not +# kill the connection until it is sure a replica has received all binlog +# data, i.e. once the primary receives the ACK. If a primary is issued a +# shutdown before receiving an ACK, it should wait until either 1) the ACK is +# received, or 2) the configured timeout (rpl_semi_sync_master_timeout) is +# reached. +# +# Methodology: +# Using a topology consisting of one primary with two replicas, all in +# semi-sync mode, we use DEBUG_DBUG to simulate an error or delay on the +# replicas during an active communication while the primary is issued +# `SHUTDOWN WAIT FOR SLAVES`. We create four test cases to ensure the primary +# will correctly wait for the communication to finish, and use the semi-sync +# status variables Rpl_semi_sync_master_yes_tx and Rpl_semi_sync_master_no_tx +# to ensure the connection was not prematurely killed due to the shutdown. +# Test Case 1) If both replicas simulate a delay that is within the allowed +# timeout, the primary should delay killing the suspended thread +# until an ACK is received (Rpl_semi_sync_master_yes_tx should +# be 1). +# Test Case 2) If both replicas simulate an error before sending an ACK, the +# primary should delay killing the suspended thread until the +# the timeout is reached (Rpl_semi_sync_master_no_tx should be +# 1). +# Test Case 3) If one replica simulates a delay within the allowed timeout +# and the other simulates an error before sending an ACK, the +# primary should delay killing the suspended thread until it +# receives an ACK from the delayed slave +# (Rpl_semi_sync_master_yes_tx should be 1). +# Test Case 4) If a replica errors before sending an ACK, it will cause the +# IO thread to stop and handle the error. During error handling, +# if semi-sync is active, the replica will form a new connection +# with the primary to kill the active connection. However, if +# the primary is shutting down, it may kill the new connection, +# thereby leaving the active semi-sync connection in-tact. The +# slave should notice this, and not issue a `QUIT` command to +# the primary, which would otherwise be sent to kill an active +# connection. This test case validates that the slave does not +# send a `QUIT` in this case (Rpl_semi_sync_master_yes_tx should +# be 1 because server_3 will send the ACK within a valid +# timeout). +# +# References: +# MDEV-11853: semisync thread can be killed after sync binlog but before ACK +# in the sync state +# + +--echo ############################# +--echo # Common setup for all tests +--echo ############################# + +--echo # Note: Simulated slave delay is hardcoded to 800 milliseconds +--echo # Note: Simulated master shutdown delay is hardcoded to 500 milliseconds + +--source include/have_debug.inc +--let $rpl_topology=1->2, 1->3 +--source include/rpl_init.inc + +--connection server_1 + +--echo # Slaves which simulate an error will produce a timeout on the primary +call mtr.add_suppression("Timeout waiting"); + +--let $sav_master_timeout= `SELECT @@global.rpl_semi_sync_master_timeout` +--let $sav_enabled_master= `SELECT @@GLOBAL.rpl_semi_sync_master_enabled` +--let $sav_master_dbug= `SELECT @@GLOBAL.debug_dbug` + +--echo # Suppress slave errors related to the simulated error +--connection server_2 +call mtr.add_suppression("reply failed"); +call mtr.add_suppression("Replication event checksum verification"); +call mtr.add_suppression("Relay log write failure"); +call mtr.add_suppression("Failed to kill the active semi-sync connection"); +--let $sav_enabled_server_2=`SELECT @@GLOBAL.rpl_semi_sync_slave_enabled` +--let $sav_server_2_dbug= `SELECT @@GLOBAL.debug_dbug` + +--connection server_3 +call mtr.add_suppression("reply failed"); +call mtr.add_suppression("Replication event checksum verification"); +call mtr.add_suppression("Relay log write failure"); +call mtr.add_suppression("Failed to kill the active semi-sync connection"); +--let $sav_enabled_server_3=`SELECT @@GLOBAL.rpl_semi_sync_slave_enabled` +--let $sav_server_3_dbug= `SELECT @@GLOBAL.debug_dbug` + +--connection server_1 +CREATE TABLE t1 (a int); +--save_master_pos + +--let i= 2 +--let slave_last= 3 +while (`SELECT $i <= $slave_last`) +{ + --connection server_$i + --sync_with_master + --inc $i +} + +# Set up the connection used to issue the shutdown +--connect(server_1_con2, localhost, root,,) + + +--echo ############################# +--echo # Test cases +--echo ############################# + +--echo # +--echo # Test Case 1) If both replicas simulate a delay that is within the +--echo # allowed timeout, the primary should delay killing the suspended thread +--echo # until an ACK is received (Rpl_semi_sync_master_yes_tx should be 1). +--echo # +--let server_1_dbug= "" +--let server_2_dbug= "+d,simulate_delay_semisync_slave_reply" +--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply" +--let semisync_timeout= 1600 +--let server_2_expect_row_count= 1 +--let server_3_expect_row_count= 1 +--source rpl_semi_sync_shutdown_await_ack.inc + +--echo # +--echo # Test Case 2) If both replicas simulate an error before sending an ACK, +--echo # the primary should delay killing the suspended thread until the +--echo # timeout is reached (Rpl_semi_sync_master_no_tx should be 1). +--echo # +--let server_1_dbug= "" +--let server_2_dbug= "+d,corrupt_queue_event" +--let server_3_dbug= "+d,corrupt_queue_event" +--let semisync_timeout= 500 +--let server_2_expect_row_count= 0 +--let server_3_expect_row_count= 0 +--source rpl_semi_sync_shutdown_await_ack.inc + +--echo # +--echo # Test Case 3) If one replica simulates a delay within the allowed +--echo # timeout and the other simulates an error before sending an ACK, the +--echo # primary should delay killing the suspended thread until it receives an +--echo # ACK from the delayed slave (Rpl_semi_sync_master_yes_tx should be 1). +--echo # +--let server_1_dbug= "" +--let server_2_dbug= "+d,corrupt_queue_event" +--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply" +--let semisync_timeout= 1600 +--let server_2_expect_row_count= 0 +--let server_3_expect_row_count= 1 +--source rpl_semi_sync_shutdown_await_ack.inc + +--echo # +--echo # Test Case 4) If a replica errors before sending an ACK, it will cause +--echo # the IO thread to stop and handle the error. During error handling, if +--echo # semi-sync is active, the replica will form a new connection with the +--echo # primary to kill the active connection. However, if the primary is +--echo # shutting down, it may kill the new connection, thereby leaving the +--echo # active semi-sync connection in-tact. The slave should notice this, and +--echo # not issue a `QUIT` command to the primary, which would otherwise be +--echo # sent to kill an active connection. This test case validates that the +--echo # slave does not send a `QUIT` in this case (Rpl_semi_sync_master_yes_tx +--echo # should be 1 because server_3 will send the ACK within a valid timeout). +--echo # + +# mysqld_delay_kill_threads_phase1 ensures that server_2 will have enough time +# to start a new connection that has the intent to kill the active semi-sync +# connection +--let server_1_dbug= "+d,mysqld_delay_kill_threads_phase_1" + +# slave_delay_killing_semisync_connection ensures that the primary has force +# killed its current connection before it is able to issue `KILL` +--let server_2_dbug= "+d,corrupt_queue_event,slave_delay_killing_semisync_connection" +--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply" +--let semisync_timeout= 1600 +--let server_2_expect_row_count= 0 +--let server_3_expect_row_count= 1 +--source rpl_semi_sync_shutdown_await_ack.inc + + +--echo ############################# +--echo # Cleanup +--echo ############################# + +--connection server_2 +source include/stop_slave.inc; +source include/start_slave.inc; + +--disable_query_log +--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled = $sav_enabled_server_2 +--eval SET @@GLOBAL.debug_dbug= "$sav_server_2_dbug" +--enable_query_log + +--connection server_3 +source include/stop_slave.inc; +source include/start_slave.inc; + +--disable_query_log +--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled = $sav_enabled_server_3 +--eval SET @@GLOBAL.debug_dbug= "$sav_server_3_dbug" +--enable_query_log + + +--connection server_1 +let $status_var= Rpl_semi_sync_master_clients; +let $status_var_value= 0; +source include/wait_for_status_var.inc; + +--disable_query_log +--eval SET @@GLOBAL.rpl_semi_sync_master_timeout= $sav_master_timeout +--eval SET @@GLOBAL.rpl_semi_sync_master_enabled= $sav_enabled_master +--eval SET @@GLOBAL.debug_dbug= "$sav_master_dbug" +--enable_query_log + +drop table t1; + +--source include/rpl_end.inc diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 310dcada671..10eb5f81d53 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -1524,10 +1524,20 @@ static void kill_thread(THD *thd) /** First shutdown everything but slave threads and binlog dump connections */ -static my_bool kill_thread_phase_1(THD *thd, void *) +static my_bool kill_thread_phase_1(THD *thd, DYNAMIC_ARRAY *phase_2_kill_threads) { DBUG_PRINT("quit", ("Informing thread %ld that it's time to die", (ulong) thd->thread_id)); + + if (thd->is_awaiting_semisync_ack()) + { + insert_dynamic(phase_2_kill_threads, (const void *) &(thd->thread_id)); + DBUG_PRINT("quit", + ("Thread %ld kill delayed to phase 2", (ulong) thd->thread_id)); + + return 0; + } + if (thd->slave_thread || thd->is_binlog_dump_thread()) return 0; @@ -1544,30 +1554,64 @@ static my_bool kill_thread_phase_1(THD *thd, void *) /** Last shutdown binlog dump connections */ -static my_bool kill_thread_phase_2(THD *thd, void *) +static my_bool kill_thread_phase_2(THD *thd, DYNAMIC_ARRAY *phase_2_kill_threads) { if (shutdown_wait_for_slaves) { - thd->set_killed(KILL_SERVER); - } - else - { - thd->set_killed(KILL_SERVER_HARD); - MYSQL_CALLBACK(thread_scheduler, post_kill_notification, (thd)); + uint i; + long long unsigned int test_tid; + bool hard_kill= FALSE; + + for(i= 0; i < phase_2_kill_threads->elements; i++) + { + get_dynamic(phase_2_kill_threads, (void *)(&test_tid), i); + if (test_tid == thd->thread_id) + { + hard_kill= TRUE; + break; + } + } + + if (!hard_kill) + { + thd->set_killed(KILL_SERVER); + goto end; + } } + + thd->set_killed(KILL_SERVER_HARD); + MYSQL_CALLBACK(thread_scheduler, post_kill_notification, (thd)); + +end: kill_thread(thd); return 0; } /* associated with the kill thread phase 1 */ -static my_bool warn_threads_active_after_phase_1(THD *thd, void *) +static my_bool warn_threads_active_after_phase_1(THD *thd, DYNAMIC_ARRAY *phase_2_kill_threads) { - if (!thd->is_binlog_dump_thread() && thd->vio_ok()) - sql_print_warning("%s: Thread %llu (user : '%s') did not exit\n", my_progname, - (ulonglong) thd->thread_id, - (thd->main_security_ctx.user ? - thd->main_security_ctx.user : "")); + uint i; + long long unsigned int test_tid; + + if (thd->is_binlog_dump_thread() || !thd->vio_ok()) + goto end; + + /* If the thread is identified for phase 2 kill */ + for(i= 0; i < phase_2_kill_threads->elements; i++) + { + get_dynamic(phase_2_kill_threads, (void *)(&test_tid), i); + if (test_tid == thd->thread_id) + goto end; + } + + /* Unknown reason for this thread being alive */ + sql_print_warning("%s: Thread %llu (user : '%s') did not exit\n", my_progname, + (ulonglong) thd->thread_id, + (thd->main_security_ctx.user ? + thd->main_security_ctx.user : "")); + +end: return 0; } @@ -1729,7 +1773,28 @@ static void close_connections(void) This will give the threads some time to gracefully abort their statements and inform their clients that the server is about to die. */ - server_threads.iterate(kill_thread_phase_1); + DYNAMIC_ARRAY phase_2_hard_kill_threads; + my_init_dynamic_array(&phase_2_hard_kill_threads, sizeof(long long unsigned int), 4, 4, MYF(0)); + DBUG_EXECUTE_IF("mysqld_delay_kill_threads_phase_1", my_sleep(200000);); + server_threads.iterate(kill_thread_phase_1, &phase_2_hard_kill_threads); + + /* + If we are waiting on any ACKs, delay killing the thread until either an ACK + is received or the timeout is hit. + + Allow at max the number of sessions to await a timeout; however, if all + ACKs have been received in less iterations, then quit early + */ + if (shutdown_wait_for_slaves) + { + int delay_attempts= rpl_semi_sync_master_wait_sessions; + if (delay_attempts) + sql_print_information("Delaying shutdown to await semi-sync ACK"); + + while (rpl_semi_sync_master_wait_sessions && (delay_attempts-- > 0)) + repl_semisync_master.await_slave_reply(); + } + DBUG_EXECUTE_IF("delay_shutdown_phase_2_after_semisync_wait", my_sleep(500000);); Events::deinit(); slave_prepare_for_shutdown(); @@ -1752,11 +1817,15 @@ static void close_connections(void) */ DBUG_PRINT("info", ("THD_count: %u", THD_count::value())); - for (int i= 0; (THD_count::value() - binlog_dump_thread_count) && i < 1000; i++) + for (int i= 0; (THD_count::value() - binlog_dump_thread_count - + phase_2_hard_kill_threads.elements) && + i < 1000; + i++) my_sleep(20000); if (global_system_variables.log_warnings) - server_threads.iterate(warn_threads_active_after_phase_1); + server_threads.iterate(warn_threads_active_after_phase_1, + &phase_2_hard_kill_threads); #ifdef WITH_WSREP if (wsrep_inited == 1) @@ -1765,13 +1834,17 @@ static void close_connections(void) } #endif /* All threads has now been aborted */ - DBUG_PRINT("quit", ("Waiting for threads to die (count=%u)", THD_count::value())); + DBUG_PRINT("quit", ("Waiting for threads to die (count=%u)", + THD_count::value() - binlog_dump_thread_count - + phase_2_hard_kill_threads.elements)); - while (THD_count::value() - binlog_dump_thread_count) + while (THD_count::value() - binlog_dump_thread_count - + phase_2_hard_kill_threads.elements) my_sleep(1000); /* Kill phase 2 */ - server_threads.iterate(kill_thread_phase_2); + server_threads.iterate(kill_thread_phase_2, &phase_2_hard_kill_threads); + delete_dynamic(&phase_2_hard_kill_threads); for (uint64 i= 0; THD_count::value(); i++) { /* diff --git a/sql/semisync_master.cc b/sql/semisync_master.cc index b239a9776a7..8c07775d254 100644 --- a/sql/semisync_master.cc +++ b/sql/semisync_master.cc @@ -463,6 +463,28 @@ void Repl_semi_sync_master::cleanup() delete m_active_tranxs; } +void Repl_semi_sync_master::create_timeout(struct timespec *out, + struct timespec *start_arg) +{ + struct timespec *start_ts; + struct timespec now_ts; + if (!start_arg) + { + set_timespec(now_ts, 0); + start_ts= &now_ts; + } + else + { + start_ts= start_arg; + } + + long diff_secs= (long) (m_wait_timeout / TIME_THOUSAND); + long diff_nsecs= (long) ((m_wait_timeout % TIME_THOUSAND) * TIME_MILLION); + long nsecs= start_ts->tv_nsec + diff_nsecs; + out->tv_sec= start_ts->tv_sec + diff_secs + nsecs / TIME_BILLION; + out->tv_nsec= nsecs % TIME_BILLION; +} + void Repl_semi_sync_master::lock() { mysql_mutex_lock(&LOCK_binlog); @@ -862,13 +884,6 @@ int Repl_semi_sync_master::commit_trx(const char* trx_wait_binlog_name, m_wait_file_name, (ulong)m_wait_file_pos)); } - /* Calcuate the waiting period. */ - long diff_secs = (long) (m_wait_timeout / TIME_THOUSAND); - long diff_nsecs = (long) ((m_wait_timeout % TIME_THOUSAND) * TIME_MILLION); - long nsecs = start_ts.tv_nsec + diff_nsecs; - abstime.tv_sec = start_ts.tv_sec + diff_secs + nsecs/TIME_BILLION; - abstime.tv_nsec = nsecs % TIME_BILLION; - /* In semi-synchronous replication, we wait until the binlog-dump * thread has received the reply on the relevant binlog segment from the * replication slave. @@ -876,16 +891,28 @@ int Repl_semi_sync_master::commit_trx(const char* trx_wait_binlog_name, * Let us suspend this thread to wait on the condition; * when replication has progressed far enough, we will release * these waiting threads. + * + * Additionally, the thread state and system variable which represent + * this suspended thread need to be synchronized to ensure this thread is + * not immediately killed while awaiting the ACK if a shutdown is issued. */ + mysql_mutex_lock(&thd->LOCK_thd_data); rpl_semi_sync_master_wait_sessions++; + thd->set_awaiting_semisync_ack(TRUE); + mysql_mutex_unlock(&thd->LOCK_thd_data); DBUG_PRINT("semisync", ("%s: wait %lu ms for binlog sent (%s, %lu)", "Repl_semi_sync_master::commit_trx", m_wait_timeout, m_wait_file_name, (ulong)m_wait_file_pos)); + create_timeout(&abstime, &start_ts); wait_result = cond_timewait(&abstime); + + mysql_mutex_lock(&thd->LOCK_thd_data); rpl_semi_sync_master_wait_sessions--; + thd->set_awaiting_semisync_ack(FALSE); + mysql_mutex_unlock(&thd->LOCK_thd_data); if (wait_result != 0) { @@ -1319,6 +1346,25 @@ void Repl_semi_sync_master::set_export_stats() unlock(); } +void Repl_semi_sync_master::await_slave_reply() +{ + struct timespec abstime; + + DBUG_ENTER("Repl_semi_sync_master::::await_slave_reply"); + lock(); + + /* Just return if there is nothing to wait for */ + if (!rpl_semi_sync_master_wait_sessions) + goto end; + + create_timeout(&abstime, NULL); + cond_timewait(&abstime); + +end: + unlock(); + DBUG_VOID_RETURN; +} + /* Get the waiting time given the wait's staring time. * * Return: diff --git a/sql/semisync_master.h b/sql/semisync_master.h index 74f6c24c8ea..3f0af1bfe17 100644 --- a/sql/semisync_master.h +++ b/sql/semisync_master.h @@ -472,6 +472,19 @@ class Repl_semi_sync_master m_wait_timeout = wait_timeout; } + /* + Calculates a timeout that is m_wait_timeout after start_arg and saves it + in out. If start_arg is NULL, the timeout is m_wait_timeout after the + current system time. + */ + void create_timeout(struct timespec *out, struct timespec *start_arg); + + /* + Blocks the calling thread until the ack_receiver either receives an ACK + or times out (from rpl_semi_sync_master_timeout) + */ + void await_slave_reply(); + /*set the ACK point, after binlog sync or after transaction commit*/ void set_wait_point(unsigned long ack_point) { diff --git a/sql/semisync_slave.cc b/sql/semisync_slave.cc index df8baf045ac..a34170182cd 100644 --- a/sql/semisync_slave.cc +++ b/sql/semisync_slave.cc @@ -114,11 +114,14 @@ int Repl_semi_sync_slave::slave_start(Master_info *mi) int Repl_semi_sync_slave::slave_stop(Master_info *mi) { + int ret= 0; + if (get_slave_enabled()) + ret= kill_connection(mi->mysql); + if (rpl_semi_sync_slave_status) rpl_semi_sync_slave_status= 0; - if (get_slave_enabled()) - kill_connection(mi->mysql); - return 0; + + return ret; } int Repl_semi_sync_slave::reset_slave(Master_info *mi) @@ -126,31 +129,39 @@ int Repl_semi_sync_slave::reset_slave(Master_info *mi) return 0; } -void Repl_semi_sync_slave::kill_connection(MYSQL *mysql) +int Repl_semi_sync_slave::kill_connection(MYSQL *mysql) { - if (!mysql) - return; - + bool ret= 0; char kill_buffer[30]; MYSQL *kill_mysql = NULL; + size_t kill_buffer_length; + + if (!mysql) + goto end; + kill_mysql = mysql_init(kill_mysql); mysql_options(kill_mysql, MYSQL_OPT_CONNECT_TIMEOUT, &m_kill_conn_timeout); mysql_options(kill_mysql, MYSQL_OPT_READ_TIMEOUT, &m_kill_conn_timeout); mysql_options(kill_mysql, MYSQL_OPT_WRITE_TIMEOUT, &m_kill_conn_timeout); - bool ret= (!mysql_real_connect(kill_mysql, mysql->host, + ret= (!mysql_real_connect(kill_mysql, mysql->host, mysql->user, mysql->passwd,0, mysql->port, mysql->unix_socket, 0)); - if (DBUG_EVALUATE_IF("semisync_slave_failed_kill", 1, 0) || ret) + if (DBUG_EVALUATE_IF("semisync_slave_failed_kill", (ret= 1), 0) || ret) { sql_print_information("cannot connect to master to kill slave io_thread's " "connection"); - mysql_close(kill_mysql); - return; + goto end; } - size_t kill_buffer_length = my_snprintf(kill_buffer, 30, "KILL %lu", - mysql->thread_id); - mysql_real_query(kill_mysql, kill_buffer, (ulong)kill_buffer_length); + + DBUG_EXECUTE_IF("slave_delay_killing_semisync_connection", my_sleep(400000);); + + kill_buffer_length= + my_snprintf(kill_buffer, 30, "KILL %lu", mysql->thread_id); + ret= mysql_real_query(kill_mysql, kill_buffer, (ulong)kill_buffer_length); + +end: mysql_close(kill_mysql); + return ret; } int Repl_semi_sync_slave::request_transmit(Master_info *mi) diff --git a/sql/semisync_slave.h b/sql/semisync_slave.h index d65262f151d..85f515caaf8 100644 --- a/sql/semisync_slave.h +++ b/sql/semisync_slave.h @@ -90,7 +90,7 @@ public: int slave_start(Master_info *mi); int slave_stop(Master_info *mi); int request_transmit(Master_info*); - void kill_connection(MYSQL *mysql); + int kill_connection(MYSQL *mysql); int reset_slave(Master_info *mi); private: diff --git a/sql/slave.cc b/sql/slave.cc index 68ddc611c03..0f86ea2b547 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -4522,6 +4522,7 @@ pthread_handler_t handle_slave_io(void *arg) uint retry_count; bool suppress_warnings; int ret; + int err_stopping_semisync; rpl_io_thread_info io_info; #ifndef DBUG_OFF mi->dbug_do_disconnect= false; @@ -4850,6 +4851,7 @@ Stopping slave I/O thread due to out-of-memory error from master"); not cause the slave IO thread to stop, and the error messages are already reported. */ + DBUG_EXECUTE_IF("simulate_delay_semisync_slave_reply", my_sleep(800000);); (void)repl_semisync_slave.slave_reply(mi); } @@ -4921,7 +4923,7 @@ err: tmp.c_ptr_safe()); sql_print_information("master was %s:%d", mi->host, mi->port); } - repl_semisync_slave.slave_stop(mi); + err_stopping_semisync= repl_semisync_slave.slave_stop(mi); thd->reset_query(); thd->reset_db(&null_clex_str); if (mysql) @@ -4937,6 +4939,22 @@ err: #ifdef SIGNAL_WITH_VIO_CLOSE thd->clear_active_vio(); #endif + /* + If repl_semisync_slave.slave_stop() fails, e.g. if the master force + killed the kill_mysql connection due to it actively shutting down, we + need to locally clean up our side of the connection before calling + mysql_close. This is because mysql_close will send COM_QUIT on the + current connection, which the master still assumes is semisync, and + will thereby error. + */ + if (err_stopping_semisync) + { + sql_print_information( + "Failed to gracefully kill our active semi-sync connection with " + "primary. Silently closing the connection."); + net_clear(&(mysql->net),0); + end_server(mysql); + } mysql_close(mysql); mi->mysql=0; } diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 21546ddaebf..03482c6ea69 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -640,7 +640,8 @@ THD::THD(my_thread_id id, bool is_wsrep_applier) #ifdef HAVE_REPLICATION , current_linfo(0), - slave_info(0) + slave_info(0), + awaiting_semisync_ack(0) #endif #ifdef WITH_WSREP , diff --git a/sql/sql_class.h b/sql/sql_class.h index b39be36a9dd..9608c07b45a 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -4826,12 +4826,31 @@ public: LOG_INFO *current_linfo; Slave_info *slave_info; + /* + Indicates if this thread is suspended due to awaiting an ACK from a + replica. True if suspended, false otherwise. + */ + bool awaiting_semisync_ack; + void set_current_linfo(LOG_INFO *linfo); void reset_current_linfo() { set_current_linfo(0); } + void set_awaiting_semisync_ack(bool status) + { + mysql_mutex_assert_owner(&LOCK_thd_data); + awaiting_semisync_ack= status; + } + int register_slave(uchar *packet, size_t packet_length); void unregister_slave(); bool is_binlog_dump_thread(); + bool is_awaiting_semisync_ack() + { + mysql_mutex_lock(&LOCK_thd_data); + bool res= awaiting_semisync_ack; + mysql_mutex_unlock(&LOCK_thd_data); + return res; + } #endif inline ulong wsrep_binlog_format() const |