From cdf19cd618ed23fbf7051130b2a6b587c4a4316b Mon Sep 17 00:00:00 2001 From: Brandon Nesterenko Date: Tue, 8 Feb 2022 09:53:56 -0700 Subject: MDEV-16091: Seconds_Behind_Master spikes to millions of seconds The rpl.rpl_seconds_behind_master_spike test would sometimes timeout or take a very long time to complete. This happened because an MTR DEBUG_SYNC signal would be lost due to a subsequent call to RESET. I.e., the slave SQL thread would be paused due to the WAIT_FOR signal being lost, resulting in either a failed test if the `select master_pos_wait` timeout occurs first, or a very long run-time if the DBUG_SYNC timeout occurs first. The fix ensures that the MTR signal is processed by the slave SQL thread before issuing the call to RESET Reviewed By: ============ Andrei Elkin --- .../suite/rpl/r/rpl_seconds_behind_master_spike.result | 5 +++-- .../suite/rpl/t/rpl_seconds_behind_master_spike.test | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/mysql-test/suite/rpl/r/rpl_seconds_behind_master_spike.result b/mysql-test/suite/rpl/r/rpl_seconds_behind_master_spike.result index be18f95c2c0..4eeb863bb40 100644 --- a/mysql-test/suite/rpl/r/rpl_seconds_behind_master_spike.result +++ b/mysql-test/suite/rpl/r/rpl_seconds_behind_master_spike.result @@ -29,9 +29,10 @@ count(*)=1 # not considered in Seconds_Behind_Master calculation connection slave1; # Safely resume slave SQL thread -SET @@global.debug_dbug=''; -SET DEBUG_SYNC='pause_sql_thread_on_fde CLEAR'; +# Prove SQL thread is in state "debug sync point: now" +SET @@global.debug_dbug="-d,pause_sql_thread_on_fde"; SET DEBUG_SYNC='now SIGNAL sql_thread_continue'; +# Wait for SQL thread to continue into normal execution SET DEBUG_SYNC='RESET'; connection master; DROP TABLE t1; diff --git a/mysql-test/suite/rpl/t/rpl_seconds_behind_master_spike.test b/mysql-test/suite/rpl/t/rpl_seconds_behind_master_spike.test index 029625a09ad..31a9478f632 100644 --- a/mysql-test/suite/rpl/t/rpl_seconds_behind_master_spike.test +++ b/mysql-test/suite/rpl/t/rpl_seconds_behind_master_spike.test @@ -71,10 +71,24 @@ if(`select $sbm > $t_now - $t_master_events_logged + 1`) } --echo # Safely resume slave SQL thread -SET @@global.debug_dbug=''; -SET DEBUG_SYNC='pause_sql_thread_on_fde CLEAR'; + +--let $dbug_wait_state="debug sync point: now" +--echo # Prove SQL thread is in state $dbug_wait_state +--let $wait_condition= SELECT STATE=$dbug_wait_state from information_schema.PROCESSLIST where COMMAND="Slave_SQL" +--source include/wait_condition.inc + +SET @@global.debug_dbug="-d,pause_sql_thread_on_fde"; SET DEBUG_SYNC='now SIGNAL sql_thread_continue'; +# We have to wait for the SQL thread to acknowledge the sql_thread_continue +# signal. Otherwise the below RESET command can overwrite the signal before +# the SQL thread is notified to proceed, causing it to "permanently" become +# stuck awaiting the signal (until timeout is reached). + +--echo # Wait for SQL thread to continue into normal execution +--let $wait_condition= SELECT STATE!= $dbug_wait_state from information_schema.PROCESSLIST where COMMAND="Slave_SQL" +--source include/wait_condition.inc + # Reset last sql_thread_continue signal SET DEBUG_SYNC='RESET'; -- cgit v1.2.1