diff options
author | Brandon Nesterenko <brandon.nesterenko@mariadb.com> | 2022-03-03 09:16:21 -0700 |
---|---|---|
committer | Brandon Nesterenko <brandon.nesterenko@mariadb.com> | 2022-03-03 14:01:53 -0700 |
commit | f07f8ae9be46475cb6305e21c4bb044211e67d4e (patch) | |
tree | 2d000dada901ae7ed6361957cbc05c7b36ead0d6 | |
parent | 4c9ca39f8cfbbd55ab7c2be78aed6d662862ef5d (diff) | |
download | mariadb-git-bb-10.2-MDEV-27850.tar.gz |
MDEV-27850bb-10.2-MDEV-27850
Temporary commit for buildbot testing of DEBUG_SYNC
race detection and test fixes.
10 files changed, 130 insertions, 25 deletions
diff --git a/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test b/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test index 181759283e2..06cf7f1dd78 100644 --- a/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test +++ b/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test @@ -44,18 +44,36 @@ if (!$debug_sync_action) SET @old_dbug = @@global.debug_dbug; eval SET @@global.debug_dbug= "d,$dbug_sync_point"; ---source include/start_slave.inc +--echo # Ensure no initial DEBUG_SYNC state +let $wait_condition= select count(*)=0 from information_schema.processlist where state like "%debug%"; +source include/wait_condition.inc; + --echo slave is going to hang in get_master_version_and_clock +--send START SLAVE + +--connection slave1 +--echo # Wait for slave to hit the WAIT_FOR sync point in get_master_version_and_clock +let $wait_condition= select count(*)=1 from information_schema.processlist where state like "%debug%"; +source include/wait_condition.inc; + +--connection master + --let $rpl_server_number= 1 --source include/rpl_stop_server.inc +--connection slave1 --echo slave is unblocked eval SET DEBUG_SYNC=$debug_sync_action; +--echo # Waiting for DEBUG_SYNC signal to be acknowledged.. +let $wait_condition= select count(*)=0 from information_schema.processlist where state like "%debug%"; +source include/wait_condition.inc; + # Show slave last IO errno connection slave; +--reap --echo Check network error happened here # '2013' CR_SERVER_LOST # '2003' CR_CONN_HOST_ERROR diff --git a/mysql-test/extra/rpl_tests/rpl_parallel.inc b/mysql-test/extra/rpl_tests/rpl_parallel.inc index b88d2126d4d..e581c41f557 100644 --- a/mysql-test/extra/rpl_tests/rpl_parallel.inc +++ b/mysql-test/extra/rpl_tests/rpl_parallel.inc @@ -1547,7 +1547,6 @@ SELECT * FROM t4 ORDER BY a; SET debug_sync= 'now WAIT_FOR waiting'; --sync_with_master SELECT * FROM t4 ORDER BY a; -SET debug_sync= 'now SIGNAL cont'; # Re-spawn the worker threads to remove any DBUG injections or DEBUG_SYNC. --source include/stop_slave.inc @@ -1620,6 +1619,7 @@ SELECT * FROM t2 WHERE a=40 FOR UPDATE; # transaction in the previous relay log file, blocked on the SELECT FOR # UPDATE. SET debug_sync= 'now WAIT_FOR scheduled_gtid_0_x_100'; +SET GLOBAL debug_dbug="-d,rpl_parallel_scheduled_gtid_0_x_100"; # At this point, the SQL driver thread is in the new relay log file, while # the DELETE from the old relay log file is not yet complete. We will stop # the slave at this point. The bug was that the DELETE statement would @@ -1637,6 +1637,7 @@ send STOP SLAVE; # transaction (and not after a later transaction that might have been able # to set a fixed position). SET debug_sync= 'now WAIT_FOR wait_for_done_waiting'; +SET GLOBAL debug_dbug="-d,rpl_parallel_wait_for_done_trigger"; # Now release the row lock that was blocking the replication of DELETE. ROLLBACK; diff --git a/mysql-test/suite/binlog_encryption/rpl_parallel.result b/mysql-test/suite/binlog_encryption/rpl_parallel.result index 12e5455c6c1..179d539fd71 100644 --- a/mysql-test/suite/binlog_encryption/rpl_parallel.result +++ b/mysql-test/suite/binlog_encryption/rpl_parallel.result @@ -1168,7 +1168,6 @@ a b 4 4 5 NULL 6 NULL -SET debug_sync= 'now SIGNAL cont'; include/stop_slave.inc SET GLOBAL debug_dbug=@old_dbug; SET GLOBAL binlog_format= @old_format; @@ -1207,9 +1206,11 @@ a connection server_2; include/start_slave.inc SET debug_sync= 'now WAIT_FOR scheduled_gtid_0_x_100'; +SET GLOBAL debug_dbug="-d,rpl_parallel_scheduled_gtid_0_x_100"; STOP SLAVE; connection con_temp2; SET debug_sync= 'now WAIT_FOR wait_for_done_waiting'; +SET GLOBAL debug_dbug="-d,rpl_parallel_wait_for_done_trigger"; ROLLBACK; connection server_2; include/wait_for_slave_sql_to_stop.inc diff --git a/mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result b/mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result index 3dd5a3ea83c..485e09d0ce6 100644 --- a/mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result +++ b/mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result @@ -36,6 +36,7 @@ connection con_temp2; COMMIT; connection server_2; include/stop_slave.inc +SET debug_sync='now WAIT_FOR continue_worker'; include/assert.inc [table t1 should have zero rows where a>32] SELECT * FROM t1 WHERE a>32; a diff --git a/mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc b/mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc index 7a6a758a508..130551688b6 100644 --- a/mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc +++ b/mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc @@ -94,6 +94,14 @@ COMMIT; # Clean up. --connection server_2 --source include/stop_slave.inc + +# Here we manually must complete the DEBUG_SYNC communication from the last +# signal. This is because the last worker thread signalled 'continue_worker' +# after committing the transaction; however, the signal remains unacknowledged +# because the WAIT_FOR happens when a new GTID is processed, but there are no +# more events. +SET debug_sync='now WAIT_FOR continue_worker'; + --let $assert_cond= COUNT(*) = 0 FROM t1 WHERE a>32 --let $assert_text= table t1 should have zero rows where a>32 --source include/assert.inc diff --git a/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result b/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result index deea7caa12a..b96ecdee983 100644 --- a/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result +++ b/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result @@ -9,12 +9,18 @@ SET @saved_dbug = @@GLOBAL.debug_dbug; connection slave; include/stop_slave.inc SET @old_dbug = @@global.debug_dbug; -SET @@global.debug_dbug= "d,'debug_lock.before_get_UNIX_TIMESTAMP'"; -include/start_slave.inc +SET @@global.debug_dbug= "d,dbug.before_get_UNIX_TIMESTAMP"; +# Ensure no initial DEBUG_SYNC state slave is going to hang in get_master_version_and_clock +START SLAVE; +connection slave1; +# Wait for slave to hit the WAIT_FOR sync point in get_master_version_and_clock +connection master; include/rpl_stop_server.inc [server_number=1] +connection slave1; slave is unblocked SET DEBUG_SYNC='now SIGNAL signal.get_unix_timestamp'; +# Waiting for DEBUG_SYNC signal to be acknowledged.. connection slave; Check network error happened here include/wait_for_slave_io_error.inc [errno=1053, 2002, 2003, 2006, 2013] @@ -25,12 +31,18 @@ connection slave; connection slave; include/stop_slave.inc SET @old_dbug = @@global.debug_dbug; -SET @@global.debug_dbug= "d,'debug_lock.before_get_SERVER_ID'"; -include/start_slave.inc +SET @@global.debug_dbug= "d,dbug.before_get_SERVER_ID"; +# Ensure no initial DEBUG_SYNC state slave is going to hang in get_master_version_and_clock +START SLAVE; +connection slave1; +# Wait for slave to hit the WAIT_FOR sync point in get_master_version_and_clock +connection master; include/rpl_stop_server.inc [server_number=1] +connection slave1; slave is unblocked SET DEBUG_SYNC='now SIGNAL signal.get_server_id'; +# Waiting for DEBUG_SYNC signal to be acknowledged.. connection slave; Check network error happened here include/wait_for_slave_io_error.inc [errno=1053, 2002, 2003, 2006, 2013] diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result index 657b3ba7448..af89b15dcff 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel.result +++ b/mysql-test/suite/rpl/r/rpl_parallel.result @@ -1167,7 +1167,6 @@ a b 4 4 5 NULL 6 NULL -SET debug_sync= 'now SIGNAL cont'; include/stop_slave.inc SET GLOBAL debug_dbug=@old_dbug; SET GLOBAL binlog_format= @old_format; @@ -1206,9 +1205,11 @@ a connection server_2; include/start_slave.inc SET debug_sync= 'now WAIT_FOR scheduled_gtid_0_x_100'; +SET GLOBAL debug_dbug="-d,rpl_parallel_scheduled_gtid_0_x_100"; STOP SLAVE; connection con_temp2; SET debug_sync= 'now WAIT_FOR wait_for_done_waiting'; +SET GLOBAL debug_dbug="-d,rpl_parallel_wait_for_done_trigger"; ROLLBACK; connection server_2; include/wait_for_slave_sql_to_stop.inc diff --git a/mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result b/mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result index 3dd5a3ea83c..485e09d0ce6 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result +++ b/mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result @@ -36,6 +36,7 @@ connection con_temp2; COMMIT; connection server_2; include/stop_slave.inc +SET debug_sync='now WAIT_FOR continue_worker'; include/assert.inc [table t1 should have zero rows where a>32] SELECT * FROM t1 WHERE a>32; a diff --git a/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test b/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test index 0e37ac631ec..b0b7b23d955 100644 --- a/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test +++ b/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test @@ -30,14 +30,14 @@ call mtr.add_suppression("Slave I/O thread .* register on master"); SET @saved_dbug = @@GLOBAL.debug_dbug; # set up two parameters to pass into extra/rpl_tests/rpl_get_master_version_and_clock -let $dbug_sync_point= 'debug_lock.before_get_UNIX_TIMESTAMP'; +let $dbug_sync_point= dbug.before_get_UNIX_TIMESTAMP; let $debug_sync_action= 'now SIGNAL signal.get_unix_timestamp'; source extra/rpl_tests/rpl_get_master_version_and_clock.test; #Test case 2: Try to get the value of the SERVER_ID from master under network disconnection connection slave; -let $dbug_sync_point= 'debug_lock.before_get_SERVER_ID'; +let $dbug_sync_point= dbug.before_get_SERVER_ID; let $debug_sync_action= 'now SIGNAL signal.get_server_id'; source extra/rpl_tests/rpl_get_master_version_and_clock.test; diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc index cae39a9bf5c..3646e3ad2b3 100644 --- a/sql/debug_sync.cc +++ b/sql/debug_sync.cc @@ -71,11 +71,13 @@ struct st_debug_sync_signal_counter String signal_name; ulong suspended_threads; ulong resumed_threads; + ulong killed_threads; st_debug_sync_signal_counter(const String &sig) { suspended_threads= 0; resumed_threads= 0; + killed_threads= 0; signal_name.copy(sig); } @@ -618,8 +620,9 @@ static void debug_sync_reset(THD *thd) mysql_mutex_lock(&debug_sync_global.ds_mutex); /* - Bad synchronization state, a SIGNAL has been sent but never - acknowledged, the underlying test probably has a pattern similar to + This block validates that an existing signal which was not acknowledged + (i.e. by WAIT_FOR) is not being dropped. If you have encountered this + error, the underlying test may have a pattern similar to: SET DEBUG_SYNC= 'now SIGNAL sig1'; SET DEBUG_SYNC= 'RESET'; @@ -1497,8 +1500,10 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action) if (action->signal.length()) { /* - Bad synchronization state, a SIGNAL has been sent but never - acknowledged, the underlying test probably has a pattern similar to + This block validates that the new signal is not overwriting an existing + signal which was not acknowledged (i.e. by WAIT_FOR). If you + have encountered this error, the underlying test may have a pattern + similar to: SET DEBUG_SYNC= 'now SIGNAL sig1'; SET DEBUG_SYNC= 'now SIGNAL sig2'; @@ -1511,13 +1516,25 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action) have this pattern and can be used as a starting place to fix this, e.g. 1) main.query_cache_debug 2) main.partition_debug_sync + + Note that, if the new signal is the same as the current signal, this + check is skipped, as concurrent threads performing the same task can + send the same signal, e.g. in parallel replication. */ - if (debug_sync_global.cur_acks_needed) + if (debug_sync_global.ds_signal.length() && + stringcmp(&debug_sync_global.ds_signal, &action->signal)) { - sql_print_error("Lost DEBUG_SYNC signal '%s' due to SIGNAL '%s'", - debug_sync_global.ds_signal.ptr(), - action->signal.ptr()); - DBUG_ABORT(); + st_debug_sync_signal_counter *cur_sig_el= + find_or_create_signal_counter(debug_sync_global.ds_signal); + if (debug_sync_global.cur_acks_needed && !cur_sig_el->killed_threads) + { + sql_print_error("Lost DEBUG_SYNC signal '%s' due to SIGNAL '%s' " + "(needed %lu acks and got %lu)", + debug_sync_global.ds_signal.ptr(), + action->signal.ptr(), cur_sig_el->suspended_threads, + cur_sig_el->resumed_threads); + DBUG_ABORT(); + } } sig_el= find_or_create_signal_counter(action->signal); @@ -1532,12 +1549,26 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action) debug_sync_emergency_disable(); /* purecov: tested */ } + /* The WAIT_FOR hasn't yet executed, but we want to save that this signal was given */ debug_sync_global.cur_acks_needed= - sig_el->suspended_threads ? sig_el->suspended_threads : 1; + (sig_el->suspended_threads ? sig_el->suspended_threads : 1) - + sig_el->killed_threads; + + if (sig_el->killed_threads && !debug_sync_global.cur_acks_needed) + { + /* + No other threads need to be acknowledged by the signal, delete it + from the map here + */ + fprintf(stderr, "Deleting signal %s during SIGNALLING because of killed thread\n", sig_el->signal_name.ptr()); + my_hash_delete(&debug_sync_global.ds_signals_all, (uchar *) sig_el); + } + + fprintf(stderr, "Set cur_acks_needed=%lu for signal %s\n", debug_sync_global.cur_acks_needed, action->signal.ptr()); /* Wake threads waiting in a sync point. */ mysql_cond_broadcast(&debug_sync_global.ds_cond); @@ -1587,6 +1618,9 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action) sig_el= find_or_create_signal_counter(action->wait_for); sig_el->suspended_threads++; + fprintf(stderr, "suspending waiting for signal %s (total %lu)\n", + sig_el->signal_name.ptr(), sig_el->suspended_threads); + /* Wait until global signal string matches the wait_for string. Interrupt when thread or query is killed or facility disabled. @@ -1627,12 +1661,40 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action) debug_sync_global.cur_acks_needed= sig_el->suspended_threads - sig_el->resumed_threads; - /* - If all suspended threads have acknowledged the signal, delete the - counter - */ - if (sig_el->resumed_threads == sig_el->suspended_threads) + fprintf(stderr, + "Signal %s finished waiting (%lu of %lu) cur acks is %lu killed " + "is %d\n", + sig_el->signal_name.ptr(), sig_el->resumed_threads, + sig_el->suspended_threads, debug_sync_global.cur_acks_needed, + thd->killed + ); + + if (thd->killed) + { + /* + A thread which was awaiting a signal was forcibly killed, we should + make note of this in case the corresponding signal is given later + + We do not want to remove the signal from the map yet because the + signal can be given later. + + ALSO maybe rewrite this? + */ + sig_el->killed_threads++; + } + else if (sig_el->resumed_threads == sig_el->suspended_threads) + { + /* + If all suspended threads have acknowledged the signal, delete the + counter + */ + fprintf( + stderr, + "Removing signal %s because it has been fully ACKd\n", + sig_el->signal_name.ptr()); my_hash_delete(&debug_sync_global.ds_signals_all, (uchar*) sig_el); + } + DBUG_EXECUTE("debug_sync_exec", if (thd->killed) |