summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrandon Nesterenko <brandon.nesterenko@mariadb.com>2022-03-03 09:16:21 -0700
committerBrandon Nesterenko <brandon.nesterenko@mariadb.com>2022-03-03 14:01:53 -0700
commitf07f8ae9be46475cb6305e21c4bb044211e67d4e (patch)
tree2d000dada901ae7ed6361957cbc05c7b36ead0d6
parent4c9ca39f8cfbbd55ab7c2be78aed6d662862ef5d (diff)
downloadmariadb-git-bb-10.2-MDEV-27850.tar.gz
Temporary commit for buildbot testing of DEBUG_SYNC race detection and test fixes.
-rw-r--r--mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test20
-rw-r--r--mysql-test/extra/rpl_tests/rpl_parallel.inc3
-rw-r--r--mysql-test/suite/binlog_encryption/rpl_parallel.result3
-rw-r--r--mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result1
-rw-r--r--mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc8
-rw-r--r--mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result20
-rw-r--r--mysql-test/suite/rpl/r/rpl_parallel.result3
-rw-r--r--mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result1
-rw-r--r--mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test4
-rw-r--r--sql/debug_sync.cc92
10 files changed, 130 insertions, 25 deletions
diff --git a/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test b/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test
index 181759283e2..06cf7f1dd78 100644
--- a/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test
+++ b/mysql-test/extra/rpl_tests/rpl_get_master_version_and_clock.test
@@ -44,18 +44,36 @@ if (!$debug_sync_action)
SET @old_dbug = @@global.debug_dbug;
eval SET @@global.debug_dbug= "d,$dbug_sync_point";
---source include/start_slave.inc
+--echo # Ensure no initial DEBUG_SYNC state
+let $wait_condition= select count(*)=0 from information_schema.processlist where state like "%debug%";
+source include/wait_condition.inc;
+
--echo slave is going to hang in get_master_version_and_clock
+--send START SLAVE
+
+--connection slave1
+--echo # Wait for slave to hit the WAIT_FOR sync point in get_master_version_and_clock
+let $wait_condition= select count(*)=1 from information_schema.processlist where state like "%debug%";
+source include/wait_condition.inc;
+
+--connection master
+
--let $rpl_server_number= 1
--source include/rpl_stop_server.inc
+--connection slave1
--echo slave is unblocked
eval SET DEBUG_SYNC=$debug_sync_action;
+--echo # Waiting for DEBUG_SYNC signal to be acknowledged..
+let $wait_condition= select count(*)=0 from information_schema.processlist where state like "%debug%";
+source include/wait_condition.inc;
+
# Show slave last IO errno
connection slave;
+--reap
--echo Check network error happened here
# '2013' CR_SERVER_LOST
# '2003' CR_CONN_HOST_ERROR
diff --git a/mysql-test/extra/rpl_tests/rpl_parallel.inc b/mysql-test/extra/rpl_tests/rpl_parallel.inc
index b88d2126d4d..e581c41f557 100644
--- a/mysql-test/extra/rpl_tests/rpl_parallel.inc
+++ b/mysql-test/extra/rpl_tests/rpl_parallel.inc
@@ -1547,7 +1547,6 @@ SELECT * FROM t4 ORDER BY a;
SET debug_sync= 'now WAIT_FOR waiting';
--sync_with_master
SELECT * FROM t4 ORDER BY a;
-SET debug_sync= 'now SIGNAL cont';
# Re-spawn the worker threads to remove any DBUG injections or DEBUG_SYNC.
--source include/stop_slave.inc
@@ -1620,6 +1619,7 @@ SELECT * FROM t2 WHERE a=40 FOR UPDATE;
# transaction in the previous relay log file, blocked on the SELECT FOR
# UPDATE.
SET debug_sync= 'now WAIT_FOR scheduled_gtid_0_x_100';
+SET GLOBAL debug_dbug="-d,rpl_parallel_scheduled_gtid_0_x_100";
# At this point, the SQL driver thread is in the new relay log file, while
# the DELETE from the old relay log file is not yet complete. We will stop
# the slave at this point. The bug was that the DELETE statement would
@@ -1637,6 +1637,7 @@ send STOP SLAVE;
# transaction (and not after a later transaction that might have been able
# to set a fixed position).
SET debug_sync= 'now WAIT_FOR wait_for_done_waiting';
+SET GLOBAL debug_dbug="-d,rpl_parallel_wait_for_done_trigger";
# Now release the row lock that was blocking the replication of DELETE.
ROLLBACK;
diff --git a/mysql-test/suite/binlog_encryption/rpl_parallel.result b/mysql-test/suite/binlog_encryption/rpl_parallel.result
index 12e5455c6c1..179d539fd71 100644
--- a/mysql-test/suite/binlog_encryption/rpl_parallel.result
+++ b/mysql-test/suite/binlog_encryption/rpl_parallel.result
@@ -1168,7 +1168,6 @@ a b
4 4
5 NULL
6 NULL
-SET debug_sync= 'now SIGNAL cont';
include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
SET GLOBAL binlog_format= @old_format;
@@ -1207,9 +1206,11 @@ a
connection server_2;
include/start_slave.inc
SET debug_sync= 'now WAIT_FOR scheduled_gtid_0_x_100';
+SET GLOBAL debug_dbug="-d,rpl_parallel_scheduled_gtid_0_x_100";
STOP SLAVE;
connection con_temp2;
SET debug_sync= 'now WAIT_FOR wait_for_done_waiting';
+SET GLOBAL debug_dbug="-d,rpl_parallel_wait_for_done_trigger";
ROLLBACK;
connection server_2;
include/wait_for_slave_sql_to_stop.inc
diff --git a/mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result b/mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result
index 3dd5a3ea83c..485e09d0ce6 100644
--- a/mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result
+++ b/mysql-test/suite/binlog_encryption/rpl_parallel_ignored_errors.result
@@ -36,6 +36,7 @@ connection con_temp2;
COMMIT;
connection server_2;
include/stop_slave.inc
+SET debug_sync='now WAIT_FOR continue_worker';
include/assert.inc [table t1 should have zero rows where a>32]
SELECT * FROM t1 WHERE a>32;
a
diff --git a/mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc b/mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc
index 7a6a758a508..130551688b6 100644
--- a/mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc
+++ b/mysql-test/suite/rpl/include/rpl_parallel_ignored_errors.inc
@@ -94,6 +94,14 @@ COMMIT;
# Clean up.
--connection server_2
--source include/stop_slave.inc
+
+# Here we manually must complete the DEBUG_SYNC communication from the last
+# signal. This is because the last worker thread signalled 'continue_worker'
+# after committing the transaction; however, the signal remains unacknowledged
+# because the WAIT_FOR happens when a new GTID is processed, but there are no
+# more events.
+SET debug_sync='now WAIT_FOR continue_worker';
+
--let $assert_cond= COUNT(*) = 0 FROM t1 WHERE a>32
--let $assert_text= table t1 should have zero rows where a>32
--source include/assert.inc
diff --git a/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result b/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result
index deea7caa12a..b96ecdee983 100644
--- a/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result
+++ b/mysql-test/suite/rpl/r/rpl_get_master_version_and_clock.result
@@ -9,12 +9,18 @@ SET @saved_dbug = @@GLOBAL.debug_dbug;
connection slave;
include/stop_slave.inc
SET @old_dbug = @@global.debug_dbug;
-SET @@global.debug_dbug= "d,'debug_lock.before_get_UNIX_TIMESTAMP'";
-include/start_slave.inc
+SET @@global.debug_dbug= "d,dbug.before_get_UNIX_TIMESTAMP";
+# Ensure no initial DEBUG_SYNC state
slave is going to hang in get_master_version_and_clock
+START SLAVE;
+connection slave1;
+# Wait for slave to hit the WAIT_FOR sync point in get_master_version_and_clock
+connection master;
include/rpl_stop_server.inc [server_number=1]
+connection slave1;
slave is unblocked
SET DEBUG_SYNC='now SIGNAL signal.get_unix_timestamp';
+# Waiting for DEBUG_SYNC signal to be acknowledged..
connection slave;
Check network error happened here
include/wait_for_slave_io_error.inc [errno=1053, 2002, 2003, 2006, 2013]
@@ -25,12 +31,18 @@ connection slave;
connection slave;
include/stop_slave.inc
SET @old_dbug = @@global.debug_dbug;
-SET @@global.debug_dbug= "d,'debug_lock.before_get_SERVER_ID'";
-include/start_slave.inc
+SET @@global.debug_dbug= "d,dbug.before_get_SERVER_ID";
+# Ensure no initial DEBUG_SYNC state
slave is going to hang in get_master_version_and_clock
+START SLAVE;
+connection slave1;
+# Wait for slave to hit the WAIT_FOR sync point in get_master_version_and_clock
+connection master;
include/rpl_stop_server.inc [server_number=1]
+connection slave1;
slave is unblocked
SET DEBUG_SYNC='now SIGNAL signal.get_server_id';
+# Waiting for DEBUG_SYNC signal to be acknowledged..
connection slave;
Check network error happened here
include/wait_for_slave_io_error.inc [errno=1053, 2002, 2003, 2006, 2013]
diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result
index 657b3ba7448..af89b15dcff 100644
--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -1167,7 +1167,6 @@ a b
4 4
5 NULL
6 NULL
-SET debug_sync= 'now SIGNAL cont';
include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
SET GLOBAL binlog_format= @old_format;
@@ -1206,9 +1205,11 @@ a
connection server_2;
include/start_slave.inc
SET debug_sync= 'now WAIT_FOR scheduled_gtid_0_x_100';
+SET GLOBAL debug_dbug="-d,rpl_parallel_scheduled_gtid_0_x_100";
STOP SLAVE;
connection con_temp2;
SET debug_sync= 'now WAIT_FOR wait_for_done_waiting';
+SET GLOBAL debug_dbug="-d,rpl_parallel_wait_for_done_trigger";
ROLLBACK;
connection server_2;
include/wait_for_slave_sql_to_stop.inc
diff --git a/mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result b/mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result
index 3dd5a3ea83c..485e09d0ce6 100644
--- a/mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel_ignored_errors.result
@@ -36,6 +36,7 @@ connection con_temp2;
COMMIT;
connection server_2;
include/stop_slave.inc
+SET debug_sync='now WAIT_FOR continue_worker';
include/assert.inc [table t1 should have zero rows where a>32]
SELECT * FROM t1 WHERE a>32;
a
diff --git a/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test b/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test
index 0e37ac631ec..b0b7b23d955 100644
--- a/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test
+++ b/mysql-test/suite/rpl/t/rpl_get_master_version_and_clock.test
@@ -30,14 +30,14 @@ call mtr.add_suppression("Slave I/O thread .* register on master");
SET @saved_dbug = @@GLOBAL.debug_dbug;
# set up two parameters to pass into extra/rpl_tests/rpl_get_master_version_and_clock
-let $dbug_sync_point= 'debug_lock.before_get_UNIX_TIMESTAMP';
+let $dbug_sync_point= dbug.before_get_UNIX_TIMESTAMP;
let $debug_sync_action= 'now SIGNAL signal.get_unix_timestamp';
source extra/rpl_tests/rpl_get_master_version_and_clock.test;
#Test case 2: Try to get the value of the SERVER_ID from master under network disconnection
connection slave;
-let $dbug_sync_point= 'debug_lock.before_get_SERVER_ID';
+let $dbug_sync_point= dbug.before_get_SERVER_ID;
let $debug_sync_action= 'now SIGNAL signal.get_server_id';
source extra/rpl_tests/rpl_get_master_version_and_clock.test;
diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc
index cae39a9bf5c..3646e3ad2b3 100644
--- a/sql/debug_sync.cc
+++ b/sql/debug_sync.cc
@@ -71,11 +71,13 @@ struct st_debug_sync_signal_counter
String signal_name;
ulong suspended_threads;
ulong resumed_threads;
+ ulong killed_threads;
st_debug_sync_signal_counter(const String &sig)
{
suspended_threads= 0;
resumed_threads= 0;
+ killed_threads= 0;
signal_name.copy(sig);
}
@@ -618,8 +620,9 @@ static void debug_sync_reset(THD *thd)
mysql_mutex_lock(&debug_sync_global.ds_mutex);
/*
- Bad synchronization state, a SIGNAL has been sent but never
- acknowledged, the underlying test probably has a pattern similar to
+ This block validates that an existing signal which was not acknowledged
+ (i.e. by WAIT_FOR) is not being dropped. If you have encountered this
+ error, the underlying test may have a pattern similar to:
SET DEBUG_SYNC= 'now SIGNAL sig1';
SET DEBUG_SYNC= 'RESET';
@@ -1497,8 +1500,10 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action)
if (action->signal.length())
{
/*
- Bad synchronization state, a SIGNAL has been sent but never
- acknowledged, the underlying test probably has a pattern similar to
+ This block validates that the new signal is not overwriting an existing
+ signal which was not acknowledged (i.e. by WAIT_FOR). If you
+ have encountered this error, the underlying test may have a pattern
+ similar to:
SET DEBUG_SYNC= 'now SIGNAL sig1';
SET DEBUG_SYNC= 'now SIGNAL sig2';
@@ -1511,13 +1516,25 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action)
have this pattern and can be used as a starting place to fix this, e.g.
1) main.query_cache_debug
2) main.partition_debug_sync
+
+ Note that, if the new signal is the same as the current signal, this
+ check is skipped, as concurrent threads performing the same task can
+ send the same signal, e.g. in parallel replication.
*/
- if (debug_sync_global.cur_acks_needed)
+ if (debug_sync_global.ds_signal.length() &&
+ stringcmp(&debug_sync_global.ds_signal, &action->signal))
{
- sql_print_error("Lost DEBUG_SYNC signal '%s' due to SIGNAL '%s'",
- debug_sync_global.ds_signal.ptr(),
- action->signal.ptr());
- DBUG_ABORT();
+ st_debug_sync_signal_counter *cur_sig_el=
+ find_or_create_signal_counter(debug_sync_global.ds_signal);
+ if (debug_sync_global.cur_acks_needed && !cur_sig_el->killed_threads)
+ {
+ sql_print_error("Lost DEBUG_SYNC signal '%s' due to SIGNAL '%s' "
+ "(needed %lu acks and got %lu)",
+ debug_sync_global.ds_signal.ptr(),
+ action->signal.ptr(), cur_sig_el->suspended_threads,
+ cur_sig_el->resumed_threads);
+ DBUG_ABORT();
+ }
}
sig_el= find_or_create_signal_counter(action->signal);
@@ -1532,12 +1549,26 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action)
debug_sync_emergency_disable(); /* purecov: tested */
}
+
/*
The WAIT_FOR hasn't yet executed, but we want to save that this signal
was given
*/
debug_sync_global.cur_acks_needed=
- sig_el->suspended_threads ? sig_el->suspended_threads : 1;
+ (sig_el->suspended_threads ? sig_el->suspended_threads : 1) -
+ sig_el->killed_threads;
+
+ if (sig_el->killed_threads && !debug_sync_global.cur_acks_needed)
+ {
+ /*
+ No other threads need to be acknowledged by the signal, delete it
+ from the map here
+ */
+ fprintf(stderr, "Deleting signal %s during SIGNALLING because of killed thread\n", sig_el->signal_name.ptr());
+ my_hash_delete(&debug_sync_global.ds_signals_all, (uchar *) sig_el);
+ }
+
+ fprintf(stderr, "Set cur_acks_needed=%lu for signal %s\n", debug_sync_global.cur_acks_needed, action->signal.ptr());
/* Wake threads waiting in a sync point. */
mysql_cond_broadcast(&debug_sync_global.ds_cond);
@@ -1587,6 +1618,9 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action)
sig_el= find_or_create_signal_counter(action->wait_for);
sig_el->suspended_threads++;
+ fprintf(stderr, "suspending waiting for signal %s (total %lu)\n",
+ sig_el->signal_name.ptr(), sig_el->suspended_threads);
+
/*
Wait until global signal string matches the wait_for string.
Interrupt when thread or query is killed or facility disabled.
@@ -1627,12 +1661,40 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action)
debug_sync_global.cur_acks_needed=
sig_el->suspended_threads - sig_el->resumed_threads;
- /*
- If all suspended threads have acknowledged the signal, delete the
- counter
- */
- if (sig_el->resumed_threads == sig_el->suspended_threads)
+ fprintf(stderr,
+ "Signal %s finished waiting (%lu of %lu) cur acks is %lu killed "
+ "is %d\n",
+ sig_el->signal_name.ptr(), sig_el->resumed_threads,
+ sig_el->suspended_threads, debug_sync_global.cur_acks_needed,
+ thd->killed
+ );
+
+ if (thd->killed)
+ {
+ /*
+ A thread which was awaiting a signal was forcibly killed, we should
+ make note of this in case the corresponding signal is given later
+
+ We do not want to remove the signal from the map yet because the
+ signal can be given later.
+
+ ALSO maybe rewrite this?
+ */
+ sig_el->killed_threads++;
+ }
+ else if (sig_el->resumed_threads == sig_el->suspended_threads)
+ {
+ /*
+ If all suspended threads have acknowledged the signal, delete the
+ counter
+ */
+ fprintf(
+ stderr,
+ "Removing signal %s because it has been fully ACKd\n",
+ sig_el->signal_name.ptr());
my_hash_delete(&debug_sync_global.ds_signals_all, (uchar*) sig_el);
+ }
+
DBUG_EXECUTE("debug_sync_exec",
if (thd->killed)