diff options
author | Kristian Nielsen <knielsen@knielsen-hq.org> | 2015-08-04 11:20:03 +0200 |
---|---|---|
committer | Kristian Nielsen <knielsen@knielsen-hq.org> | 2015-08-04 11:40:19 +0200 |
commit | 9b9c5e890c16176eaf6b54d466708c54e2df7c9d (patch) | |
tree | 370ca46a9c0471e416b29bee0669b9ed61ee7714 | |
parent | a6087e7dc1ef3561d8189c8db15e9591d0f9b520 (diff) | |
download | mariadb-git-9b9c5e890c16176eaf6b54d466708c54e2df7c9d.tar.gz |
MDEV-8302: Duplicate key with parallel replication
This bug is essentially another variant of MDEV-7458.
If a transaction conflict caused a deadlock kill of T2 in record_gtid()
during commit, the code would do a rollback _before_ running
rgi->unmark_start_commit(). This creates a race where following transactions
could start too early (before T2 has completed its transaction retry). This
in turn could lead to replication failure, if there was a conflict that
caused eg. duplicate key error or similar.
The fix is to remove these rollbacks (in Query_log_event::do_apply_event()
and Xid_log_event::do_apply_event(). They seem out-of-place; code in
log_event.cc generally does not roll back on error, this is handled higher
up.
In addition, because of the extreme difficulty of reproducing bugs like
MDEV-7458 and MDEV-8302, this patch adds some extra precations to try to
detect (in debug builds) or prevent (in release builds) similar bugs.
ha_rollback_trans() will now call unmark_start_commit() if needed (and
assert in debug build when a caller does rollback without unmark first).
We also add an extra check for thd->killed() so that we avoid doing
mark_start_commit() if we already have a pending deadlock kill.
And we add a missing unmark_start_commit() call in the error case, found by
the above assertion.
-rw-r--r-- | mysql-test/suite/rpl/r/rpl_parallel.result | 40 | ||||
-rw-r--r-- | mysql-test/suite/rpl/t/rpl_parallel.test | 54 | ||||
-rw-r--r-- | sql/handler.cc | 18 | ||||
-rw-r--r-- | sql/log_event.cc | 2 | ||||
-rw-r--r-- | sql/rpl_parallel.cc | 39 |
5 files changed, 147 insertions, 6 deletions
diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result index 124b7c9a029..de67f5f0610 100644 --- a/mysql-test/suite/rpl/r/rpl_parallel.result +++ b/mysql-test/suite/rpl/r/rpl_parallel.result @@ -1649,6 +1649,46 @@ include/stop_slave.inc SET GLOBAL debug_dbug= @old_debg; SET GLOBAL max_relay_log_size= @old_max; include/start_slave.inc +*** MDEV-8302: Duplicate key with parallel replication *** +include/stop_slave.inc +/* Inject a small sleep which makes the race easier to hit. */ +SET @old_dbug=@@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,inject_mdev8302"; +INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5); +SET @old_dbug= @@SESSION.debug_dbug; +SET @commit_id= 20000; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; +SET SESSION debug_dbug=@old_dbug; +SELECT * FROM t7 ORDER BY a; +a b +1 1 +2 2 +3 86 +4 4 +5 5 +100 5 +101 1 +102 2 +103 3 +104 4 +include/save_master_gtid.inc +include/start_slave.inc +include/sync_with_master_gtid.inc +SELECT * FROM t7 ORDER BY a; +a b +1 1 +2 2 +3 86 +4 4 +5 5 +100 5 +101 1 +102 2 +103 3 +104 4 +include/stop_slave.inc +SET GLOBAL debug_dbug=@old_dbug; +include/start_slave.inc include/stop_slave.inc SET GLOBAL slave_parallel_threads=@old_parallel_threads; include/start_slave.inc diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test index feac32b1454..e70dcfa5bb0 100644 --- a/mysql-test/suite/rpl/t/rpl_parallel.test +++ b/mysql-test/suite/rpl/t/rpl_parallel.test @@ -2314,6 +2314,60 @@ SET GLOBAL max_relay_log_size= @old_max; --source include/start_slave.inc +--echo *** MDEV-8302: Duplicate key with parallel replication *** + +--connection server_2 +--source include/stop_slave.inc +/* Inject a small sleep which makes the race easier to hit. */ +SET @old_dbug=@@GLOBAL.debug_dbug; +SET GLOBAL debug_dbug="+d,inject_mdev8302"; + + +--connection server_1 +INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5); + +# Artificially create a bunch of group commits with conflicting transactions. +# The bug happened when T1 and T2 was in one group commit, and T3 was in the +# following group commit. T2 is a DELETE of a row with same primary key as a +# row that T3 inserts. T1 and T2 can conflict, causing T2 to be deadlock +# killed after starting to commit. The bug was that T2 could roll back before +# doing unmark_start_commit(); this could allow T3 to run before the retry +# of T2, causing duplicate key violation. + +SET @old_dbug= @@SESSION.debug_dbug; +SET @commit_id= 20000; +SET SESSION debug_dbug="+d,binlog_force_commit_id"; + +--let $n = 100 +--disable_query_log +while ($n) +{ + eval UPDATE t7 SET b=b+1 WHERE a=100+($n MOD 5); + eval DELETE FROM t7 WHERE a=100+($n MOD 5); + + SET @commit_id = @commit_id + 1; + eval INSERT INTO t7 VALUES (100+($n MOD 5), $n); + SET @commit_id = @commit_id + 1; + dec $n; +} +--enable_query_log +SET SESSION debug_dbug=@old_dbug; + + +SELECT * FROM t7 ORDER BY a; +--source include/save_master_gtid.inc + + +--connection server_2 +--source include/start_slave.inc +--source include/sync_with_master_gtid.inc +SELECT * FROM t7 ORDER BY a; + +--source include/stop_slave.inc +SET GLOBAL debug_dbug=@old_dbug; +--source include/start_slave.inc + + # Clean up. --connection server_2 diff --git a/sql/handler.cc b/sql/handler.cc index 1f8daf3927b..6d0b20e29ee 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -1582,6 +1582,24 @@ int ha_rollback_trans(THD *thd, bool all) DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL || trans == &thd->transaction.stmt); + if (is_real_trans) + { + /* + In parallel replication, if we need to rollback during commit, we must + first inform following transactions that we are going to abort our commit + attempt. Otherwise those following transactions can run too early, and + possibly cause replication to fail. See comments in retry_event_group(). + + There were several bugs with this in the past that were very hard to + track down (MDEV-7458, MDEV-8302). So we add here an assertion for + rollback without signalling following transactions. And in release + builds, we explicitly do the signalling before rolling back. + */ + DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)); + if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit) + thd->rgi_slave->unmark_start_commit(); + } + if (thd->in_sub_stmt) { DBUG_ASSERT(0); diff --git a/sql/log_event.cc b/sql/log_event.cc index 69b981f3f47..c4b3d3da365 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -4261,7 +4261,6 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi, "mysql", rpl_gtid_slave_state_table_name.str, errcode, thd->get_stmt_da()->message()); - trans_rollback(thd); sub_id= 0; thd->is_slave_error= 1; goto end; @@ -7351,7 +7350,6 @@ int Xid_log_event::do_apply_event(rpl_group_info *rgi) "%s.%s: %d: %s", "mysql", rpl_gtid_slave_state_table_name.str, ec, thd->get_stmt_da()->message()); - trans_rollback(thd); thd->is_slave_error= 1; return err; } diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc index 305e8053032..600d2ab41aa 100644 --- a/sql/rpl_parallel.cc +++ b/sql/rpl_parallel.cc @@ -226,6 +226,11 @@ static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err) { rgi->worker_error= err; + /* + In case we get an error during commit, inform following transactions that + we aborted our commit. + */ + rgi->unmark_start_commit(); rgi->cleanup_context(thd, true); rgi->rli->abort_slave= true; rgi->rli->stop_for_until= false; @@ -370,6 +375,7 @@ do_retry: transaction we deadlocked with will not signal that it started to commit until after the unmark. */ + DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);}); rgi->unmark_start_commit(); DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark"); @@ -884,9 +890,24 @@ handle_rpl_parallel_thread(void *arg) group_ending= is_group_ending(qev->ev, event_type); if (group_ending && likely(!rgi->worker_error)) { - DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); - rgi->mark_start_commit(); - DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + /* + Do an extra check for (deadlock) kill here. This helps prevent a + lingering deadlock kill that occured during normal DML processing to + propagate past the mark_start_commit(). If we detect a deadlock only + after mark_start_commit(), we have to unmark, which has at least a + theoretical possibility of leaving a window where it looks like all + transactions in a GCO have started committing, while in fact one + will need to rollback and retry. This is not supposed to be possible + (since there is a deadlock, at least one transaction should be + blocked from reaching commit), but this seems a fragile ensurance, + and there were historically a number of subtle bugs in this area. + */ + if (!thd->killed) + { + DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); + rgi->mark_start_commit(); + DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + } } /* @@ -911,7 +932,17 @@ handle_rpl_parallel_thread(void *arg) }); if (!err) #endif - err= rpt_handle_event(qev, rpt); + { + if (thd->check_killed()) + { + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + thd->send_kill_message(); + err= 1; + } + else + err= rpt_handle_event(qev, rpt); + } delete_or_keep_event_post_apply(rgi, event_type, qev->ev); DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100", err= dbug_simulate_tmp_error(rgi, thd);); |