summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKristian Nielsen <knielsen@knielsen-hq.org>2015-04-08 11:01:18 +0200
committerKristian Nielsen <knielsen@knielsen-hq.org>2015-04-08 11:01:18 +0200
commit3b961347db2b2ad1d31cf64829a6d0e31795e158 (patch)
tree8790bb8d47eec70ce414064db808ee64656320d1
parent880f2273fdc39cd1a2ab28f448cdfbf3d6581af2 (diff)
downloadmariadb-git-3b961347db2b2ad1d31cf64829a6d0e31795e158.tar.gz
MDEV-7888, MDEV-7929: Parallel replication hangs sometimes on ANALYZE TABLE or DDL
The hangs occur when the group_commit_orderer object is freed before the last mark_start_commit() call on it - this loses the wakeup to other waiting worker threads, causing them to hang until killed manually. The object was freed because wakeup_subsequent_commits() was called two early in two places. For MDEV-7888, during ANALYZE TABLE, and for MDEV-7929 during record_gtid() after processing a DDL event. The group_commit_orderer object can be freed when its last transaction has called wait_for_prior_commit(). Fix by implementing a suspend/resume mechanism for wakeup_subsequent_commits() that can be used in places where a transaction is committed without this being the commit of the actual replication event group. Also add a protection mechanism (that asserts in debug builds) which can prevent the too-early free and hang if other similar bugs should remain in other parts of the code.
-rw-r--r--mysql-test/suite/rpl/r/rpl_parallel.result58
-rw-r--r--mysql-test/suite/rpl/t/rpl_parallel.test94
-rw-r--r--sql/log.cc12
-rw-r--r--sql/rpl_gtid.cc29
-rw-r--r--sql/rpl_parallel.cc18
-rw-r--r--sql/sql_admin.cc14
-rw-r--r--sql/sql_class.h9
7 files changed, 231 insertions, 3 deletions
diff --git a/mysql-test/suite/rpl/r/rpl_parallel.result b/mysql-test/suite/rpl/r/rpl_parallel.result
index 666978afc98..d3e54c126a9 100644
--- a/mysql-test/suite/rpl/r/rpl_parallel.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel.result
@@ -1421,6 +1421,64 @@ a b
99 99
include/stop_slave.inc
SET GLOBAL slave_transaction_retries= @old_retries;
+SET GLOBAL slave_parallel_threads=10;
+include/start_slave.inc
+*** MDEV-7888: ANALYZE TABLE does wakeup_subsequent_commits(), causing wrong binlog order and parallel replication hang ***
+include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_analyze_table_sleep';
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+SET @commit_id= 10000;
+ANALYZE TABLE t2;
+Table Op Msg_type Msg_text
+test.t2 analyze status OK
+INSERT INTO t3 VALUES (120, 0);
+SET @commit_id= 10001;
+INSERT INTO t3 VALUES (121, 0);
+SET SESSION debug_dbug=@old_dbug;
+SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
+a b
+120 0
+121 0
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
+a b
+120 0
+121 0
+include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debug;
+include/start_slave.inc
+*** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
+include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_record_gtid_serverid_100_sleep';
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+SET @old_server_id= @@SESSION.server_id;
+SET SESSION server_id= 100;
+SET @commit_id= 10010;
+ALTER TABLE t1 COMMENT "Hulubulu!";
+SET SESSION server_id= @old_server_id;
+INSERT INTO t3 VALUES (130, 0);
+SET @commit_id= 10011;
+INSERT INTO t3 VALUES (131, 0);
+SET SESSION debug_dbug=@old_dbug;
+SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
+a b
+130 0
+131 0
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
+a b
+130 0
+131 0
+include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debug;
include/start_slave.inc
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
diff --git a/mysql-test/suite/rpl/t/rpl_parallel.test b/mysql-test/suite/rpl/t/rpl_parallel.test
index d6d39690f09..640c557c8e5 100644
--- a/mysql-test/suite/rpl/t/rpl_parallel.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel.test
@@ -2042,6 +2042,100 @@ SELECT * FROM t8 ORDER BY a;
--source include/stop_slave.inc
SET GLOBAL slave_transaction_retries= @old_retries;
+SET GLOBAL slave_parallel_threads=10;
+--source include/start_slave.inc
+
+
+--echo *** MDEV-7888: ANALYZE TABLE does wakeup_subsequent_commits(), causing wrong binlog order and parallel replication hang ***
+
+--connection server_2
+--source include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_analyze_table_sleep';
+
+--connection server_1
+# Inject two group commits. The bug was that ANALYZE TABLE would call
+# wakeup_subsequent_commits() too early, allowing the following transaction
+# in the same group to run ahead and binlog and free the GCO. Then we get
+# wrong binlog order and later access freed GCO, which causes lost wakeup
+# of following GCO and thus replication hang.
+# We injected a small sleep in ANALYZE to make the race easier to hit (this
+# can only cause false negatives in versions with the bug, not false positives,
+# so sleep is ok here. And it's in general not possible to trigger reliably
+# the race with debug_sync, since the bugfix makes the race impossible).
+
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+# Group commit with cid=10000, two event groups.
+SET @commit_id= 10000;
+ANALYZE TABLE t2;
+INSERT INTO t3 VALUES (120, 0);
+
+# Group commit with cid=10001, one event group.
+SET @commit_id= 10001;
+INSERT INTO t3 VALUES (121, 0);
+
+SET SESSION debug_dbug=@old_dbug;
+
+SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debug;
+--source include/start_slave.inc
+
+
+--echo *** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
+
+--connection server_2
+--source include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_record_gtid_serverid_100_sleep';
+
+--connection server_1
+# Inject two group commits. The bug was that record_gtid for a
+# non-transactional event group would commit its own transaction, which would
+# cause ha_commit_trans() to call wakeup_subsequent_commits() too early. This
+# in turn lead to access to freed group_commit_orderer object, losing a wakeup
+# and causing slave threads to hang.
+# We inject a small sleep in the corresponding record_gtid() to make the race
+# easier to hit.
+
+SET @old_dbug= @@SESSION.debug_dbug;
+SET SESSION debug_dbug="+d,binlog_force_commit_id";
+
+# Group commit with cid=10010, two event groups.
+SET @old_server_id= @@SESSION.server_id;
+SET SESSION server_id= 100;
+SET @commit_id= 10010;
+ALTER TABLE t1 COMMENT "Hulubulu!";
+SET SESSION server_id= @old_server_id;
+INSERT INTO t3 VALUES (130, 0);
+
+# Group commit with cid=10011, one event group.
+SET @commit_id= 10011;
+INSERT INTO t3 VALUES (131, 0);
+
+SET SESSION debug_dbug=@old_dbug;
+
+SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debug;
--source include/start_slave.inc
diff --git a/sql/log.cc b/sql/log.cc
index 5801b3a1141..0139967ffe9 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -5850,6 +5850,7 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
if (direct)
{
int res;
+ uint64 commit_id= 0;
DBUG_PRINT("info", ("direct is set"));
if ((res= thd->wait_for_prior_commit()))
DBUG_RETURN(res);
@@ -5857,7 +5858,16 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
my_org_b_tell= my_b_tell(file);
mysql_mutex_lock(&LOCK_log);
prev_binlog_id= current_binlog_id;
- if (write_gtid_event(thd, true, using_trans, 0))
+ DBUG_EXECUTE_IF("binlog_force_commit_id",
+ {
+ const LEX_STRING name= { C_STRING_WITH_LEN("commit_id") };
+ bool null_value;
+ user_var_entry *entry=
+ (user_var_entry*) my_hash_search(&thd->user_vars,
+ (uchar*) name.str, name.length);
+ commit_id= entry->val_int(&null_value);
+ });
+ if (write_gtid_event(thd, true, using_trans, commit_id))
goto err;
}
else
diff --git a/sql/rpl_gtid.cc b/sql/rpl_gtid.cc
index e5620ec41a2..08df804ac4c 100644
--- a/sql/rpl_gtid.cc
+++ b/sql/rpl_gtid.cc
@@ -515,6 +515,7 @@ rpl_slave_state::record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
element *elem;
ulonglong thd_saved_option= thd->variables.option_bits;
Query_tables_list lex_backup;
+ wait_for_commit* suspended_wfc;
DBUG_ENTER("record_gtid");
if (unlikely(!loaded))
@@ -538,6 +539,28 @@ rpl_slave_state::record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
DBUG_RETURN(1);
} );
+ /*
+ If we are applying a non-transactional event group, we will be committing
+ here a transaction, but that does not imply that the event group has
+ completed or has been binlogged. So we should not trigger
+ wakeup_subsequent_commits() here.
+
+ Note: An alternative here could be to put a call to mark_start_commit() in
+ stmt_done() before the call to record_and_update_gtid(). This would
+ prevent later calling mark_start_commit() after we have run
+ wakeup_subsequent_commits() from committing the GTID update transaction
+ (which must be avoided to avoid accessing freed group_commit_orderer
+ object). It would also allow following event groups to start slightly
+ earlier. And in the cases where record_gtid() is called without an active
+ transaction, the current statement should have been binlogged already, so
+ binlog order is preserved.
+
+ But this is rather subtle, and potentially fragile. And it does not really
+ seem worth it; non-transactional loads are unlikely to benefit much from
+ parallel replication in any case. So for now, we go with the simple
+ suspend/resume of wakeup_subsequent_commits() here in record_gtid().
+ */
+ suspended_wfc= thd->suspend_subsequent_commits();
thd->lex->reset_n_backup_query_tables_list(&lex_backup);
tlist.init_one_table(STRING_WITH_LEN("mysql"),
rpl_gtid_slave_state_table_name.str,
@@ -689,6 +712,12 @@ end:
}
thd->lex->restore_backup_query_tables_list(&lex_backup);
thd->variables.option_bits= thd_saved_option;
+ thd->resume_subsequent_commits(suspended_wfc);
+ DBUG_EXECUTE_IF("inject_record_gtid_serverid_100_sleep",
+ {
+ if (gtid->server_id == 100)
+ my_sleep(500000);
+ });
DBUG_RETURN(err);
}
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 7df1ea3bd4b..48b6ad0ef89 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -171,8 +171,24 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
/* Now free any GCOs in which all transactions have committed. */
group_commit_orderer *tmp_gco= rgi->gco;
while (tmp_gco &&
- (!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id))
+ (!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id ||
+ tmp_gco->next_gco->wait_count > entry->count_committing_event_groups))
+ {
+ /*
+ We must not free a GCO before the wait_count of the following GCO has
+ been reached and wakeup has been sent. Otherwise we will lose the
+ wakeup and hang (there were several such bugs in the past).
+
+ The intention is that this is ensured already since we only free when
+ the last event group in the GCO has committed
+ (tmp_gco->last_sub_id <= sub_id). However, if we have a bug, we have
+ extra check on next_gco->wait_count to hopefully avoid hanging; we
+ have here an assertion in debug builds that this check does not in
+ fact trigger.
+ */
+ DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id);
tmp_gco= tmp_gco->prev_gco;
+ }
while (tmp_gco)
{
group_commit_orderer *prev_gco= tmp_gco->prev_gco;
diff --git a/sql/sql_admin.cc b/sql/sql_admin.cc
index 1d917de0c1a..ce8302af0b3 100644
--- a/sql/sql_admin.cc
+++ b/sql/sql_admin.cc
@@ -320,6 +320,7 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
int result_code;
int compl_result_code;
bool need_repair_or_alter= 0;
+ wait_for_commit* suspended_wfc;
DBUG_ENTER("mysql_admin_table");
DBUG_PRINT("enter", ("extra_open_options: %u", extra_open_options));
@@ -337,6 +338,13 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
DBUG_RETURN(TRUE);
+ /*
+ This function calls trans_commit() during its operation, but that does not
+ imply that the operation is complete or binlogged. So we have to suspend
+ temporarily the wakeup_subsequent_commits() calls (if used).
+ */
+ suspended_wfc= thd->suspend_subsequent_commits();
+
mysql_ha_rm_tables(thd, tables);
/*
@@ -464,7 +472,7 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
if (!table->table->part_info)
{
my_error(ER_PARTITION_MGMT_ON_NONPARTITIONED, MYF(0));
- DBUG_RETURN(TRUE);
+ goto err2;
}
if (set_part_state(alter_info, table->table->part_info, PART_ADMIN))
{
@@ -1045,6 +1053,8 @@ send_result_message:
}
my_eof(thd);
+ thd->resume_subsequent_commits(suspended_wfc);
+ DBUG_EXECUTE_IF("inject_analyze_table_sleep", my_sleep(500000););
DBUG_RETURN(FALSE);
err:
@@ -1058,6 +1068,8 @@ err:
}
close_thread_tables(thd); // Shouldn't be needed
thd->mdl_context.release_transactional_locks();
+err2:
+ thd->resume_subsequent_commits(suspended_wfc);
DBUG_RETURN(TRUE);
}
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 5d64837a2c1..e47d335d926 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -3682,6 +3682,15 @@ public:
if (wait_for_commit_ptr)
wait_for_commit_ptr->wakeup_subsequent_commits(wakeup_error);
}
+ wait_for_commit *suspend_subsequent_commits() {
+ wait_for_commit *suspended= wait_for_commit_ptr;
+ wait_for_commit_ptr= NULL;
+ return suspended;
+ }
+ void resume_subsequent_commits(wait_for_commit *suspended) {
+ DBUG_ASSERT(!wait_for_commit_ptr);
+ wait_for_commit_ptr= suspended;
+ }
private: