summaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorAndrei Elkin <aelkin@mysql.com>2009-10-09 16:26:37 +0300
committerAndrei Elkin <aelkin@mysql.com>2009-10-09 16:26:37 +0300
commit6a541923c390a81a64a373fc91aafaea6f748b85 (patch)
tree754058f2682c151caea880f5435fd4fd1c0ce3ac /sql
parent66b869d63740a146277346ba7f7ab06264cd6527 (diff)
downloadmariadb-git-6a541923c390a81a64a373fc91aafaea6f748b85.tar.gz
backporting fixes of bug@45940 to 5.1-rpl+2 to cover failures rpl_heartbeat_* as well.
mysql-test/suite/rpl/r/rpl_stop_middle_group.result: the new result file mysql-test/suite/rpl/t/rpl_stop_middle_group.test: renamed from rpl_row_stop_middle_update and added a regression test for bug#45940.
Diffstat (limited to 'sql')
-rw-r--r--sql/log_event.cc53
-rw-r--r--sql/log_event_old.cc59
-rw-r--r--sql/rpl_rli.cc1
-rw-r--r--sql/rpl_rli.h10
-rw-r--r--sql/slave.cc128
5 files changed, 132 insertions, 119 deletions
diff --git a/sql/log_event.cc b/sql/log_event.cc
index d8dd40c91de..e2f0029eb6e 100644
--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@@ -3272,6 +3272,21 @@ Default database: '%s'. Query: '%s'",
*/
} /* End of if (db_ok(... */
+ {/**
+ The following failure injecion works in cooperation with tests
+ setting @@global.debug= 'd,stop_slave_middle_group'.
+ The sql thread receives the killed status and will proceed
+ to shutdown trying to finish incomplete events group.
+ */
+ DBUG_EXECUTE_IF("stop_slave_middle_group",
+ if (strcmp("COMMIT", query) != 0 &&
+ strcmp("BEGIN", query) != 0)
+ {
+ if (thd->transaction.all.modified_non_trans_table)
+ const_cast<Relay_log_info*>(rli)->abort_slave= 1;
+ };);
+ }
+
end:
/*
Probably we have set thd->query, thd->db, thd->catalog to point to places
@@ -7475,8 +7490,16 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
thd->transaction.stmt.modified_non_trans_table= TRUE;
} // row processing loop
- DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
- const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
+ {/**
+ The following failure injecion works in cooperation with tests
+ setting @@global.debug= 'd,stop_slave_middle_group'.
+ The sql thread receives the killed status and will proceed
+ to shutdown trying to finish incomplete events group.
+ */
+ DBUG_EXECUTE_IF("stop_slave_middle_group",
+ if (thd->transaction.all.modified_non_trans_table)
+ const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
+ }
if ((error= do_after_row_operations(rli, error)) &&
ignored_error_code(convert_handler_error(error, thd, table)))
@@ -7513,32 +7536,6 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
const_cast<Relay_log_info*>(rli)->cleanup_context(thd, error);
thd->is_slave_error= 1;
}
- /*
- This code would ideally be placed in do_update_pos() instead, but
- since we have no access to table there, we do the setting of
- last_event_start_time here instead.
- */
- else if (table && (table->s->primary_key == MAX_KEY) &&
- !cache_stmt && get_flags(STMT_END_F) == RLE_NO_FLAGS)
- {
- /*
- ------------ Temporary fix until WL#2975 is implemented ---------
-
- This event is not the last one (no STMT_END_F). If we stop now
- (in case of terminate_slave_thread()), how will we restart? We
- have to restart from Table_map_log_event, but as this table is
- not transactional, the rows already inserted will still be
- present, and idempotency is not guaranteed (no PK) so we risk
- that repeating leads to double insert. So we desperately try to
- continue, hope we'll eventually leave this buggy situation (by
- executing the final Rows_log_event). If we are in a hopeless
- wait (reached end of last relay log and nothing gets appended
- there), we timeout after one minute, and notify DBA about the
- problem. When WL#2975 is implemented, just remove the member
- Relay_log_info::last_event_start_time and all its occurrences.
- */
- const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
- }
DBUG_RETURN(error);
}
diff --git a/sql/log_event_old.cc b/sql/log_event_old.cc
index 68cd2bf4673..a335210ce53 100644
--- a/sql/log_event_old.cc
+++ b/sql/log_event_old.cc
@@ -226,7 +226,7 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info
row_start= row_end;
}
- DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
+ DBUG_EXECUTE_IF("stop_slave_middle_group",
const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
error= do_after_row_operations(table, error);
if (!ev->cache_stmt)
@@ -269,34 +269,6 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info
DBUG_RETURN(error);
}
- /*
- This code would ideally be placed in do_update_pos() instead, but
- since we have no access to table there, we do the setting of
- last_event_start_time here instead.
- */
- if (table && (table->s->primary_key == MAX_KEY) &&
- !ev->cache_stmt &&
- ev->get_flags(Old_rows_log_event::STMT_END_F) == Old_rows_log_event::RLE_NO_FLAGS)
- {
- /*
- ------------ Temporary fix until WL#2975 is implemented ---------
-
- This event is not the last one (no STMT_END_F). If we stop now
- (in case of terminate_slave_thread()), how will we restart? We
- have to restart from Table_map_log_event, but as this table is
- not transactional, the rows already inserted will still be
- present, and idempotency is not guaranteed (no PK) so we risk
- that repeating leads to double insert. So we desperately try to
- continue, hope we'll eventually leave this buggy situation (by
- executing the final Old_rows_log_event). If we are in a hopeless
- wait (reached end of last relay log and nothing gets appended
- there), we timeout after one minute, and notify DBA about the
- problem. When WL#2975 is implemented, just remove the member
- st_relay_log_info::last_event_start_time and all its occurences.
- */
- const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
- }
-
DBUG_RETURN(0);
}
#endif
@@ -1744,7 +1716,7 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli)
} // row processing loop
- DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
+ DBUG_EXECUTE_IF("stop_slave_middle_group",
const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
error= do_after_row_operations(rli, error);
if (!cache_stmt)
@@ -1787,33 +1759,6 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli)
DBUG_RETURN(error);
}
- /*
- This code would ideally be placed in do_update_pos() instead, but
- since we have no access to table there, we do the setting of
- last_event_start_time here instead.
- */
- if (table && (table->s->primary_key == MAX_KEY) &&
- !cache_stmt && get_flags(STMT_END_F) == RLE_NO_FLAGS)
- {
- /*
- ------------ Temporary fix until WL#2975 is implemented ---------
-
- This event is not the last one (no STMT_END_F). If we stop now
- (in case of terminate_slave_thread()), how will we restart? We
- have to restart from Table_map_log_event, but as this table is
- not transactional, the rows already inserted will still be
- present, and idempotency is not guaranteed (no PK) so we risk
- that repeating leads to double insert. So we desperately try to
- continue, hope we'll eventually leave this buggy situation (by
- executing the final Old_rows_log_event). If we are in a hopeless
- wait (reached end of last relay log and nothing gets appended
- there), we timeout after one minute, and notify DBA about the
- problem. When WL#2975 is implemented, just remove the member
- Relay_log_info::last_event_start_time and all its occurrences.
- */
- const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
- }
-
DBUG_RETURN(0);
}
diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc
index b3a1bbc31d2..ec3adc79ca2 100644
--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
@@ -1195,7 +1195,6 @@ void Relay_log_info::cleanup_context(THD *thd, bool error)
*/
thd->options&= ~OPTION_NO_FOREIGN_KEY_CHECKS;
thd->options&= ~OPTION_RELAXED_UNIQUE_CHECKS;
- last_event_start_time= 0;
DBUG_VOID_RETURN;
}
diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h
index a5410dd0c79..fd36d18adae 100644
--- a/sql/rpl_rli.h
+++ b/sql/rpl_rli.h
@@ -349,12 +349,10 @@ public:
void clear_tables_to_lock();
/*
- Used by row-based replication to detect that it should not stop at
- this event, but give it a chance to send more events. The time
- where the last event inside a group started is stored here. If the
- variable is zero, we are not in a group (but may be in a
- transaction).
- */
+ Used to defer stopping the SQL thread to give it a chance
+ to finish up the current group of events.
+ The timestamp is set and reset in @c sql_slave_killed().
+ */
time_t last_event_start_time;
/**
diff --git a/sql/slave.cc b/sql/slave.cc
index 44993601e78..834ed8ed933 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -49,6 +49,10 @@
#define FLAGSTR(V,F) ((V)&(F)?#F" ":"")
#define MAX_SLAVE_RETRY_PAUSE 5
+/*
+ a parameter of sql_slave_killed() to defer the killed status
+*/
+#define SLAVE_WAIT_GROUP_DONE 60
bool use_slave_mask = 0;
MY_BITMAP slave_error_mask;
char slave_skip_error_names[SHOW_VAR_FUNC_BUFF_SIZE];
@@ -790,46 +794,95 @@ static bool io_slave_killed(THD* thd, Master_info* mi)
DBUG_RETURN(mi->abort_slave || abort_loop || thd->killed);
}
+/**
+ The function analyzes a possible killed status and makes
+ a decision whether to accept it or not.
+ Normally upon accepting the sql thread goes to shutdown.
+ In the event of deffering decision @rli->last_event_start_time waiting
+ timer is set to force the killed status be accepted upon its expiration.
+ @param thd pointer to a THD instance
+ @param rli pointer to Relay_log_info instance
+
+ @return TRUE the killed status is recognized, FALSE a possible killed
+ status is deferred.
+*/
static bool sql_slave_killed(THD* thd, Relay_log_info* rli)
{
+ bool ret= FALSE;
DBUG_ENTER("sql_slave_killed");
DBUG_ASSERT(rli->sql_thd == thd);
DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun
if (abort_loop || thd->killed || rli->abort_slave)
{
- if (rli->abort_slave && rli->is_in_group() &&
- thd->transaction.all.modified_non_trans_table)
- DBUG_RETURN(0);
- /*
- If we are in an unsafe situation (stopping could corrupt replication),
- we give one minute to the slave SQL thread of grace before really
- terminating, in the hope that it will be able to read more events and
- the unsafe situation will soon be left. Note that this one minute starts
- from the last time anything happened in the slave SQL thread. So it's
- really one minute of idleness, we don't timeout if the slave SQL thread
- is actively working.
- */
- if (rli->last_event_start_time == 0)
- DBUG_RETURN(1);
- DBUG_PRINT("info", ("Slave SQL thread is in an unsafe situation, giving "
- "it some grace period"));
- if (difftime(time(0), rli->last_event_start_time) > 60)
+ if (thd->transaction.all.modified_non_trans_table && rli->is_in_group())
{
- rli->report(ERROR_LEVEL, 0,
- "SQL thread had to stop in an unsafe situation, in "
- "the middle of applying updates to a "
- "non-transactional table without any primary key. "
- "There is a risk of duplicate updates when the slave "
- "SQL thread is restarted. Please check your tables' "
- "contents after restart.");
- DBUG_RETURN(1);
+ char msg_stopped[]=
+ "... The slave SQL is stopped, leaving the current group "
+ "of events unfinished with a non-transaction table changed. "
+ "If the group consists solely of Row-based events, you can try "
+ "restarting the slave with --slave-exec-mode=IDEMPOTENT, which "
+ "ignores duplicate key, key not found, and similar errors (see "
+ "documentation for details).";
+
+ if (rli->abort_slave)
+ {
+ DBUG_PRINT("info", ("Slave SQL thread is being stopped in the middle of"
+ " a group having updated a non-trans table, giving"
+ " it some grace period"));
+
+ /*
+ Slave sql thread shutdown in face of unfinished group modified
+ Non-trans table is handled via a timer. The slave may eventually
+ give out to complete the current group and in that case there
+ might be issues at consequent slave restart, see the error message.
+ WL#2975 offers a robust solution requiring to store the last exectuted
+ event's coordinates along with the group's coordianates
+ instead of waiting with @c last_event_start_time the timer.
+ */
+
+ if (rli->last_event_start_time == 0)
+ rli->last_event_start_time= my_time(0);
+ ret= difftime(my_time(0), rli->last_event_start_time) <=
+ SLAVE_WAIT_GROUP_DONE ? FALSE : TRUE;
+
+ DBUG_EXECUTE_IF("stop_slave_middle_group",
+ DBUG_EXECUTE_IF("incomplete_group_in_relay_log",
+ ret= TRUE;);); // time is over
+
+ if (ret == 0)
+ {
+ rli->report(WARNING_LEVEL, 0,
+ "slave SQL thread is being stopped in the middle "
+ "of applying of a group having updated a non-transaction "
+ "table; waiting for the group completion ... ");
+ }
+ else
+ {
+ rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR,
+ ER(ER_SLAVE_FATAL_ERROR), msg_stopped);
+ }
+ }
+ else
+ {
+ ret= TRUE;
+ rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR, ER(ER_SLAVE_FATAL_ERROR),
+ msg_stopped);
+ }
+ }
+ else
+ {
+ ret= TRUE;
}
}
- DBUG_RETURN(0);
+ if (ret)
+ rli->last_event_start_time= 0;
+
+ DBUG_RETURN(ret);
}
+
/*
skip_load_data_infile()
@@ -2525,6 +2578,27 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli)
delete ev;
DBUG_RETURN(1);
}
+
+ { /**
+ The following failure injecion works in cooperation with tests
+ setting @@global.debug= 'd,incomplete_group_in_relay_log'.
+ Xid or Commit events are not executed to force the slave sql
+ read hanging if the realy log does not have any more events.
+ */
+ DBUG_EXECUTE_IF("incomplete_group_in_relay_log",
+ if ((ev->get_type_code() == XID_EVENT) ||
+ ((ev->get_type_code() == QUERY_EVENT) &&
+ strcmp("COMMIT", ((Query_log_event *) ev)->query) == 0))
+ {
+ DBUG_ASSERT(thd->transaction.all.modified_non_trans_table);
+ rli->abort_slave= 1;
+ pthread_mutex_unlock(&rli->data_lock);
+ delete ev;
+ rli->inc_event_relay_log_pos();
+ DBUG_RETURN(0);
+ };);
+ }
+
exec_res= apply_event_and_update_pos(ev, thd, rli, TRUE);
/*
@@ -3953,7 +4027,7 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
goto skip_relay_logging;
}
break;
-
+
default:
inc_pos= event_len;
break;