diff options
author | Andrei Elkin <aelkin@mysql.com> | 2009-10-09 16:26:37 +0300 |
---|---|---|
committer | Andrei Elkin <aelkin@mysql.com> | 2009-10-09 16:26:37 +0300 |
commit | 6a541923c390a81a64a373fc91aafaea6f748b85 (patch) | |
tree | 754058f2682c151caea880f5435fd4fd1c0ce3ac /sql | |
parent | 66b869d63740a146277346ba7f7ab06264cd6527 (diff) | |
download | mariadb-git-6a541923c390a81a64a373fc91aafaea6f748b85.tar.gz |
backporting fixes of bug@45940 to 5.1-rpl+2 to cover failures rpl_heartbeat_* as well.
mysql-test/suite/rpl/r/rpl_stop_middle_group.result:
the new result file
mysql-test/suite/rpl/t/rpl_stop_middle_group.test:
renamed from rpl_row_stop_middle_update and added a regression test for bug#45940.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/log_event.cc | 53 | ||||
-rw-r--r-- | sql/log_event_old.cc | 59 | ||||
-rw-r--r-- | sql/rpl_rli.cc | 1 | ||||
-rw-r--r-- | sql/rpl_rli.h | 10 | ||||
-rw-r--r-- | sql/slave.cc | 128 |
5 files changed, 132 insertions, 119 deletions
diff --git a/sql/log_event.cc b/sql/log_event.cc index d8dd40c91de..e2f0029eb6e 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -3272,6 +3272,21 @@ Default database: '%s'. Query: '%s'", */ } /* End of if (db_ok(... */ + {/** + The following failure injecion works in cooperation with tests + setting @@global.debug= 'd,stop_slave_middle_group'. + The sql thread receives the killed status and will proceed + to shutdown trying to finish incomplete events group. + */ + DBUG_EXECUTE_IF("stop_slave_middle_group", + if (strcmp("COMMIT", query) != 0 && + strcmp("BEGIN", query) != 0) + { + if (thd->transaction.all.modified_non_trans_table) + const_cast<Relay_log_info*>(rli)->abort_slave= 1; + };); + } + end: /* Probably we have set thd->query, thd->db, thd->catalog to point to places @@ -7475,8 +7490,16 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) thd->transaction.stmt.modified_non_trans_table= TRUE; } // row processing loop - DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event", - const_cast<Relay_log_info*>(rli)->abort_slave= 1;); + {/** + The following failure injecion works in cooperation with tests + setting @@global.debug= 'd,stop_slave_middle_group'. + The sql thread receives the killed status and will proceed + to shutdown trying to finish incomplete events group. + */ + DBUG_EXECUTE_IF("stop_slave_middle_group", + if (thd->transaction.all.modified_non_trans_table) + const_cast<Relay_log_info*>(rli)->abort_slave= 1;); + } if ((error= do_after_row_operations(rli, error)) && ignored_error_code(convert_handler_error(error, thd, table))) @@ -7513,32 +7536,6 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli) const_cast<Relay_log_info*>(rli)->cleanup_context(thd, error); thd->is_slave_error= 1; } - /* - This code would ideally be placed in do_update_pos() instead, but - since we have no access to table there, we do the setting of - last_event_start_time here instead. - */ - else if (table && (table->s->primary_key == MAX_KEY) && - !cache_stmt && get_flags(STMT_END_F) == RLE_NO_FLAGS) - { - /* - ------------ Temporary fix until WL#2975 is implemented --------- - - This event is not the last one (no STMT_END_F). If we stop now - (in case of terminate_slave_thread()), how will we restart? We - have to restart from Table_map_log_event, but as this table is - not transactional, the rows already inserted will still be - present, and idempotency is not guaranteed (no PK) so we risk - that repeating leads to double insert. So we desperately try to - continue, hope we'll eventually leave this buggy situation (by - executing the final Rows_log_event). If we are in a hopeless - wait (reached end of last relay log and nothing gets appended - there), we timeout after one minute, and notify DBA about the - problem. When WL#2975 is implemented, just remove the member - Relay_log_info::last_event_start_time and all its occurrences. - */ - const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0); - } DBUG_RETURN(error); } diff --git a/sql/log_event_old.cc b/sql/log_event_old.cc index 68cd2bf4673..a335210ce53 100644 --- a/sql/log_event_old.cc +++ b/sql/log_event_old.cc @@ -226,7 +226,7 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info row_start= row_end; } - DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event", + DBUG_EXECUTE_IF("stop_slave_middle_group", const_cast<Relay_log_info*>(rli)->abort_slave= 1;); error= do_after_row_operations(table, error); if (!ev->cache_stmt) @@ -269,34 +269,6 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info DBUG_RETURN(error); } - /* - This code would ideally be placed in do_update_pos() instead, but - since we have no access to table there, we do the setting of - last_event_start_time here instead. - */ - if (table && (table->s->primary_key == MAX_KEY) && - !ev->cache_stmt && - ev->get_flags(Old_rows_log_event::STMT_END_F) == Old_rows_log_event::RLE_NO_FLAGS) - { - /* - ------------ Temporary fix until WL#2975 is implemented --------- - - This event is not the last one (no STMT_END_F). If we stop now - (in case of terminate_slave_thread()), how will we restart? We - have to restart from Table_map_log_event, but as this table is - not transactional, the rows already inserted will still be - present, and idempotency is not guaranteed (no PK) so we risk - that repeating leads to double insert. So we desperately try to - continue, hope we'll eventually leave this buggy situation (by - executing the final Old_rows_log_event). If we are in a hopeless - wait (reached end of last relay log and nothing gets appended - there), we timeout after one minute, and notify DBA about the - problem. When WL#2975 is implemented, just remove the member - st_relay_log_info::last_event_start_time and all its occurences. - */ - const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0); - } - DBUG_RETURN(0); } #endif @@ -1744,7 +1716,7 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli) } // row processing loop - DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event", + DBUG_EXECUTE_IF("stop_slave_middle_group", const_cast<Relay_log_info*>(rli)->abort_slave= 1;); error= do_after_row_operations(rli, error); if (!cache_stmt) @@ -1787,33 +1759,6 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli) DBUG_RETURN(error); } - /* - This code would ideally be placed in do_update_pos() instead, but - since we have no access to table there, we do the setting of - last_event_start_time here instead. - */ - if (table && (table->s->primary_key == MAX_KEY) && - !cache_stmt && get_flags(STMT_END_F) == RLE_NO_FLAGS) - { - /* - ------------ Temporary fix until WL#2975 is implemented --------- - - This event is not the last one (no STMT_END_F). If we stop now - (in case of terminate_slave_thread()), how will we restart? We - have to restart from Table_map_log_event, but as this table is - not transactional, the rows already inserted will still be - present, and idempotency is not guaranteed (no PK) so we risk - that repeating leads to double insert. So we desperately try to - continue, hope we'll eventually leave this buggy situation (by - executing the final Old_rows_log_event). If we are in a hopeless - wait (reached end of last relay log and nothing gets appended - there), we timeout after one minute, and notify DBA about the - problem. When WL#2975 is implemented, just remove the member - Relay_log_info::last_event_start_time and all its occurrences. - */ - const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0); - } - DBUG_RETURN(0); } diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index b3a1bbc31d2..ec3adc79ca2 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -1195,7 +1195,6 @@ void Relay_log_info::cleanup_context(THD *thd, bool error) */ thd->options&= ~OPTION_NO_FOREIGN_KEY_CHECKS; thd->options&= ~OPTION_RELAXED_UNIQUE_CHECKS; - last_event_start_time= 0; DBUG_VOID_RETURN; } diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h index a5410dd0c79..fd36d18adae 100644 --- a/sql/rpl_rli.h +++ b/sql/rpl_rli.h @@ -349,12 +349,10 @@ public: void clear_tables_to_lock(); /* - Used by row-based replication to detect that it should not stop at - this event, but give it a chance to send more events. The time - where the last event inside a group started is stored here. If the - variable is zero, we are not in a group (but may be in a - transaction). - */ + Used to defer stopping the SQL thread to give it a chance + to finish up the current group of events. + The timestamp is set and reset in @c sql_slave_killed(). + */ time_t last_event_start_time; /** diff --git a/sql/slave.cc b/sql/slave.cc index 44993601e78..834ed8ed933 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -49,6 +49,10 @@ #define FLAGSTR(V,F) ((V)&(F)?#F" ":"") #define MAX_SLAVE_RETRY_PAUSE 5 +/* + a parameter of sql_slave_killed() to defer the killed status +*/ +#define SLAVE_WAIT_GROUP_DONE 60 bool use_slave_mask = 0; MY_BITMAP slave_error_mask; char slave_skip_error_names[SHOW_VAR_FUNC_BUFF_SIZE]; @@ -790,46 +794,95 @@ static bool io_slave_killed(THD* thd, Master_info* mi) DBUG_RETURN(mi->abort_slave || abort_loop || thd->killed); } +/** + The function analyzes a possible killed status and makes + a decision whether to accept it or not. + Normally upon accepting the sql thread goes to shutdown. + In the event of deffering decision @rli->last_event_start_time waiting + timer is set to force the killed status be accepted upon its expiration. + @param thd pointer to a THD instance + @param rli pointer to Relay_log_info instance + + @return TRUE the killed status is recognized, FALSE a possible killed + status is deferred. +*/ static bool sql_slave_killed(THD* thd, Relay_log_info* rli) { + bool ret= FALSE; DBUG_ENTER("sql_slave_killed"); DBUG_ASSERT(rli->sql_thd == thd); DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun if (abort_loop || thd->killed || rli->abort_slave) { - if (rli->abort_slave && rli->is_in_group() && - thd->transaction.all.modified_non_trans_table) - DBUG_RETURN(0); - /* - If we are in an unsafe situation (stopping could corrupt replication), - we give one minute to the slave SQL thread of grace before really - terminating, in the hope that it will be able to read more events and - the unsafe situation will soon be left. Note that this one minute starts - from the last time anything happened in the slave SQL thread. So it's - really one minute of idleness, we don't timeout if the slave SQL thread - is actively working. - */ - if (rli->last_event_start_time == 0) - DBUG_RETURN(1); - DBUG_PRINT("info", ("Slave SQL thread is in an unsafe situation, giving " - "it some grace period")); - if (difftime(time(0), rli->last_event_start_time) > 60) + if (thd->transaction.all.modified_non_trans_table && rli->is_in_group()) { - rli->report(ERROR_LEVEL, 0, - "SQL thread had to stop in an unsafe situation, in " - "the middle of applying updates to a " - "non-transactional table without any primary key. " - "There is a risk of duplicate updates when the slave " - "SQL thread is restarted. Please check your tables' " - "contents after restart."); - DBUG_RETURN(1); + char msg_stopped[]= + "... The slave SQL is stopped, leaving the current group " + "of events unfinished with a non-transaction table changed. " + "If the group consists solely of Row-based events, you can try " + "restarting the slave with --slave-exec-mode=IDEMPOTENT, which " + "ignores duplicate key, key not found, and similar errors (see " + "documentation for details)."; + + if (rli->abort_slave) + { + DBUG_PRINT("info", ("Slave SQL thread is being stopped in the middle of" + " a group having updated a non-trans table, giving" + " it some grace period")); + + /* + Slave sql thread shutdown in face of unfinished group modified + Non-trans table is handled via a timer. The slave may eventually + give out to complete the current group and in that case there + might be issues at consequent slave restart, see the error message. + WL#2975 offers a robust solution requiring to store the last exectuted + event's coordinates along with the group's coordianates + instead of waiting with @c last_event_start_time the timer. + */ + + if (rli->last_event_start_time == 0) + rli->last_event_start_time= my_time(0); + ret= difftime(my_time(0), rli->last_event_start_time) <= + SLAVE_WAIT_GROUP_DONE ? FALSE : TRUE; + + DBUG_EXECUTE_IF("stop_slave_middle_group", + DBUG_EXECUTE_IF("incomplete_group_in_relay_log", + ret= TRUE;);); // time is over + + if (ret == 0) + { + rli->report(WARNING_LEVEL, 0, + "slave SQL thread is being stopped in the middle " + "of applying of a group having updated a non-transaction " + "table; waiting for the group completion ... "); + } + else + { + rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR, + ER(ER_SLAVE_FATAL_ERROR), msg_stopped); + } + } + else + { + ret= TRUE; + rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR, ER(ER_SLAVE_FATAL_ERROR), + msg_stopped); + } + } + else + { + ret= TRUE; } } - DBUG_RETURN(0); + if (ret) + rli->last_event_start_time= 0; + + DBUG_RETURN(ret); } + /* skip_load_data_infile() @@ -2525,6 +2578,27 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli) delete ev; DBUG_RETURN(1); } + + { /** + The following failure injecion works in cooperation with tests + setting @@global.debug= 'd,incomplete_group_in_relay_log'. + Xid or Commit events are not executed to force the slave sql + read hanging if the realy log does not have any more events. + */ + DBUG_EXECUTE_IF("incomplete_group_in_relay_log", + if ((ev->get_type_code() == XID_EVENT) || + ((ev->get_type_code() == QUERY_EVENT) && + strcmp("COMMIT", ((Query_log_event *) ev)->query) == 0)) + { + DBUG_ASSERT(thd->transaction.all.modified_non_trans_table); + rli->abort_slave= 1; + pthread_mutex_unlock(&rli->data_lock); + delete ev; + rli->inc_event_relay_log_pos(); + DBUG_RETURN(0); + };); + } + exec_res= apply_event_and_update_pos(ev, thd, rli, TRUE); /* @@ -3953,7 +4027,7 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len) goto skip_relay_logging; } break; - + default: inc_pos= event_len; break; |