diff options
author | Andrei Elkin <andrei.elkin@mariadb.com> | 2021-04-12 16:32:58 +0300 |
---|---|---|
committer | Andrei Elkin <andrei.elkin@mariadb.com> | 2021-04-15 17:11:07 +0300 |
commit | 412e696fd2bc53f10298d01469e0915b0bb5806d (patch) | |
tree | ff6bd1dcd675e85c17e1f890439e2427d3abc13d | |
parent | 2fa526e26e859df1adb2fc5100747f17ede86f00 (diff) | |
download | mariadb-git-bb-10.3-MDEV-21117.tar.gz |
MDEV-25395 server recovery hits replication event checksum errorbb-10.3-MDEV-21117
When @@global.binlog_checksum is changing from none to
crc32 and only the first of two Binlog_checkpoint_log_event gets written to
the crc32 rotated binlog file before the server crashes, the recovery faces a checksum
verification error.
The reason was that recovery algorithm did not realize
potentially varying binlog checksum at scanning of
a sequence of binlog files.
That is corrected now.
The master-checksum-verify server recovery aborts now and stops
the server due to the checksum error.
A simulation test confirms the fixes.
Induced changes:
rpl_cant_read_event_incident got affected because the test
adopts an external not properly closed binlog file that contains
events which can't pass checksum verification.
A work-around is done to accept that binlog and evade recovery.
binlog_recover_checksum_error needed refinement as
the master-checksum-verify configured server is made to stop
due to the checksum error.
8 files changed, 126 insertions, 24 deletions
diff --git a/mysql-test/suite/binlog/r/binlog_recover_checksum_error.result b/mysql-test/suite/binlog/r/binlog_recover_checksum_error.result index f65f7797d7f..882903fba38 100644 --- a/mysql-test/suite/binlog/r/binlog_recover_checksum_error.result +++ b/mysql-test/suite/binlog/r/binlog_recover_checksum_error.result @@ -1,6 +1,16 @@ set @@global.binlog_checksum = CRC32; call mtr.add_suppression("Replication event checksum verification failed"); call mtr.add_suppression("Error in Log_event::read_log_event"); +call mtr.add_suppression("Error reading binlog event at file:.*pos:"); +call mtr.add_suppression("Crash recovery failed."); +call mtr.add_suppression("Can.t init tc log"); +call mtr.add_suppression("Aborting"); +# Kill the server +# Failed restart with corrupted events and --master_verify_checksum=ON +NOT FOUND /mysqld got signal 11/ in mysqld.1.err +FOUND 1 /\[ERROR\] Can\'t init tc log/ in mysqld.1.err +FOUND 1 /\[ERROR\] Aborting/ in mysqld.1.err +# Normal start set @@global.debug_dbug = VALUE; set @@global.master_verify_checksum = DO_CHECKSUM; set @@global.binlog_checksum = BINLOG_CHECKSUM; diff --git a/mysql-test/suite/binlog/r/binlog_recovery_after_checksum_change.result b/mysql-test/suite/binlog/r/binlog_recovery_after_checksum_change.result new file mode 100644 index 00000000000..85b0b84c684 --- /dev/null +++ b/mysql-test/suite/binlog/r/binlog_recovery_after_checksum_change.result @@ -0,0 +1,8 @@ +connection default; +set @@global.binlog_checksum=none; +set @@session.debug_dbug='d,crash_before_write_second_checkpoint_event'; +set @@global.binlog_checksum=crc32; +ERROR HY000: Lost connection to MySQL server during query +connection default; +NOT FOUND /Replication event checksum verification failed/ in mysqld.1.err +End of the tests diff --git a/mysql-test/suite/binlog/t/binlog_recover_checksum_error.test b/mysql-test/suite/binlog/t/binlog_recover_checksum_error.test index 12c9f3d914f..5af57c59ef3 100644 --- a/mysql-test/suite/binlog/t/binlog_recover_checksum_error.test +++ b/mysql-test/suite/binlog/t/binlog_recover_checksum_error.test @@ -13,18 +13,37 @@ set @@global.binlog_checksum = CRC32; call mtr.add_suppression("Replication event checksum verification failed"); call mtr.add_suppression("Error in Log_event::read_log_event"); +call mtr.add_suppression("Error reading binlog event at file:.*pos:"); +call mtr.add_suppression("Crash recovery failed."); +call mtr.add_suppression("Can.t init tc log"); +call mtr.add_suppression("Aborting"); -# Proof of no crash follows. # There's no need for actual bin-loggable queries to the server ---let $restart_parameters= --master_verify_checksum=ON --debug_dbug="+d,corrupt_read_log_event_char" ---let $shutdown_timeout=0 ---source include/restart_mysqld.inc ---let $restart_parameters= ---let $shutdown_timeout= + +--source include/kill_mysqld.inc + +--echo # Failed restart with corrupted events and --master_verify_checksum=ON +--error 1 +--exec $MYSQLD_LAST_CMD --master_verify_checksum=ON --debug_dbug="d,corrupt_read_log_event_char" >> $MYSQLTEST_VARDIR/log/mysqld.1.err 2>&1 + +# Proof of no crash occured ... +let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.1.err; +--let SEARCH_FILE=$log_error_ +--let SEARCH_PATTERN=mysqld got signal 11 +--source include/search_pattern_in_file.inc + +# ... and that the server stopped with expected errors +--let SEARCH_PATTERN=\[ERROR\] Can\'t init tc log +--source include/search_pattern_in_file.inc +--let SEARCH_PATTERN=\[ERROR\] Aborting +--source include/search_pattern_in_file.inc # # Cleanup +--echo # Normal start +--source include/start_mysqld.inc + --replace_regex /= .*/= VALUE/ --eval set @@global.debug_dbug = "$debug_dbug_saved" diff --git a/mysql-test/suite/binlog/t/binlog_recovery_after_checksum_change.test b/mysql-test/suite/binlog/t/binlog_recovery_after_checksum_change.test new file mode 100644 index 00000000000..1c3fc20875f --- /dev/null +++ b/mysql-test/suite/binlog/t/binlog_recovery_after_checksum_change.test @@ -0,0 +1,33 @@ +--source include/have_debug.inc +--source include/have_binlog_format_mixed.inc + +# MDEV-25395 server recovery hits replication event checksum error +# +# Check that the server restarts without any issue, specifically no MDEV reported +# errors exists. + +--connection default +set @@global.binlog_checksum=none; +set @@session.debug_dbug='d,crash_before_write_second_checkpoint_event'; +--exec echo "wait" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +--error 2013 +set @@global.binlog_checksum=crc32; +--source include/wait_until_disconnected.inc + +# +--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +restart: --master-verify-checksum=1 +EOF + +--connection default +--enable_reconnect +--source include/wait_until_connected_again.inc + +# Proof: +# check error log for lack of error messages of the report. +--let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.1.err +--let SEARCH_FILE=$log_error_ +--let SEARCH_PATTERN=Replication event checksum verification failed +--source include/search_pattern_in_file.inc + +--echo End of the tests diff --git a/mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result b/mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result index 5aff978538f..83747db3aca 100644 --- a/mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result +++ b/mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result @@ -5,7 +5,8 @@ include/stop_slave.inc connection master; call mtr.add_suppression("Error in Log_event::read_log_event()"); include/rpl_stop_server.inc [server_number=1] -include/rpl_start_server.inc [server_number=1] +include/rpl_start_server.inc [server_number=1 parameters: --debug_dbug="d,accept_binlog_without_recovery"] +set @@global.debug_dbug=""; show binlog events; ERROR HY000: Error when executing command SHOW BINLOG EVENTS: Wrong offset or I/O error connection slave; diff --git a/mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc b/mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc index 7dfef023947..4ff71dea74e 100644 --- a/mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc +++ b/mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc @@ -17,7 +17,7 @@ # Crash is not simulated. A binlog with partly written event in its end is installed # and replication is started from it. # - +--source include/have_debug.inc --source include/have_binlog_format_mixed.inc --source include/master-slave.inc @@ -39,9 +39,11 @@ call mtr.add_suppression("Error in Log_event::read_log_event()"); --copy_file $MYSQL_TEST_DIR/std_data/bug11747416_32228_binlog.000001 $datadir/master-bin.000001 --let $rpl_server_number= 1 +--let $rpl_server_parameters=--debug_dbug="d,accept_binlog_without_recovery" --source include/rpl_start_server.inc --source include/wait_until_connected_again.inc +set @@global.debug_dbug=""; # evidence of the partial binlog --error ER_ERROR_WHEN_EXECUTING_COMMAND diff --git a/mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result b/mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result index 5aff978538f..83747db3aca 100644 --- a/mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result +++ b/mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result @@ -5,7 +5,8 @@ include/stop_slave.inc connection master; call mtr.add_suppression("Error in Log_event::read_log_event()"); include/rpl_stop_server.inc [server_number=1] -include/rpl_start_server.inc [server_number=1] +include/rpl_start_server.inc [server_number=1 parameters: --debug_dbug="d,accept_binlog_without_recovery"] +set @@global.debug_dbug=""; show binlog events; ERROR HY000: Error when executing command SHOW BINLOG EVENTS: Wrong offset or I/O error connection slave; diff --git a/sql/log.cc b/sql/log.cc index a90d1e757e8..bd48136d317 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -6784,6 +6784,11 @@ void MYSQL_BIN_LOG::purge() void MYSQL_BIN_LOG::checkpoint_and_purge(ulong binlog_id) { + DBUG_EXECUTE_IF("crash_before_write_second_checkpoint_event", + flush_io_cache(&log_file); + mysql_file_sync(log_file.file, MYF(MY_WME)); + DBUG_SUICIDE();); + do_checkpoint_request(binlog_id); purge(); } @@ -10202,7 +10207,6 @@ start_binlog_background_thread() class Recovery_context { public: - my_off_t prev_event_pos; rpl_gtid last_gtid; bool last_gtid_standalone; bool last_gtid_valid; @@ -10300,7 +10304,8 @@ public: Assigns last_gtid and assesses the maximum (in the binlog offset term) unsafe gtid (group of events). */ - void process_gtid(int round, Gtid_log_event *gev, LOG_INFO *linfo); + void process_gtid(int round, Gtid_log_event *gev, LOG_INFO *linfo, + my_off_t prev_event_pos); /* Compute next action at the end of processing of the current binlog file. @@ -10429,7 +10434,6 @@ bool Recovery_context::complete(MYSQL_BIN_LOG *log, HASH &xids) } Recovery_context::Recovery_context() : - prev_event_pos(0), last_gtid_standalone(false), last_gtid_valid(false), last_gtid_no2pc(false), last_gtid_engines(0), do_truncate(rpl_semi_sync_slave_enabled), @@ -10635,7 +10639,7 @@ void Recovery_context::update_binlog_unsafe_coord_if_needed(LOG_INFO *linfo) } void Recovery_context::process_gtid(int round, Gtid_log_event *gev, - LOG_INFO *linfo) + LOG_INFO *linfo, my_off_t prev_event_pos) { last_gtid.domain_id= gev->domain_id; last_gtid.server_id= gev->server_id; @@ -10724,16 +10728,18 @@ int Recovery_context::next_binlog_or_round(int& round, int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, IO_CACHE *first_log, - Format_description_log_event *fdle, bool do_xa) + Format_description_log_event *fdle_arg, bool do_xa) { Log_event *ev= NULL; + my_off_t prev_event_pos= 0; HASH xids; MEM_ROOT mem_root; char binlog_checkpoint_name[FN_REFLEN]; bool binlog_checkpoint_found; - IO_CACHE log; + IO_CACHE log, *curr_log= first_log; File file= -1; const char *errmsg; + Format_description_log_event *fdle= fdle_arg; #ifdef HAVE_REPLICATION Recovery_context ctx; #endif @@ -10776,13 +10782,21 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, binlog_checkpoint_found= false; for (round= 1;;) { - while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log, - fdle, opt_master_verify_checksum)) + while ((ev= Log_event::read_log_event(curr_log, fdle, + opt_master_verify_checksum)) && ev->is_valid()) { enum Log_event_type typ= ev->get_type_code(); switch (typ) { + case FORMAT_DESCRIPTION_EVENT: + if (round > 1) + { + if (fdle != fdle_arg) + delete fdle; + fdle= (Format_description_log_event*) ev; + } + break; case XID_EVENT: if (do_xa) { @@ -10835,7 +10849,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, break; case GTID_EVENT: - ctx.process_gtid(round, (Gtid_log_event *)ev, linfo); + ctx.process_gtid(round, (Gtid_log_event *)ev, linfo, prev_event_pos); break; case QUERY_EVENT: @@ -10877,12 +10891,19 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, goto err2; ctx.last_gtid_valid= false; } - ctx.prev_event_pos= ev->log_pos; #endif - delete ev; + prev_event_pos= ev->log_pos; + if (typ != FORMAT_DESCRIPTION_EVENT) + delete ev; ev= NULL; } // end of while + if (curr_log->error != 0) + { + sql_print_error("Error reading binlog event at file:%s pos:%llu during " + "recovery.", linfo->log_file_name, prev_event_pos); + goto err2; + } if (!do_xa) break; /* @@ -10908,10 +10929,11 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, "for recovery. Aborting.", binlog_checkpoint_name); goto err2; } + curr_log= &log; } else { - end_io_cache(&log); + end_io_cache(curr_log); mysql_file_close(file, MYF(MY_WME)); file= -1; /* @@ -10939,7 +10961,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, round++; #endif - if ((file= open_binlog(&log, linfo->log_file_name, &errmsg)) < 0) + if ((file= open_binlog(curr_log, linfo->log_file_name, &errmsg)) < 0) { sql_print_error("%s", errmsg); goto err2; @@ -10961,6 +10983,9 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, free_root(&mem_root, MYF(0)); my_hash_free(&xids); } + if (fdle != fdle_arg) + delete fdle; + return 0; err2: @@ -10980,11 +11005,13 @@ err1: "(if it's, for example, out of memory error) and restart, " "or delete (or rename) binary log and start mysqld with " "--tc-heuristic-recover={commit|rollback}"); + if (fdle != fdle_arg) + delete fdle; + return 1; } - int MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery) { @@ -11046,7 +11073,8 @@ MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery) opt_master_verify_checksum)) && ev->get_type_code() == FORMAT_DESCRIPTION_EVENT) { - if (ev->flags & LOG_EVENT_BINLOG_IN_USE_F) + if (DBUG_EVALUATE_IF("accept_binlog_without_recovery", 0, + (ev->flags & LOG_EVENT_BINLOG_IN_USE_F))) { sql_print_information("Recovering after a crash using %s", opt_name); error= recover(&log_info, log_name, &log, |