summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrei Elkin <andrei.elkin@mariadb.com>2021-04-12 16:32:58 +0300
committerAndrei Elkin <andrei.elkin@mariadb.com>2021-04-15 17:11:07 +0300
commit412e696fd2bc53f10298d01469e0915b0bb5806d (patch)
treeff6bd1dcd675e85c17e1f890439e2427d3abc13d
parent2fa526e26e859df1adb2fc5100747f17ede86f00 (diff)
downloadmariadb-git-bb-10.3-MDEV-21117.tar.gz
MDEV-25395 server recovery hits replication event checksum errorbb-10.3-MDEV-21117
When @@global.binlog_checksum is changing from none to crc32 and only the first of two Binlog_checkpoint_log_event gets written to the crc32 rotated binlog file before the server crashes, the recovery faces a checksum verification error. The reason was that recovery algorithm did not realize potentially varying binlog checksum at scanning of a sequence of binlog files. That is corrected now. The master-checksum-verify server recovery aborts now and stops the server due to the checksum error. A simulation test confirms the fixes. Induced changes: rpl_cant_read_event_incident got affected because the test adopts an external not properly closed binlog file that contains events which can't pass checksum verification. A work-around is done to accept that binlog and evade recovery. binlog_recover_checksum_error needed refinement as the master-checksum-verify configured server is made to stop due to the checksum error.
-rw-r--r--mysql-test/suite/binlog/r/binlog_recover_checksum_error.result10
-rw-r--r--mysql-test/suite/binlog/r/binlog_recovery_after_checksum_change.result8
-rw-r--r--mysql-test/suite/binlog/t/binlog_recover_checksum_error.test31
-rw-r--r--mysql-test/suite/binlog/t/binlog_recovery_after_checksum_change.test33
-rw-r--r--mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result3
-rw-r--r--mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc4
-rw-r--r--mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result3
-rw-r--r--sql/log.cc58
8 files changed, 126 insertions, 24 deletions
diff --git a/mysql-test/suite/binlog/r/binlog_recover_checksum_error.result b/mysql-test/suite/binlog/r/binlog_recover_checksum_error.result
index f65f7797d7f..882903fba38 100644
--- a/mysql-test/suite/binlog/r/binlog_recover_checksum_error.result
+++ b/mysql-test/suite/binlog/r/binlog_recover_checksum_error.result
@@ -1,6 +1,16 @@
set @@global.binlog_checksum = CRC32;
call mtr.add_suppression("Replication event checksum verification failed");
call mtr.add_suppression("Error in Log_event::read_log_event");
+call mtr.add_suppression("Error reading binlog event at file:.*pos:");
+call mtr.add_suppression("Crash recovery failed.");
+call mtr.add_suppression("Can.t init tc log");
+call mtr.add_suppression("Aborting");
+# Kill the server
+# Failed restart with corrupted events and --master_verify_checksum=ON
+NOT FOUND /mysqld got signal 11/ in mysqld.1.err
+FOUND 1 /\[ERROR\] Can\'t init tc log/ in mysqld.1.err
+FOUND 1 /\[ERROR\] Aborting/ in mysqld.1.err
+# Normal start
set @@global.debug_dbug = VALUE;
set @@global.master_verify_checksum = DO_CHECKSUM;
set @@global.binlog_checksum = BINLOG_CHECKSUM;
diff --git a/mysql-test/suite/binlog/r/binlog_recovery_after_checksum_change.result b/mysql-test/suite/binlog/r/binlog_recovery_after_checksum_change.result
new file mode 100644
index 00000000000..85b0b84c684
--- /dev/null
+++ b/mysql-test/suite/binlog/r/binlog_recovery_after_checksum_change.result
@@ -0,0 +1,8 @@
+connection default;
+set @@global.binlog_checksum=none;
+set @@session.debug_dbug='d,crash_before_write_second_checkpoint_event';
+set @@global.binlog_checksum=crc32;
+ERROR HY000: Lost connection to MySQL server during query
+connection default;
+NOT FOUND /Replication event checksum verification failed/ in mysqld.1.err
+End of the tests
diff --git a/mysql-test/suite/binlog/t/binlog_recover_checksum_error.test b/mysql-test/suite/binlog/t/binlog_recover_checksum_error.test
index 12c9f3d914f..5af57c59ef3 100644
--- a/mysql-test/suite/binlog/t/binlog_recover_checksum_error.test
+++ b/mysql-test/suite/binlog/t/binlog_recover_checksum_error.test
@@ -13,18 +13,37 @@ set @@global.binlog_checksum = CRC32;
call mtr.add_suppression("Replication event checksum verification failed");
call mtr.add_suppression("Error in Log_event::read_log_event");
+call mtr.add_suppression("Error reading binlog event at file:.*pos:");
+call mtr.add_suppression("Crash recovery failed.");
+call mtr.add_suppression("Can.t init tc log");
+call mtr.add_suppression("Aborting");
-# Proof of no crash follows.
# There's no need for actual bin-loggable queries to the server
---let $restart_parameters= --master_verify_checksum=ON --debug_dbug="+d,corrupt_read_log_event_char"
---let $shutdown_timeout=0
---source include/restart_mysqld.inc
---let $restart_parameters=
---let $shutdown_timeout=
+
+--source include/kill_mysqld.inc
+
+--echo # Failed restart with corrupted events and --master_verify_checksum=ON
+--error 1
+--exec $MYSQLD_LAST_CMD --master_verify_checksum=ON --debug_dbug="d,corrupt_read_log_event_char" >> $MYSQLTEST_VARDIR/log/mysqld.1.err 2>&1
+
+# Proof of no crash occured ...
+let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.1.err;
+--let SEARCH_FILE=$log_error_
+--let SEARCH_PATTERN=mysqld got signal 11
+--source include/search_pattern_in_file.inc
+
+# ... and that the server stopped with expected errors
+--let SEARCH_PATTERN=\[ERROR\] Can\'t init tc log
+--source include/search_pattern_in_file.inc
+--let SEARCH_PATTERN=\[ERROR\] Aborting
+--source include/search_pattern_in_file.inc
#
# Cleanup
+--echo # Normal start
+--source include/start_mysqld.inc
+
--replace_regex /= .*/= VALUE/
--eval set @@global.debug_dbug = "$debug_dbug_saved"
diff --git a/mysql-test/suite/binlog/t/binlog_recovery_after_checksum_change.test b/mysql-test/suite/binlog/t/binlog_recovery_after_checksum_change.test
new file mode 100644
index 00000000000..1c3fc20875f
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_recovery_after_checksum_change.test
@@ -0,0 +1,33 @@
+--source include/have_debug.inc
+--source include/have_binlog_format_mixed.inc
+
+# MDEV-25395 server recovery hits replication event checksum error
+#
+# Check that the server restarts without any issue, specifically no MDEV reported
+# errors exists.
+
+--connection default
+set @@global.binlog_checksum=none;
+set @@session.debug_dbug='d,crash_before_write_second_checkpoint_event';
+--exec echo "wait" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+--error 2013
+set @@global.binlog_checksum=crc32;
+--source include/wait_until_disconnected.inc
+
+#
+--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart: --master-verify-checksum=1
+EOF
+
+--connection default
+--enable_reconnect
+--source include/wait_until_connected_again.inc
+
+# Proof:
+# check error log for lack of error messages of the report.
+--let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.1.err
+--let SEARCH_FILE=$log_error_
+--let SEARCH_PATTERN=Replication event checksum verification failed
+--source include/search_pattern_in_file.inc
+
+--echo End of the tests
diff --git a/mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result b/mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result
index 5aff978538f..83747db3aca 100644
--- a/mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result
+++ b/mysql-test/suite/binlog_encryption/rpl_cant_read_event_incident.result
@@ -5,7 +5,8 @@ include/stop_slave.inc
connection master;
call mtr.add_suppression("Error in Log_event::read_log_event()");
include/rpl_stop_server.inc [server_number=1]
-include/rpl_start_server.inc [server_number=1]
+include/rpl_start_server.inc [server_number=1 parameters: --debug_dbug="d,accept_binlog_without_recovery"]
+set @@global.debug_dbug="";
show binlog events;
ERROR HY000: Error when executing command SHOW BINLOG EVENTS: Wrong offset or I/O error
connection slave;
diff --git a/mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc b/mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc
index 7dfef023947..4ff71dea74e 100644
--- a/mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc
+++ b/mysql-test/suite/rpl/include/rpl_cant_read_event_incident.inc
@@ -17,7 +17,7 @@
# Crash is not simulated. A binlog with partly written event in its end is installed
# and replication is started from it.
#
-
+--source include/have_debug.inc
--source include/have_binlog_format_mixed.inc
--source include/master-slave.inc
@@ -39,9 +39,11 @@ call mtr.add_suppression("Error in Log_event::read_log_event()");
--copy_file $MYSQL_TEST_DIR/std_data/bug11747416_32228_binlog.000001 $datadir/master-bin.000001
--let $rpl_server_number= 1
+--let $rpl_server_parameters=--debug_dbug="d,accept_binlog_without_recovery"
--source include/rpl_start_server.inc
--source include/wait_until_connected_again.inc
+set @@global.debug_dbug="";
# evidence of the partial binlog
--error ER_ERROR_WHEN_EXECUTING_COMMAND
diff --git a/mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result b/mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result
index 5aff978538f..83747db3aca 100644
--- a/mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result
+++ b/mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result
@@ -5,7 +5,8 @@ include/stop_slave.inc
connection master;
call mtr.add_suppression("Error in Log_event::read_log_event()");
include/rpl_stop_server.inc [server_number=1]
-include/rpl_start_server.inc [server_number=1]
+include/rpl_start_server.inc [server_number=1 parameters: --debug_dbug="d,accept_binlog_without_recovery"]
+set @@global.debug_dbug="";
show binlog events;
ERROR HY000: Error when executing command SHOW BINLOG EVENTS: Wrong offset or I/O error
connection slave;
diff --git a/sql/log.cc b/sql/log.cc
index a90d1e757e8..bd48136d317 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -6784,6 +6784,11 @@ void MYSQL_BIN_LOG::purge()
void MYSQL_BIN_LOG::checkpoint_and_purge(ulong binlog_id)
{
+ DBUG_EXECUTE_IF("crash_before_write_second_checkpoint_event",
+ flush_io_cache(&log_file);
+ mysql_file_sync(log_file.file, MYF(MY_WME));
+ DBUG_SUICIDE(););
+
do_checkpoint_request(binlog_id);
purge();
}
@@ -10202,7 +10207,6 @@ start_binlog_background_thread()
class Recovery_context
{
public:
- my_off_t prev_event_pos;
rpl_gtid last_gtid;
bool last_gtid_standalone;
bool last_gtid_valid;
@@ -10300,7 +10304,8 @@ public:
Assigns last_gtid and assesses the maximum (in the binlog offset term)
unsafe gtid (group of events).
*/
- void process_gtid(int round, Gtid_log_event *gev, LOG_INFO *linfo);
+ void process_gtid(int round, Gtid_log_event *gev, LOG_INFO *linfo,
+ my_off_t prev_event_pos);
/*
Compute next action at the end of processing of the current binlog file.
@@ -10429,7 +10434,6 @@ bool Recovery_context::complete(MYSQL_BIN_LOG *log, HASH &xids)
}
Recovery_context::Recovery_context() :
- prev_event_pos(0),
last_gtid_standalone(false), last_gtid_valid(false), last_gtid_no2pc(false),
last_gtid_engines(0),
do_truncate(rpl_semi_sync_slave_enabled),
@@ -10635,7 +10639,7 @@ void Recovery_context::update_binlog_unsafe_coord_if_needed(LOG_INFO *linfo)
}
void Recovery_context::process_gtid(int round, Gtid_log_event *gev,
- LOG_INFO *linfo)
+ LOG_INFO *linfo, my_off_t prev_event_pos)
{
last_gtid.domain_id= gev->domain_id;
last_gtid.server_id= gev->server_id;
@@ -10724,16 +10728,18 @@ int Recovery_context::next_binlog_or_round(int& round,
int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
IO_CACHE *first_log,
- Format_description_log_event *fdle, bool do_xa)
+ Format_description_log_event *fdle_arg, bool do_xa)
{
Log_event *ev= NULL;
+ my_off_t prev_event_pos= 0;
HASH xids;
MEM_ROOT mem_root;
char binlog_checkpoint_name[FN_REFLEN];
bool binlog_checkpoint_found;
- IO_CACHE log;
+ IO_CACHE log, *curr_log= first_log;
File file= -1;
const char *errmsg;
+ Format_description_log_event *fdle= fdle_arg;
#ifdef HAVE_REPLICATION
Recovery_context ctx;
#endif
@@ -10776,13 +10782,21 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
binlog_checkpoint_found= false;
for (round= 1;;)
{
- while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log,
- fdle, opt_master_verify_checksum))
+ while ((ev= Log_event::read_log_event(curr_log, fdle,
+ opt_master_verify_checksum))
&& ev->is_valid())
{
enum Log_event_type typ= ev->get_type_code();
switch (typ)
{
+ case FORMAT_DESCRIPTION_EVENT:
+ if (round > 1)
+ {
+ if (fdle != fdle_arg)
+ delete fdle;
+ fdle= (Format_description_log_event*) ev;
+ }
+ break;
case XID_EVENT:
if (do_xa)
{
@@ -10835,7 +10849,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
break;
case GTID_EVENT:
- ctx.process_gtid(round, (Gtid_log_event *)ev, linfo);
+ ctx.process_gtid(round, (Gtid_log_event *)ev, linfo, prev_event_pos);
break;
case QUERY_EVENT:
@@ -10877,12 +10891,19 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
goto err2;
ctx.last_gtid_valid= false;
}
- ctx.prev_event_pos= ev->log_pos;
#endif
- delete ev;
+ prev_event_pos= ev->log_pos;
+ if (typ != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
ev= NULL;
} // end of while
+ if (curr_log->error != 0)
+ {
+ sql_print_error("Error reading binlog event at file:%s pos:%llu during "
+ "recovery.", linfo->log_file_name, prev_event_pos);
+ goto err2;
+ }
if (!do_xa)
break;
/*
@@ -10908,10 +10929,11 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
"for recovery. Aborting.", binlog_checkpoint_name);
goto err2;
}
+ curr_log= &log;
}
else
{
- end_io_cache(&log);
+ end_io_cache(curr_log);
mysql_file_close(file, MYF(MY_WME));
file= -1;
/*
@@ -10939,7 +10961,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
round++;
#endif
- if ((file= open_binlog(&log, linfo->log_file_name, &errmsg)) < 0)
+ if ((file= open_binlog(curr_log, linfo->log_file_name, &errmsg)) < 0)
{
sql_print_error("%s", errmsg);
goto err2;
@@ -10961,6 +10983,9 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
free_root(&mem_root, MYF(0));
my_hash_free(&xids);
}
+ if (fdle != fdle_arg)
+ delete fdle;
+
return 0;
err2:
@@ -10980,11 +11005,13 @@ err1:
"(if it's, for example, out of memory error) and restart, "
"or delete (or rename) binary log and start mysqld with "
"--tc-heuristic-recover={commit|rollback}");
+ if (fdle != fdle_arg)
+ delete fdle;
+
return 1;
}
-
int
MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
{
@@ -11046,7 +11073,8 @@ MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
opt_master_verify_checksum)) &&
ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
{
- if (ev->flags & LOG_EVENT_BINLOG_IN_USE_F)
+ if (DBUG_EVALUATE_IF("accept_binlog_without_recovery", 0,
+ (ev->flags & LOG_EVENT_BINLOG_IN_USE_F)))
{
sql_print_information("Recovering after a crash using %s", opt_name);
error= recover(&log_info, log_name, &log,