From 77bc81b89ca65edb244fe4da30ca61e2f97cb5de Mon Sep 17 00:00:00 2001 From: Libing Song Date: Wed, 19 Jan 2011 01:23:49 +0800 Subject: Bug#58546 test rpl_packet timeout failure sporadically on PB rpl_packet got a timeout failure sporadically on PB when stopping slave. The real reason of this bug is that STOP SLAVE stopped IO thread first and then stopped SQL thread. It was possible that IO thread stopped after replicating part of a transaction which SQL thread was executing. SQL thread would be hung if the transaction could not be rolled back safely. After this patch, STOP SLAVE will stop SQL thread first and then stop IO thread, which guarantees that IO thread will fetch the reset of the events of the transaction that SQL thread is executing, so that SQL thread can finish the transaction if it cannot be rolled back safely. Added below auxiliary files to make the test code neater. restart_slave_sql.inc rpl_connection_master.inc rpl_connection_slave.inc rpl_connection_slave1.inc --- sql/sql_repl.cc | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'sql/sql_repl.cc') diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index cb5aac863c0..0ade0b759d5 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -21,6 +21,7 @@ #include "log_event.h" #include "rpl_filter.h" #include +#include "debug_sync.h" int max_binlog_dump_events = 0; // unlimited my_bool opt_sporadic_binlog_dump_fail = 0; @@ -556,6 +557,20 @@ impossible position"; } #endif + DBUG_EXECUTE_IF("dump_thread_wait_before_send_xid", + { + if ((*packet)[EVENT_TYPE_OFFSET+1] == XID_EVENT) + { + net_flush(net); + const char act[]= + "now " + "wait_for signal.continue"; + DBUG_ASSERT(opt_debug_sync_timeout > 0); + DBUG_ASSERT(!debug_sync_set_action(current_thd, + STRING_WITH_LEN(act))); + } + }); + if ((*packet)[EVENT_TYPE_OFFSET+1] == FORMAT_DESCRIPTION_EVENT) { binlog_can_be_corrupted= test((*packet)[FLAGS_OFFSET+1] & @@ -572,6 +587,14 @@ impossible position"; goto err; } + DBUG_EXECUTE_IF("dump_thread_wait_before_send_xid", + { + if ((*packet)[EVENT_TYPE_OFFSET+1] == XID_EVENT) + { + net_flush(net); + } + }); + DBUG_PRINT("info", ("log event code %d", (*packet)[LOG_EVENT_OFFSET+1] )); if ((*packet)[LOG_EVENT_OFFSET+1] == LOAD_EVENT) -- cgit v1.2.1 From 49b7a1f48d4e263e572df825ab89004893963f87 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 24 Jan 2011 11:48:54 +0800 Subject: Postfix BUG#58546 Updated Copyright. --- sql/sql_repl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'sql/sql_repl.cc') diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index 0ade0b759d5..8c769ce6acf 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -1,4 +1,4 @@ -/* Copyright (C) 2000-2006 MySQL AB & Sasha +/* Copyright (C) 2000, 2011, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -- cgit v1.2.1 From 235e10d987222b8cc42e1f3a9a79834912d45026 Mon Sep 17 00:00:00 2001 From: Alfranio Correia Date: Fri, 28 Jan 2011 01:25:26 +0000 Subject: BUG#55675 rpl.rpl_log_pos fails sporadically with error binlog truncated in the middle There are two calls to read_log_event() on master in mysql_binlog_send(). Each call reads 19 bytes in this test case and the error of the second read_log_event() is reported to the slave. The second read_log_event() starts from position 94 (75 + 19) to 113 (75 + 19 + 19). Usually, there are two events in the binary log: . 0 - 3 - Header . 4 - 105 - Format Descriptor Event . 106 - 304 - Query Event and both reads fail because operations are reading from invalid positions as expected. However, mysql_binlog_send() does not use the same IO_CACHE that is used to write into binary log (i.e. mysql_bin_log.log_file) for the hot binary log. It opens the binary log file directly by calling open_binlog() and creates a separated IO_CACHE. So there is a possibly that after a master has flushed the binary log file, the content has been cached by the filesystem, and has not updated the disk file. If this happens, then a slave will only see part of the file, and thus the second read_log_event() will report event truncated error. To fix the problem, if the first read_log_event() has failed, we ensure that the second one will try to read from the same position. --- sql/sql_repl.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'sql/sql_repl.cc') diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index 8c769ce6acf..2a1efab13a9 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -545,8 +545,10 @@ impossible position"; while (!net->error && net->vio != 0 && !thd->killed) { + my_off_t prev_pos= pos; while (!(error = Log_event::read_log_event(&log, packet, log_lock))) { + prev_pos= my_b_tell(&log); #ifndef DBUG_OFF if (max_binlog_dump_events && !left_events--) { @@ -613,8 +615,13 @@ impossible position"; here we were reading binlog that was not closed properly (as a result of a crash ?). treat any corruption as EOF */ - if (binlog_can_be_corrupted && error != LOG_READ_MEM) + if (binlog_can_be_corrupted && + error != LOG_READ_MEM && error != LOG_READ_EOF) + { + my_b_seek(&log, prev_pos); error=LOG_READ_EOF; + } + /* TODO: now that we are logging the offset, check to make sure the recorded offset and the actual match. -- cgit v1.2.1