From 31f09e36c183d026a28f42ddbb9be2229613a3ed Mon Sep 17 00:00:00 2001 From: Brandon Nesterenko Date: Tue, 18 Apr 2023 13:22:43 -0600 Subject: MDEV-31038: Parallel Replication Breaks if XA PREPARE Fails Updating Slave GTID State If a replica failed to update the GTID slave state when committing an XA PREPARE, the replica would retry the transaction and get an out-of-order GTID error. This is because the commit phase of an XA PREPARE is bifurcated: first, the prepare is handled by the relevant storage engines; second, the GTID slave state is updated as a separate autocommit transaction. If the second phase fails and the transaction is retried, the replica attempts to commit the same transaction again, resulting in a GTID out-of-order error. This patch fixes the problem by immediately stopping the slave and reporting the appropriate error. Previously, there was logic to bypass the error when updating the GTID slave state table if the underlying error was allowed for retry on a parallel slave. This patch adds a parameter to disallow the error bypass, so that the error is still raised. 
Reviewed By ============ Andrei Elkin --- sql/log_event_server.cc | 64 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 19 deletions(-) (limited to 'sql/log_event_server.cc') diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index c5fb637a000..a2b78bc241d 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -152,6 +152,30 @@ is_parallel_retry_error(rpl_group_info *rgi, int err) return has_temporary_error(rgi->thd); } +/** + Accumulate a Diagnostics_area's errors and warnings into an output buffer + + @param errbuf The output buffer to write error messages + @param errbuf_size The size of the output buffer + @param da The Diagnostics_area to check for errors +*/ +static void inline aggregate_da_errors(char *errbuf, size_t errbuf_size, + Diagnostics_area *da) +{ + const char *errbuf_end= errbuf + errbuf_size; + char *slider; + Diagnostics_area::Sql_condition_iterator it= da->sql_conditions(); + const Sql_condition *err; + size_t len; + for (err= it++, slider= errbuf; err && slider < errbuf_end - 1; + slider += len, err= it++) + { + len= my_snprintf(slider, errbuf_end - slider, + " %s, Error_code: %d;", err->get_message_text(), + err->get_sql_errno()); + } +} + /** Error reporting facility for Rows_log_event::do_apply_event @@ -172,13 +196,8 @@ static void inline slave_rows_error_report(enum loglevel level, int ha_error, const char *log_name, my_off_t pos) { const char *handler_error= (ha_error ? HA_ERR(ha_error) : NULL); - char buff[MAX_SLAVE_ERRMSG], *slider; - const char *buff_end= buff + sizeof(buff); - size_t len; - Diagnostics_area::Sql_condition_iterator it= - thd->get_stmt_da()->sql_conditions(); + char buff[MAX_SLAVE_ERRMSG]; Relay_log_info const *rli= rgi->rli; - const Sql_condition *err; buff[0]= 0; int errcode= thd->is_error() ? 
thd->get_stmt_da()->sql_errno() : 0; @@ -191,13 +210,7 @@ static void inline slave_rows_error_report(enum loglevel level, int ha_error, if (is_parallel_retry_error(rgi, errcode)) return; - for (err= it++, slider= buff; err && slider < buff_end - 1; - slider += len, err= it++) - { - len= my_snprintf(slider, buff_end - slider, - " %s, Error_code: %d;", err->get_message_text(), - err->get_sql_errno()); - } + aggregate_da_errors(buff, sizeof(buff), thd->get_stmt_da()); if (ha_error != 0) rli->report(level, errcode, rgi->gtid_info(), @@ -3893,7 +3906,8 @@ bool slave_execute_deferred_events(THD *thd) #if defined(HAVE_REPLICATION) int Xid_apply_log_event::do_record_gtid(THD *thd, rpl_group_info *rgi, - bool in_trans, void **out_hton) + bool in_trans, void **out_hton, + bool force_err) { int err= 0; Relay_log_info const *rli= rgi->rli; @@ -3908,14 +3922,26 @@ int Xid_apply_log_event::do_record_gtid(THD *thd, rpl_group_info *rgi, int ec= thd->get_stmt_da()->sql_errno(); /* Do not report an error if this is really a kill due to a deadlock. - In this case, the transaction will be re-tried instead. + In this case, the transaction will be re-tried instead. Unless force_err + is set, as in the case of XA PREPARE, as the GTID state is updated as a + separate transaction, and if that fails, we should not retry but exit in + error immediately. 
*/ - if (!is_parallel_retry_error(rgi, ec)) + if (!is_parallel_retry_error(rgi, ec) || force_err) + { + char buff[MAX_SLAVE_ERRMSG]; + buff[0]= 0; + aggregate_da_errors(buff, sizeof(buff), thd->get_stmt_da()); + + if (force_err) + thd->clear_error(); + rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE, rgi->gtid_info(), "Error during XID COMMIT: failed to update GTID state in " - "%s.%s: %d: %s", + "%s.%s: %d: %s the event's master log %s, end_log_pos %llu", "mysql", rpl_gtid_slave_state_table_name.str, ec, - thd->get_stmt_da()->message()); + buff, RPL_LOG_NAME, log_pos); + } thd->is_slave_error= 1; } @@ -3989,7 +4015,7 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi) { DBUG_ASSERT(!thd->transaction->xid_state.is_explicit_XA()); - if ((err= do_record_gtid(thd, rgi, false, &hton))) + if ((err= do_record_gtid(thd, rgi, false, &hton, true))) return err; } -- cgit v1.2.1