From 31f09e36c183d026a28f42ddbb9be2229613a3ed Mon Sep 17 00:00:00 2001
From: Brandon Nesterenko <brandon.nesterenko@mariadb.com>
Date: Tue, 18 Apr 2023 13:22:43 -0600
Subject: MDEV-31038: Parallel Replication Breaks if XA PREPARE Fails Updating
 Slave GTID State

If a replica failed to update the GTID slave state when committing
an XA PREPARE, the replica would retry the transaction and get an
out-of-order GTID error. This is because the commit phase of an XA
PREPARE is bifurcated. That is, first, the prepare is handled by the
relevant storage engines. Then second, the GTID slave state is
updated as a separate autocommit transaction. If the second phase
fails, and the transaction is retried, then the same transaction is
attempted to be committed again, resulting in a GTID out-of-order
error.

This patch fixes this error by immediately stopping the slave and
reporting the appropriate error. That is, there was logic to bypass
the error when updating the GTID slave state table if the underlying
error is allowed for retry on a parallel slave. This patch adds a
parameter to disallow the error bypass, thereby forcing the error
state to still happen.

Reviewed By
============
Andrei Elkin <andrei.elkin@mariadb.com>
---
 sql/log_event_server.cc | 64 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 19 deletions(-)

(limited to 'sql/log_event_server.cc')

diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc
index c5fb637a000..a2b78bc241d 100644
--- a/sql/log_event_server.cc
+++ b/sql/log_event_server.cc
@@ -152,6 +152,30 @@ is_parallel_retry_error(rpl_group_info *rgi, int err)
   return has_temporary_error(rgi->thd);
 }
 
+/**
+  Accumulate a Diagnostics_area's errors and warnings into an output buffer
+
+    @param errbuf       The output buffer to write error messages
+    @param errbuf_size  The size of the output buffer
+    @param da           The Diagnostics_area to check for errors
+*/
+static void inline aggregate_da_errors(char *errbuf, size_t errbuf_size,
+                                       Diagnostics_area *da)
+{
+  const char *errbuf_end= errbuf + errbuf_size;
+  char *slider;
+  Diagnostics_area::Sql_condition_iterator it= da->sql_conditions();
+  const Sql_condition *err;
+  size_t len;
+  for (err= it++, slider= errbuf; err && slider < errbuf_end - 1;
+       slider += len, err= it++)
+  {
+    len= my_snprintf(slider, errbuf_end - slider,
+                     " %s, Error_code: %d;", err->get_message_text(),
+                     err->get_sql_errno());
+  }
+}
+
 
 /**
    Error reporting facility for Rows_log_event::do_apply_event
@@ -172,13 +196,8 @@ static void inline slave_rows_error_report(enum loglevel level, int ha_error,
                                            const char *log_name, my_off_t pos)
 {
   const char *handler_error= (ha_error ? HA_ERR(ha_error) : NULL);
-  char buff[MAX_SLAVE_ERRMSG], *slider;
-  const char *buff_end= buff + sizeof(buff);
-  size_t len;
-  Diagnostics_area::Sql_condition_iterator it=
-    thd->get_stmt_da()->sql_conditions();
+  char buff[MAX_SLAVE_ERRMSG];
   Relay_log_info const *rli= rgi->rli;
-  const Sql_condition *err;
   buff[0]= 0;
   int errcode= thd->is_error() ? thd->get_stmt_da()->sql_errno() : 0;
 
@@ -191,13 +210,7 @@ static void inline slave_rows_error_report(enum loglevel level, int ha_error,
   if (is_parallel_retry_error(rgi, errcode))
     return;
 
-  for (err= it++, slider= buff; err && slider < buff_end - 1;
-       slider += len, err= it++)
-  {
-    len= my_snprintf(slider, buff_end - slider,
-                     " %s, Error_code: %d;", err->get_message_text(),
-                     err->get_sql_errno());
-  }
+  aggregate_da_errors(buff, sizeof(buff), thd->get_stmt_da());
 
   if (ha_error != 0)
     rli->report(level, errcode, rgi->gtid_info(),
@@ -3893,7 +3906,8 @@ bool slave_execute_deferred_events(THD *thd)
 #if defined(HAVE_REPLICATION)
 
 int Xid_apply_log_event::do_record_gtid(THD *thd, rpl_group_info *rgi,
-                                        bool in_trans, void **out_hton)
+                                        bool in_trans, void **out_hton,
+                                        bool force_err)
 {
   int err= 0;
   Relay_log_info const *rli= rgi->rli;
@@ -3908,14 +3922,26 @@ int Xid_apply_log_event::do_record_gtid(THD *thd, rpl_group_info *rgi,
     int ec= thd->get_stmt_da()->sql_errno();
     /*
       Do not report an error if this is really a kill due to a deadlock.
-      In this case, the transaction will be re-tried instead.
+      In this case, the transaction will be re-tried instead. Unless force_err
+      is set, as in the case of XA PREPARE, as the GTID state is updated as a
+      separate transaction, and if that fails, we should not retry but exit in
+      error immediately.
     */
-    if (!is_parallel_retry_error(rgi, ec))
+    if (!is_parallel_retry_error(rgi, ec) || force_err)
+    {
+      char buff[MAX_SLAVE_ERRMSG];
+      buff[0]= 0;
+      aggregate_da_errors(buff, sizeof(buff), thd->get_stmt_da());
+
+      if (force_err)
+        thd->clear_error();
+
       rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE, rgi->gtid_info(),
                   "Error during XID COMMIT: failed to update GTID state in "
-                  "%s.%s: %d: %s",
+                  "%s.%s: %d: %s the event's master log %s, end_log_pos %llu",
                   "mysql", rpl_gtid_slave_state_table_name.str, ec,
-                  thd->get_stmt_da()->message());
+                  buff, RPL_LOG_NAME, log_pos);
+    }
     thd->is_slave_error= 1;
   }
 
@@ -3989,7 +4015,7 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi)
   {
     DBUG_ASSERT(!thd->transaction->xid_state.is_explicit_XA());
 
-    if ((err= do_record_gtid(thd, rgi, false, &hton)))
+    if ((err= do_record_gtid(thd, rgi, false, &hton, true)))
       return err;
   }
 
-- 
cgit v1.2.1