backporting fixes of bug@45940 to 5.1-rpl+2 to cover failures rpl_heartbeat_* as well.

mysql-test/suite/rpl/r/rpl_stop_middle_group.result: the new result file mysql-test/suite/rpl/t/rpl_stop_middle_group.test: renamed from rpl_row_stop_middle_update and added a regression test for bug#45940.
author: Andrei Elkin <aelkin@mysql.com> 2009-10-09 16:26:37 +0300
committer: Andrei Elkin <aelkin@mysql.com> 2009-10-09 16:26:37 +0300
commit: 6a541923c390a81a64a373fc91aafaea6f748b85 (patch)
tree: 754058f2682c151caea880f5435fd4fd1c0ce3ac /sql
parent: 66b869d63740a146277346ba7f7ab06264cd6527 (diff)
download: mariadb-git-6a541923c390a81a64a373fc91aafaea6f748b85.tar.gz
5 files changed, 132 insertions, 119 deletions
diff --git a/sql/log_event.cc b/sql/log_event.cc
index d8dd40c91de..e2f0029eb6e 100644
--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@@ -3272,6 +3272,21 @@ Default database: '%s'. Query: '%s'",
     */
   } /* End of if (db_ok(... */
 
+  {/**
+      The following failure injecion works in cooperation with tests 
+      setting @@global.debug= 'd,stop_slave_middle_group'.
+      The sql thread receives the killed status and will proceed 
+      to shutdown trying to finish incomplete events group.
+   */
+    DBUG_EXECUTE_IF("stop_slave_middle_group",
+                    if (strcmp("COMMIT", query) != 0 &&
+                        strcmp("BEGIN", query) != 0)
+                    {
+                      if (thd->transaction.all.modified_non_trans_table)
+                        const_cast<Relay_log_info*>(rli)->abort_slave= 1;
+                    };);
+  }
+
 end:
   /*
     Probably we have set thd->query, thd->db, thd->catalog to point to places
@@ -7475,8 +7490,16 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
           thd->transaction.stmt.modified_non_trans_table= TRUE;
     } // row processing loop
 
-    DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
-                    const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
+    {/**
+         The following failure injecion works in cooperation with tests 
+         setting @@global.debug= 'd,stop_slave_middle_group'.
+         The sql thread receives the killed status and will proceed 
+         to shutdown trying to finish incomplete events group.
+     */
+      DBUG_EXECUTE_IF("stop_slave_middle_group",
+                      if (thd->transaction.all.modified_non_trans_table)
+                        const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
+    }
 
     if ((error= do_after_row_operations(rli, error)) &&
         ignored_error_code(convert_handler_error(error, thd, table)))
@@ -7513,32 +7536,6 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
     const_cast<Relay_log_info*>(rli)->cleanup_context(thd, error);
     thd->is_slave_error= 1;
   }
-  /*
-    This code would ideally be placed in do_update_pos() instead, but
-    since we have no access to table there, we do the setting of
-    last_event_start_time here instead.
-  */
-  else if (table && (table->s->primary_key == MAX_KEY) &&
-           !cache_stmt && get_flags(STMT_END_F) == RLE_NO_FLAGS)
-  {
-    /*
-      ------------ Temporary fix until WL#2975 is implemented ---------
-
-      This event is not the last one (no STMT_END_F). If we stop now
-      (in case of terminate_slave_thread()), how will we restart? We
-      have to restart from Table_map_log_event, but as this table is
-      not transactional, the rows already inserted will still be
-      present, and idempotency is not guaranteed (no PK) so we risk
-      that repeating leads to double insert. So we desperately try to
-      continue, hope we'll eventually leave this buggy situation (by
-      executing the final Rows_log_event). If we are in a hopeless
-      wait (reached end of last relay log and nothing gets appended
-      there), we timeout after one minute, and notify DBA about the
-      problem.  When WL#2975 is implemented, just remove the member
-      Relay_log_info::last_event_start_time and all its occurrences.
-    */
-    const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
-  }
 
   DBUG_RETURN(error);
 }
diff --git a/sql/log_event_old.cc b/sql/log_event_old.cc
index 68cd2bf4673..a335210ce53 100644
--- a/sql/log_event_old.cc
+++ b/sql/log_event_old.cc
@@ -226,7 +226,7 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info
 
       row_start= row_end;
     }
-    DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
+    DBUG_EXECUTE_IF("stop_slave_middle_group",
                     const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
     error= do_after_row_operations(table, error);
     if (!ev->cache_stmt)
@@ -269,34 +269,6 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info
     DBUG_RETURN(error);
   }
 
-  /*
-    This code would ideally be placed in do_update_pos() instead, but
-    since we have no access to table there, we do the setting of
-    last_event_start_time here instead.
-  */
-  if (table && (table->s->primary_key == MAX_KEY) &&
-      !ev->cache_stmt && 
-      ev->get_flags(Old_rows_log_event::STMT_END_F) == Old_rows_log_event::RLE_NO_FLAGS)
-  {
-    /*
-      ------------ Temporary fix until WL#2975 is implemented ---------
-
-      This event is not the last one (no STMT_END_F). If we stop now
-      (in case of terminate_slave_thread()), how will we restart? We
-      have to restart from Table_map_log_event, but as this table is
-      not transactional, the rows already inserted will still be
-      present, and idempotency is not guaranteed (no PK) so we risk
-      that repeating leads to double insert. So we desperately try to
-      continue, hope we'll eventually leave this buggy situation (by
-      executing the final Old_rows_log_event). If we are in a hopeless
-      wait (reached end of last relay log and nothing gets appended
-      there), we timeout after one minute, and notify DBA about the
-      problem.  When WL#2975 is implemented, just remove the member
-      st_relay_log_info::last_event_start_time and all its occurences.
-    */
-    const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
-  }
-
   DBUG_RETURN(0);
 }
 #endif
@@ -1744,7 +1716,7 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli)
  
     } // row processing loop
 
-    DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
+    DBUG_EXECUTE_IF("stop_slave_middle_group",
                     const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
     error= do_after_row_operations(rli, error);
     if (!cache_stmt)
@@ -1787,33 +1759,6 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli)
     DBUG_RETURN(error);
   }
 
-  /*
-    This code would ideally be placed in do_update_pos() instead, but
-    since we have no access to table there, we do the setting of
-    last_event_start_time here instead.
-  */
-  if (table && (table->s->primary_key == MAX_KEY) &&
-      !cache_stmt && get_flags(STMT_END_F) == RLE_NO_FLAGS)
-  {
-    /*
-      ------------ Temporary fix until WL#2975 is implemented ---------
-
-      This event is not the last one (no STMT_END_F). If we stop now
-      (in case of terminate_slave_thread()), how will we restart? We
-      have to restart from Table_map_log_event, but as this table is
-      not transactional, the rows already inserted will still be
-      present, and idempotency is not guaranteed (no PK) so we risk
-      that repeating leads to double insert. So we desperately try to
-      continue, hope we'll eventually leave this buggy situation (by
-      executing the final Old_rows_log_event). If we are in a hopeless
-      wait (reached end of last relay log and nothing gets appended
-      there), we timeout after one minute, and notify DBA about the
-      problem.  When WL#2975 is implemented, just remove the member
-      Relay_log_info::last_event_start_time and all its occurrences.
-    */
-    const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
-  }
-
   DBUG_RETURN(0);
 }
 
diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc
index b3a1bbc31d2..ec3adc79ca2 100644
--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
@@ -1195,7 +1195,6 @@ void Relay_log_info::cleanup_context(THD *thd, bool error)
   */
   thd->options&= ~OPTION_NO_FOREIGN_KEY_CHECKS;
   thd->options&= ~OPTION_RELAXED_UNIQUE_CHECKS;
-  last_event_start_time= 0;
   DBUG_VOID_RETURN;
 }
 
diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h
index a5410dd0c79..fd36d18adae 100644
--- a/sql/rpl_rli.h
+++ b/sql/rpl_rli.h
@@ -349,12 +349,10 @@ public:
   void clear_tables_to_lock();
 
   /*
-    Used by row-based replication to detect that it should not stop at
-    this event, but give it a chance to send more events. The time
-    where the last event inside a group started is stored here. If the
-    variable is zero, we are not in a group (but may be in a
-    transaction).
-   */
+    Used to defer stopping the SQL thread to give it a chance
+    to finish up the current group of events.
+    The timestamp is set and reset in @c sql_slave_killed().
+  */
   time_t last_event_start_time;
 
   /**
diff --git a/sql/slave.cc b/sql/slave.cc
index 44993601e78..834ed8ed933 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -49,6 +49,10 @@
 #define FLAGSTR(V,F) ((V)&(F)?#F" ":"")
 
 #define MAX_SLAVE_RETRY_PAUSE 5
+/*
+  a parameter of sql_slave_killed() to defer the killed status
+*/
+#define SLAVE_WAIT_GROUP_DONE 60
 bool use_slave_mask = 0;
 MY_BITMAP slave_error_mask;
 char slave_skip_error_names[SHOW_VAR_FUNC_BUFF_SIZE];
@@ -790,46 +794,95 @@ static bool io_slave_killed(THD* thd, Master_info* mi)
   DBUG_RETURN(mi->abort_slave || abort_loop || thd->killed);
 }
 
+/**
+   The function analyzes a possible killed status and makes
+   a decision whether to accept it or not.
+   Normally upon accepting the sql thread goes to shutdown.
+   In the event of deffering decision @rli->last_event_start_time waiting
+   timer is set to force the killed status be accepted upon its expiration.
 
+   @param thd   pointer to a THD instance
+   @param rli   pointer to Relay_log_info instance
+
+   @return TRUE the killed status is recognized, FALSE a possible killed
+           status is deferred.
+*/
 static bool sql_slave_killed(THD* thd, Relay_log_info* rli)
 {
+  bool ret= FALSE;
   DBUG_ENTER("sql_slave_killed");
 
   DBUG_ASSERT(rli->sql_thd == thd);
   DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun
   if (abort_loop || thd->killed || rli->abort_slave)
   {
-    if (rli->abort_slave && rli->is_in_group() &&
-        thd->transaction.all.modified_non_trans_table)
-      DBUG_RETURN(0);
-    /*
-      If we are in an unsafe situation (stopping could corrupt replication),
-      we give one minute to the slave SQL thread of grace before really
-      terminating, in the hope that it will be able to read more events and
-      the unsafe situation will soon be left. Note that this one minute starts
-      from the last time anything happened in the slave SQL thread. So it's
-      really one minute of idleness, we don't timeout if the slave SQL thread
-      is actively working.
-    */
-    if (rli->last_event_start_time == 0)
-      DBUG_RETURN(1);
-    DBUG_PRINT("info", ("Slave SQL thread is in an unsafe situation, giving "
-                        "it some grace period"));
-    if (difftime(time(0), rli->last_event_start_time) > 60)
+    if (thd->transaction.all.modified_non_trans_table && rli->is_in_group())
     {
-      rli->report(ERROR_LEVEL, 0,
-                  "SQL thread had to stop in an unsafe situation, in "
-                  "the middle of applying updates to a "
-                  "non-transactional table without any primary key. "
-                  "There is a risk of duplicate updates when the slave "
-                  "SQL thread is restarted. Please check your tables' "
-                  "contents after restart.");
-      DBUG_RETURN(1);
+      char msg_stopped[]=
+        "... The slave SQL is stopped, leaving the current group "
+        "of events unfinished with a non-transaction table changed. "
+        "If the group consists solely of Row-based events, you can try "
+        "restarting the slave with --slave-exec-mode=IDEMPOTENT, which "
+        "ignores duplicate key, key not found, and similar errors (see "
+        "documentation for details).";
+
+      if (rli->abort_slave)
+      {
+        DBUG_PRINT("info", ("Slave SQL thread is being stopped in the middle of"
+                            " a group having updated a non-trans table, giving"
+                            " it some grace period"));
+
+        /*
+          Slave sql thread shutdown in face of unfinished group modified 
+          Non-trans table is handled via a timer. The slave may eventually
+          give out to complete the current group and in that case there
+          might be issues at consequent slave restart, see the error message.
+          WL#2975 offers a robust solution requiring to store the last exectuted
+          event's coordinates along with the group's coordianates
+          instead of waiting with @c last_event_start_time the timer.
+        */
+
+        if (rli->last_event_start_time == 0)
+          rli->last_event_start_time= my_time(0);
+        ret= difftime(my_time(0), rli->last_event_start_time) <=
+          SLAVE_WAIT_GROUP_DONE ? FALSE : TRUE;
+
+        DBUG_EXECUTE_IF("stop_slave_middle_group", 
+                        DBUG_EXECUTE_IF("incomplete_group_in_relay_log",
+                                        ret= TRUE;);); // time is over
+
+        if (ret == 0)
+        {
+          rli->report(WARNING_LEVEL, 0,
+                      "slave SQL thread is being stopped in the middle "
+                      "of applying of a group having updated a non-transaction "
+                      "table; waiting for the group completion ... ");
+        }
+        else
+        {
+          rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR,
+                      ER(ER_SLAVE_FATAL_ERROR), msg_stopped);
+        }
+      }
+      else
+      {
+        ret= TRUE;
+        rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR, ER(ER_SLAVE_FATAL_ERROR),
+                    msg_stopped);
+      }
+    }
+    else
+    {
+      ret= TRUE;
     }
   }
-  DBUG_RETURN(0);
+  if (ret)
+    rli->last_event_start_time= 0;
+  
+  DBUG_RETURN(ret);
 }
 
+
 /*
   skip_load_data_infile()
 
@@ -2525,6 +2578,27 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli)
       delete ev;
       DBUG_RETURN(1);
     }
+
+    { /**
+         The following failure injecion works in cooperation with tests 
+         setting @@global.debug= 'd,incomplete_group_in_relay_log'.
+         Xid or Commit events are not executed to force the slave sql
+         read hanging if the realy log does not have any more events.
+      */
+      DBUG_EXECUTE_IF("incomplete_group_in_relay_log",
+                      if ((ev->get_type_code() == XID_EVENT) ||
+                          ((ev->get_type_code() == QUERY_EVENT) &&
+                           strcmp("COMMIT", ((Query_log_event *) ev)->query) == 0))
+                      {
+                        DBUG_ASSERT(thd->transaction.all.modified_non_trans_table);
+                        rli->abort_slave= 1;
+                        pthread_mutex_unlock(&rli->data_lock);
+                        delete ev;
+                        rli->inc_event_relay_log_pos();
+                        DBUG_RETURN(0);
+                      };);
+    }
+
     exec_res= apply_event_and_update_pos(ev, thd, rli, TRUE);
 
     /*
@@ -3953,7 +4027,7 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
     goto skip_relay_logging;
   }
   break;
-    
+
   default:
     inc_pos= event_len;
     break;
author	Andrei Elkin <aelkin@mysql.com>	2009-10-09 16:26:37 +0300
committer	Andrei Elkin <aelkin@mysql.com>	2009-10-09 16:26:37 +0300
commit	6a541923c390a81a64a373fc91aafaea6f748b85 (patch)
tree	754058f2682c151caea880f5435fd4fd1c0ce3ac /sql
parent	66b869d63740a146277346ba7f7ab06264cd6527 (diff)
download	mariadb-git-6a541923c390a81a64a373fc91aafaea6f748b85.tar.gz