diff options
Diffstat (limited to 'sql/slave.cc')
-rw-r--r-- | sql/slave.cc | 218 |
1 files changed, 173 insertions, 45 deletions
diff --git a/sql/slave.cc b/sql/slave.cc index 81be7064f89..3b64e23ece5 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -38,6 +38,7 @@ #include <my_dir.h> #include <sql_common.h> #include <errmsg.h> +#include <mysqld_error.h> #include <mysys_err.h> #ifdef HAVE_REPLICATION @@ -511,7 +512,7 @@ terminate_slave_thread(THD *thd, int error; DBUG_PRINT("loop", ("killing slave thread")); - pthread_mutex_lock(&thd->LOCK_delete); + pthread_mutex_lock(&thd->LOCK_thd_data); #ifndef DONT_USE_THR_ALARM /* Error codes from pthread_kill are: @@ -522,7 +523,7 @@ terminate_slave_thread(THD *thd, DBUG_ASSERT(err != EINVAL); #endif thd->awake(THD::NOT_KILLED); - pthread_mutex_unlock(&thd->LOCK_delete); + pthread_mutex_unlock(&thd->LOCK_thd_data); /* There is a small chance that slave thread might miss the first @@ -859,6 +860,29 @@ int init_intvar_from_file(int* var, IO_CACHE* f, int default_val) DBUG_RETURN(1); } + +/* + Check if the error is caused by network. + @param[in] errorno Number of the error. + RETURNS: + TRUE network error + FALSE not network error +*/ + +bool is_network_error(uint errorno) +{ + if (errorno == CR_CONNECTION_ERROR || + errorno == CR_CONN_HOST_ERROR || + errorno == CR_SERVER_GONE_ERROR || + errorno == CR_SERVER_LOST || + errorno == ER_CON_COUNT_ERROR || + errorno == ER_SERVER_SHUTDOWN) + return TRUE; + + return FALSE; +} + + /* Note that we rely on the master's version (3.23, 4.0.14 etc) instead of relying on the binlog's version. This is not perfect: imagine an upgrade @@ -871,6 +895,7 @@ int init_intvar_from_file(int* var, IO_CACHE* f, int default_val) RETURNS 0 ok 1 error + 2 transient network problem, the caller should try to reconnect */ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) @@ -956,6 +981,8 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) unavailable (very old master not supporting UNIX_TIMESTAMP()?). */ + DBUG_SYNC_POINT("debug_lock.before_get_UNIX_TIMESTAMP", 10); + master_res= NULL; if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT UNIX_TIMESTAMP()")) && (master_res= mysql_store_result(mysql)) && (master_row= mysql_fetch_row(master_res))) @@ -963,7 +990,13 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) mi->clock_diff_with_master= (long) (time((time_t*) 0) - strtoul(master_row[0], 0, 10)); } - else if (!check_io_slave_killed(mi->io_thd, mi, NULL)) + else if (is_network_error(mysql_errno(mysql))) + { + mi->report(WARNING_LEVEL, mysql_errno(mysql), + "Get master clock failed with error: %s", mysql_error(mysql)); + goto network_err; + } + else { mi->clock_diff_with_master= 0; /* The "most sensible" value */ sql_print_warning("\"SELECT UNIX_TIMESTAMP()\" failed on master, " @@ -972,7 +1005,10 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) mysql_error(mysql), mysql_errno(mysql)); } if (master_res) + { mysql_free_result(master_res); + master_res= NULL; + } /* Check that the master's server id and ours are different. Because if they @@ -984,12 +1020,15 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi) Note: we could have put a @@SERVER_ID in the previous SELECT UNIX_TIMESTAMP() instead, but this would not have worked on 3.23 masters. */ + DBUG_SYNC_POINT("debug_lock.before_get_SERVER_ID", 10); + master_res= NULL; + master_row= NULL; if (!mysql_real_query(mysql, STRING_WITH_LEN("SHOW VARIABLES LIKE 'SERVER_ID'")) && - (master_res= mysql_store_result(mysql))) + (master_res= mysql_store_result(mysql)) && + (master_row= mysql_fetch_row(master_res))) { - if ((master_row= mysql_fetch_row(master_res)) && - (::server_id == strtoul(master_row[1], 0, 10)) && + if ((::server_id == strtoul(master_row[1], 0, 10)) && !mi->rli.replicate_same_server_id) { errmsg= "The slave I/O thread stops because master and slave have equal \ @@ -998,10 +1037,34 @@ the --replicate-same-server-id option must be used on slave but this does \ not always make sense; please check the manual before using it)."; err_code= ER_SLAVE_FATAL_ERROR; sprintf(err_buff, ER(err_code), errmsg); + goto err; } + } + else if (mysql_errno(mysql)) + { + if (is_network_error(mysql_errno(mysql))) + { + mi->report(WARNING_LEVEL, mysql_errno(mysql), + "Get master SERVER_ID failed with error: %s", mysql_error(mysql)); + goto network_err; + } + /* Fatal error */ + errmsg= "The slave I/O thread stops because a fatal error is encountered \ +when it try to get the value of SERVER_ID variable from master."; + err_code= mysql_errno(mysql); + sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql)); + goto err; + } + else if (!master_row && master_res) + { + mi->report(WARNING_LEVEL, ER_UNKNOWN_SYSTEM_VARIABLE, + "Unknown system variable 'SERVER_ID' on master, \ +maybe it is a *VERY OLD MASTER*."); + } + if (master_res) + { mysql_free_result(master_res); - if (errmsg) - goto err; + master_res= NULL; } /* @@ -1025,23 +1088,50 @@ not always make sense; please check the manual before using it)."; if (*mysql->server_version == '3') goto err; - if ((*mysql->server_version == '4') && - !mysql_real_query(mysql, - STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) && - (master_res= mysql_store_result(mysql))) + if (*mysql->server_version == '4') { - if ((master_row= mysql_fetch_row(master_res)) && - strcmp(master_row[0], global_system_variables.collation_server->name)) + master_res= NULL; + if (!mysql_real_query(mysql, + STRING_WITH_LEN("SELECT @@GLOBAL.COLLATION_SERVER")) && + (master_res= mysql_store_result(mysql)) && + (master_row= mysql_fetch_row(master_res))) { - errmsg= "The slave I/O thread stops because master and slave have \ + if (strcmp(master_row[0], global_system_variables.collation_server->name)) + { + errmsg= "The slave I/O thread stops because master and slave have \ different values for the COLLATION_SERVER global variable. The values must \ -be equal for replication to work"; - err_code= ER_SLAVE_FATAL_ERROR; - sprintf(err_buff, ER(err_code), errmsg); +be equal for the Statement-format replication to work"; + err_code= ER_SLAVE_FATAL_ERROR; + sprintf(err_buff, ER(err_code), errmsg); + goto err; + } } - mysql_free_result(master_res); - if (errmsg) + else if (is_network_error(mysql_errno(mysql))) + { + mi->report(WARNING_LEVEL, mysql_errno(mysql), + "Get master COLLATION_SERVER failed with error: %s", mysql_error(mysql)); + goto network_err; + } + else if (mysql_errno(mysql) != ER_UNKNOWN_SYSTEM_VARIABLE) + { + /* Fatal error */ + errmsg= "The slave I/O thread stops because a fatal error is encountered \ +when it try to get the value of COLLATION_SERVER global variable from master."; + err_code= mysql_errno(mysql); + sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql)); goto err; + } + else + mi->report(WARNING_LEVEL, ER_UNKNOWN_SYSTEM_VARIABLE, + "Unknown system variable 'COLLATION_SERVER' on master, \ +maybe it is a *VERY OLD MASTER*. *NOTE*: slave may experience \ +inconsistency if replicated data deals with collation."); + + if (master_res) + { + mysql_free_result(master_res); + master_res= NULL; + } } /* @@ -1059,35 +1149,62 @@ be equal for replication to work"; This check is only necessary for 4.x masters (and < 5.0.4 masters but those were alpha). */ - if ((*mysql->server_version == '4') && - !mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) && - (master_res= mysql_store_result(mysql))) + if (*mysql->server_version == '4') { - if ((master_row= mysql_fetch_row(master_res)) && - strcmp(master_row[0], - global_system_variables.time_zone->get_name()->ptr())) + master_res= NULL; + if (!mysql_real_query(mysql, STRING_WITH_LEN("SELECT @@GLOBAL.TIME_ZONE")) && + (master_res= mysql_store_result(mysql)) && + (master_row= mysql_fetch_row(master_res))) { - errmsg= "The slave I/O thread stops because master and slave have \ + if (strcmp(master_row[0], + global_system_variables.time_zone->get_name()->ptr())) + { + errmsg= "The slave I/O thread stops because master and slave have \ different values for the TIME_ZONE global variable. The values must \ -be equal for replication to work"; - err_code= ER_SLAVE_FATAL_ERROR; - sprintf(err_buff, ER(err_code), errmsg); +be equal for the Statement-format replication to work"; + err_code= ER_SLAVE_FATAL_ERROR; + sprintf(err_buff, ER(err_code), errmsg); + goto err; + } } - mysql_free_result(master_res); - - if (errmsg) + else if (is_network_error(mysql_errno(mysql))) + { + mi->report(WARNING_LEVEL, mysql_errno(mysql), + "Get master TIME_ZONE failed with error: %s", mysql_error(mysql)); + goto network_err; + } + else + { + /* Fatal error */ + errmsg= "The slave I/O thread stops because a fatal error is encountered \ +when it try to get the value of TIME_ZONE global variable from master."; + err_code= mysql_errno(mysql); + sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql)); goto err; + } + if (master_res) + { + mysql_free_result(master_res); + master_res= NULL; + } } err: if (errmsg) { + if (master_res) + mysql_free_result(master_res); DBUG_ASSERT(err_code != 0); mi->report(ERROR_LEVEL, err_code, err_buff); DBUG_RETURN(1); } DBUG_RETURN(0); + +network_err: + if (master_res) + mysql_free_result(master_res); + DBUG_RETURN(2); } /* @@ -1133,15 +1250,13 @@ static int create_table_from_dump(THD* thd, MYSQL *mysql, const char* db, DBUG_RETURN(1); } thd->command = COM_TABLE_DUMP; - thd->query_length= packet_len; - /* Note that we should not set thd->query until the area is initalized */ if (!(query = thd->strmake((char*) net->read_pos, packet_len))) { sql_print_error("create_table_from_dump: out of memory"); my_message(ER_GET_ERRNO, "Out of memory", MYF(0)); DBUG_RETURN(1); } - thd->query= query; + thd->set_query(query, packet_len); thd->is_slave_error = 0; bzero((char*) &tables,sizeof(tables)); @@ -2374,6 +2489,7 @@ pthread_handler_t handle_slave_io(void *arg) char llbuff[22]; uint retry_count; bool suppress_warnings; + int ret; #ifndef DBUG_OFF uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0; #endif @@ -2454,8 +2570,23 @@ connected: mi->slave_running= MYSQL_SLAVE_RUN_CONNECT; thd->slave_net = &mysql->net; thd_proc_info(thd, "Checking master version"); - if (get_master_version_and_clock(mysql, mi)) + ret= get_master_version_and_clock(mysql, mi); + if (ret == 1) + /* Fatal error */ goto err; + + if (ret == 2) + { + if (check_io_slave_killed(mi->io_thd, mi, "Slave I/O thread killed" + "while calling get_master_version_and_clock(...)")) + goto err; + suppress_warnings= FALSE; + /* Try to reconnect because the error was caused by a transient network problem */ + if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings, + reconnect_messages[SLAVE_RECON_ACT_REG])) + goto err; + goto connected; + } if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1) { @@ -2622,10 +2753,8 @@ err: // print the current replication position sql_print_information("Slave I/O thread exiting, read up to log '%s', position %s", IO_RPL_LOG_NAME, llstr(mi->master_log_pos,llbuff)); - VOID(pthread_mutex_lock(&LOCK_thread_count)); - thd->query = thd->db = 0; // extra safety - thd->query_length= thd->db_length= 0; - VOID(pthread_mutex_unlock(&LOCK_thread_count)); + thd->set_query(NULL, 0); + thd->reset_db(NULL, 0); if (mysql) { /* @@ -2977,15 +3106,14 @@ the slave SQL thread with \"SLAVE START\". We stopped at log \ must "proactively" clear playgrounds: */ rli->cleanup_context(thd, 1); - VOID(pthread_mutex_lock(&LOCK_thread_count)); /* Some extra safety, which should not been needed (normally, event deletion should already have done these assignments (each event which sets these variables is supposed to set them to 0 before terminating)). */ - thd->query= thd->db= thd->catalog= 0; - thd->query_length= thd->db_length= 0; - VOID(pthread_mutex_unlock(&LOCK_thread_count)); + thd->catalog= 0; + thd->set_query(NULL, 0); + thd->reset_db(NULL, 0); thd_proc_info(thd, "Waiting for slave mutex on exit"); pthread_mutex_lock(&rli->run_lock); /* We need data_lock, at least to wake up any waiting master_pos_wait() */ |