diff options
author | Monty <monty@mariadb.org> | 2017-01-29 22:10:56 +0200 |
---|---|---|
committer | Sergei Golubchik <serg@mariadb.org> | 2017-02-28 16:10:46 +0100 |
commit | e65f667bb60244610512efd7491fc77eccceb9db (patch) | |
tree | ff549849324d917615ab896afb7e25ee0bfc7396 /sql/rpl_mi.cc | |
parent | d5c54f3990b49d7c7d6a410016e85e8e58803895 (diff) | |
download | mariadb-git-e65f667bb60244610512efd7491fc77eccceb9db.tar.gz |
MDEV-9573 'Stop slave' hangs on replication slave
The reason for this is that stop slave takes LOCK_active_mi over the
whole operation while some slave operations will also need LOCK_active_mi
which causes deadlocks.
Fixed by introducing object counting for Master_info and not taking
LOCK_active_mi over stop slave or even stop_all_slaves()
Another benefit of this approach is that it allows:
- Multiple threads can run SHOW SLAVE STATUS at the same time
- START/STOP/RESET/SLAVE STATUS on a slave will not block other slaves
- Simpler interface for handling get_master_info()
- Added some missing unlock of 'log_lock' in error condtions
- Moved rpl_parallel_inactivate_pool(&global_rpl_thread_pool) to end
of stop_slave() to not have to use LOCK_active_mi inside
terminate_slave_threads()
- Changed argument for remove_master_info() to Master_info, as we always
have this available
- Fixed core dump when doing FLUSH TABLES WITH READ LOCK and parallel
replication. Problem was that waiting for pause_for_ftwrl was not done
when deleting rpt->current_owner after a force_abort.
Diffstat (limited to 'sql/rpl_mi.cc')
-rw-r--r-- | sql/rpl_mi.cc | 375 |
1 files changed, 275 insertions, 100 deletions
diff --git a/sql/rpl_mi.cc b/sql/rpl_mi.cc index 249bf7608e5..7aa84e72bef 100644 --- a/sql/rpl_mi.cc +++ b/sql/rpl_mi.cc @@ -40,7 +40,9 @@ Master_info::Master_info(LEX_STRING *connection_name_arg, sync_counter(0), heartbeat_period(0), received_heartbeats(0), master_id(0), prev_master_id(0), using_gtid(USE_GTID_NO), events_queued_since_last_gtid(0), - gtid_reconnect_event_skip_count(0), gtid_event_seen(false) + gtid_reconnect_event_skip_count(0), gtid_event_seen(false), + in_start_all_slaves(0), in_stop_all_slaves(0), + users(0), killed(0) { host[0] = 0; user[0] = 0; password[0] = 0; ssl_ca[0]= 0; ssl_capath[0]= 0; ssl_cert[0]= 0; @@ -87,8 +89,27 @@ Master_info::Master_info(LEX_STRING *connection_name_arg, mysql_cond_init(key_master_info_sleep_cond, &sleep_cond, NULL); } + +/** + Wait until no one is using Master_info +*/ + +void Master_info::wait_until_free() +{ + mysql_mutex_lock(&sleep_lock); + killed= 1; + while (users) + mysql_cond_wait(&sleep_cond, &sleep_lock); + mysql_mutex_unlock(&sleep_lock); +} + +/** + Delete master_info +*/ + Master_info::~Master_info() { + wait_until_free(); rpl_filters.delete_element(connection_name.str, connection_name.length, (void (*)(const char*, uchar*)) free_rpl_filter); my_free(connection_name.str); @@ -709,8 +730,13 @@ uchar *get_key_master_info(Master_info *mi, size_t *length, void free_key_master_info(Master_info *mi) { DBUG_ENTER("free_key_master_info"); + /* Ensure that we are not in reset_slave while this is done */ + lock_slave_threads(mi); terminate_slave_threads(mi,SLAVE_FORCE_ALL); + /* We use 2 here instead of 1 just to make it easier when debugging */ + mi->killed= 2; end_master_info(mi); + unlock_slave_threads(mi); delete mi; DBUG_VOID_RETURN; } @@ -867,9 +893,27 @@ Master_info_index::Master_info_index() index_file.file= -1; } + +/** + Free all connection threads + + This is done during early stages of shutdown + to give connection threads and slave threads time + to die before ~Master_info_index is called +*/ + +void Master_info_index::free_connections() +{ + my_hash_reset(&master_info_hash); +} + + +/** + Free all connection threads and free structures +*/ + Master_info_index::~Master_info_index() { - /* This will close connection for all objects in the cache */ my_hash_free(&master_info_hash); end_io_cache(&index_file); if (index_file.file >= 0) @@ -1001,7 +1045,6 @@ bool Master_info_index::init_all_master_info() if (master_info_index->add_master_info(mi, FALSE)) DBUG_RETURN(1); succ_num++; - unlock_slave_threads(mi); if (!opt_skip_slave_start) { @@ -1022,6 +1065,7 @@ bool Master_info_index::init_all_master_info() (int) connection_name.length, connection_name.str); } + unlock_slave_threads(mi); } } @@ -1073,6 +1117,71 @@ bool Master_info_index::write_master_name_to_index_file(LEX_STRING *name, /** + Get Master_info for a connection and lock the object from deletion + + @param + connection_name Connection name + warning WARN_LEVEL_NOTE -> Don't print anything + WARN_LEVEL_WARN -> Issue warning if not exists + WARN_LEVEL_ERROR-> Issue error if not exists +*/ + +Master_info *get_master_info(LEX_STRING *connection_name, + Sql_condition::enum_warning_level warning) +{ + Master_info *mi; + DBUG_ENTER("get_master_info"); + + /* Protect against inserts into hash */ + mysql_mutex_lock(&LOCK_active_mi); + /* + The following can only be true during shutdown when slave has been killed + but some other threads are still trying to access slave statistics. + */ + if (unlikely(!master_info_index)) + { + if (warning != Sql_condition::WARN_LEVEL_NOTE) + my_error(WARN_NO_MASTER_INFO, + MYF(warning == Sql_condition::WARN_LEVEL_WARN ? + ME_JUST_WARNING : 0), + (int) connection_name->length, connection_name->str); + mysql_mutex_unlock(&LOCK_active_mi); + DBUG_RETURN(0); + } + if ((mi= master_info_index->get_master_info(connection_name, warning))) + { + /* + We have to use sleep_lock here. If we would use LOCK_active_mi + then we would take locks in wrong order in Master_info::release() + */ + mysql_mutex_lock(&mi->sleep_lock); + mi->users++; + DBUG_PRINT("info",("users: %d", mi->users)); + mysql_mutex_unlock(&mi->sleep_lock); + } + mysql_mutex_unlock(&LOCK_active_mi); + DBUG_RETURN(mi); +} + + +/** + Release master info. + Signals ~Master_info that it's now safe to delete it +*/ + +void Master_info::release() +{ + mysql_mutex_lock(&sleep_lock); + if (!--users && killed) + { + /* Signal ~Master_info that it's ok to now free it */ + mysql_cond_signal(&sleep_cond); + } + mysql_mutex_unlock(&sleep_lock); +} + + +/** Get Master_info for a connection @param @@ -1094,8 +1203,6 @@ Master_info_index::get_master_info(LEX_STRING *connection_name, ("connection_name: '%.*s'", (int) connection_name->length, connection_name->str)); - mysql_mutex_assert_owner(&LOCK_active_mi); - /* Make name lower case for comparison */ res= strmake(buff, connection_name->str, connection_name->length); my_casedn_str(system_charset_info, buff); @@ -1187,105 +1294,123 @@ bool Master_info_index::add_master_info(Master_info *mi, bool write_to_file) atomic */ -bool Master_info_index::remove_master_info(LEX_STRING *name) +bool Master_info_index::remove_master_info(Master_info *mi) { - Master_info* mi; DBUG_ENTER("remove_master_info"); + mysql_mutex_assert_owner(&LOCK_active_mi); - if ((mi= get_master_info(name, Sql_condition::WARN_LEVEL_WARN))) + // Delete Master_info and rewrite others to file + if (!my_hash_delete(&master_info_hash, (uchar*) mi)) { - // Delete Master_info and rewrite others to file - if (!my_hash_delete(&master_info_hash, (uchar*) mi)) + File index_file_nr; + + // Close IO_CACHE and FILE handler fisrt + end_io_cache(&index_file); + my_close(index_file.file, MYF(MY_WME)); + + // Reopen File and truncate it + if ((index_file_nr= my_open(index_file_name, + O_RDWR | O_CREAT | O_TRUNC | O_BINARY , + MYF(MY_WME))) < 0 || + init_io_cache(&index_file, index_file_nr, + IO_SIZE, WRITE_CACHE, + my_seek(index_file_nr,0L,MY_SEEK_END,MYF(0)), + 0, MYF(MY_WME | MY_WAIT_IF_FULL))) { - File index_file_nr; - - // Close IO_CACHE and FILE handler fisrt - end_io_cache(&index_file); - my_close(index_file.file, MYF(MY_WME)); - - // Reopen File and truncate it - if ((index_file_nr= my_open(index_file_name, - O_RDWR | O_CREAT | O_TRUNC | O_BINARY , - MYF(MY_WME))) < 0 || - init_io_cache(&index_file, index_file_nr, - IO_SIZE, WRITE_CACHE, - my_seek(index_file_nr,0L,MY_SEEK_END,MYF(0)), - 0, MYF(MY_WME | MY_WAIT_IF_FULL))) - { - int error= my_errno; - if (index_file_nr >= 0) - my_close(index_file_nr,MYF(0)); - - sql_print_error("Create of Master Info Index file '%s' failed with " - "error: %M", - index_file_name, error); - DBUG_RETURN(TRUE); - } + int error= my_errno; + if (index_file_nr >= 0) + my_close(index_file_nr,MYF(0)); - // Rewrite Master_info.index - for (uint i= 0; i< master_info_hash.records; ++i) - { - Master_info *tmp_mi; - tmp_mi= (Master_info *) my_hash_element(&master_info_hash, i); - write_master_name_to_index_file(&tmp_mi->connection_name, 0); - } - my_sync(index_file_nr, MYF(MY_WME)); + sql_print_error("Create of Master Info Index file '%s' failed with " + "error: %M", + index_file_name, error); + DBUG_RETURN(TRUE); + } + + // Rewrite Master_info.index + for (uint i= 0; i< master_info_hash.records; ++i) + { + Master_info *tmp_mi; + tmp_mi= (Master_info *) my_hash_element(&master_info_hash, i); + write_master_name_to_index_file(&tmp_mi->connection_name, 0); } + if (my_sync(index_file_nr, MYF(MY_WME))) + DBUG_RETURN(TRUE); } DBUG_RETURN(FALSE); } /** - Master_info_index::give_error_if_slave_running() + give_error_if_slave_running() + + @param + already_locked 0 if we need to lock, 1 if we have LOCK_active_mi_locked @return TRUE If some slave is running. An error is printed FALSE No slave is running */ -bool Master_info_index::give_error_if_slave_running() +bool give_error_if_slave_running(bool already_locked) { + bool ret= 0; DBUG_ENTER("give_error_if_slave_running"); - mysql_mutex_assert_owner(&LOCK_active_mi); - for (uint i= 0; i< master_info_hash.records; ++i) + if (!already_locked) + mysql_mutex_lock(&LOCK_active_mi); + if (!master_info_index) { - Master_info *mi; - mi= (Master_info *) my_hash_element(&master_info_hash, i); - if (mi->rli.slave_running != MYSQL_SLAVE_NOT_RUN) + my_error(ER_SERVER_SHUTDOWN, MYF(0)); + ret= 1; + } + else + { + HASH *hash= &master_info_index->master_info_hash; + for (uint i= 0; i< hash->records; ++i) { - my_error(ER_SLAVE_MUST_STOP, MYF(0), (int) mi->connection_name.length, - mi->connection_name.str); - DBUG_RETURN(TRUE); + Master_info *mi; + mi= (Master_info *) my_hash_element(hash, i); + if (mi->rli.slave_running != MYSQL_SLAVE_NOT_RUN) + { + my_error(ER_SLAVE_MUST_STOP, MYF(0), (int) mi->connection_name.length, + mi->connection_name.str); + ret= 1; + break; + } } } - DBUG_RETURN(FALSE); + if (!already_locked) + mysql_mutex_unlock(&LOCK_active_mi); + DBUG_RETURN(ret); } /** - Master_info_index::any_slave_sql_running() - - The LOCK_active_mi must be held while calling this function. + any_slave_sql_running() @return 0 No Slave SQL thread is running # Number of slave SQL thread running */ -uint Master_info_index::any_slave_sql_running() +uint any_slave_sql_running() { uint count= 0; DBUG_ENTER("any_slave_sql_running"); - mysql_mutex_assert_owner(&LOCK_active_mi); - for (uint i= 0; i< master_info_hash.records; ++i) + mysql_mutex_lock(&LOCK_active_mi); + if (likely(master_info_index)) // Not shutdown { - Master_info *mi= (Master_info *)my_hash_element(&master_info_hash, i); - if (mi->rli.slave_running != MYSQL_SLAVE_NOT_RUN) - count++; + HASH *hash= &master_info_index->master_info_hash; + for (uint i= 0; i< hash->records; ++i) + { + Master_info *mi= (Master_info *)my_hash_element(hash, i); + if (mi->rli.slave_running != MYSQL_SLAVE_NOT_RUN) + count++; + } } + mysql_mutex_unlock(&LOCK_active_mi); DBUG_RETURN(count); } @@ -1298,15 +1423,25 @@ uint Master_info_index::any_slave_sql_running() @return TRUE Error FALSE Everything ok. + + This code is written so that we don't keep LOCK_active_mi active + while we are starting a slave. */ bool Master_info_index::start_all_slaves(THD *thd) { bool result= FALSE; - DBUG_ENTER("warn_if_slave_running"); + DBUG_ENTER("start_all_slaves"); mysql_mutex_assert_owner(&LOCK_active_mi); - for (uint i= 0; i< master_info_hash.records; ++i) + for (uint i= 0; i< master_info_hash.records; i++) + { + Master_info *mi; + mi= (Master_info *) my_hash_element(&master_info_hash, i); + mi->in_start_all_slaves= 0; + } + + for (uint i= 0; i< master_info_hash.records; ) { int error; Master_info *mi; @@ -1316,25 +1451,40 @@ bool Master_info_index::start_all_slaves(THD *thd) Try to start all slaves that are configured (host is defined) and are not already running */ - if ((mi->slave_running == MYSQL_SLAVE_NOT_RUN || - !mi->rli.slave_running) && *mi->host) + if (!((mi->slave_running == MYSQL_SLAVE_NOT_RUN || + !mi->rli.slave_running) && *mi->host) || + mi->in_start_all_slaves) { - if ((error= start_slave(thd, mi, 1))) - { - my_error(ER_CANT_START_STOP_SLAVE, MYF(0), - "START", - (int) mi->connection_name.length, - mi->connection_name.str); - result= 1; - if (error < 0) // fatal error - break; - } - else - push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, - ER_SLAVE_STARTED, ER(ER_SLAVE_STARTED), - (int) mi->connection_name.length, - mi->connection_name.str); + i++; + continue; + } + mi->in_start_all_slaves= 1; + + mysql_mutex_lock(&mi->sleep_lock); + mi->users++; // Mark used + mysql_mutex_unlock(&mi->sleep_lock); + mysql_mutex_unlock(&LOCK_active_mi); + error= start_slave(thd, mi, 1); + mi->release(); + mysql_mutex_lock(&LOCK_active_mi); + if (error) + { + my_error(ER_CANT_START_STOP_SLAVE, MYF(0), + "START", + (int) mi->connection_name.length, + mi->connection_name.str); + result= 1; + if (error < 0) // fatal error + break; } + else + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, + ER_SLAVE_STARTED, ER(ER_SLAVE_STARTED), + (int) mi->connection_name.length, + mi->connection_name.str); + /* Restart from first element as master_info_hash may have changed */ + i= 0; + continue; } DBUG_RETURN(result); } @@ -1348,38 +1498,63 @@ bool Master_info_index::start_all_slaves(THD *thd) @return TRUE Error FALSE Everything ok. + + This code is written so that we don't keep LOCK_active_mi active + while we are stopping a slave. */ bool Master_info_index::stop_all_slaves(THD *thd) { bool result= FALSE; - DBUG_ENTER("warn_if_slave_running"); + DBUG_ENTER("stop_all_slaves"); mysql_mutex_assert_owner(&LOCK_active_mi); - for (uint i= 0; i< master_info_hash.records; ++i) + for (uint i= 0; i< master_info_hash.records; i++) + { + Master_info *mi; + mi= (Master_info *) my_hash_element(&master_info_hash, i); + mi->in_stop_all_slaves= 0; + } + + for (uint i= 0; i< master_info_hash.records ;) { int error; Master_info *mi; mi= (Master_info *) my_hash_element(&master_info_hash, i); - if ((mi->slave_running != MYSQL_SLAVE_NOT_RUN || - mi->rli.slave_running)) + if (!(mi->slave_running != MYSQL_SLAVE_NOT_RUN || + mi->rli.slave_running) || + mi->in_stop_all_slaves) { - if ((error= stop_slave(thd, mi, 1))) - { - my_error(ER_CANT_START_STOP_SLAVE, MYF(0), - "STOP", - (int) mi->connection_name.length, - mi->connection_name.str); - result= 1; - if (error < 0) // Fatal error - break; - } - else - push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, - ER_SLAVE_STOPPED, ER(ER_SLAVE_STOPPED), - (int) mi->connection_name.length, - mi->connection_name.str); + i++; + continue; } + mi->in_stop_all_slaves= 1; // Protection for loops + + mysql_mutex_lock(&mi->sleep_lock); + mi->users++; // Mark used + mysql_mutex_unlock(&mi->sleep_lock); + mysql_mutex_unlock(&LOCK_active_mi); + error= stop_slave(thd, mi, 1); + mi->release(); + mysql_mutex_lock(&LOCK_active_mi); + if (error) + { + my_error(ER_CANT_START_STOP_SLAVE, MYF(0), + "STOP", + (int) mi->connection_name.length, + mi->connection_name.str); + result= 1; + if (error < 0) // Fatal error + break; + } + else + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, + ER_SLAVE_STOPPED, ER(ER_SLAVE_STOPPED), + (int) mi->connection_name.length, + mi->connection_name.str); + /* Restart from first element as master_info_hash may have changed */ + i= 0; + continue; } DBUG_RETURN(result); } |