From b91e77cff3fb5fbb32ebb061ed342469b434c4e8 Mon Sep 17 00:00:00 2001 From: Sergei Golubchik Date: Fri, 12 Feb 2021 11:29:40 +0100 Subject: fix a 3-way deadlock in galera_sr.galera-features#56 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rarely (try --repeat 1000), the following happens: * from wsrep_bf_abort (when a thread is being killed), wsrep-lib starts streaming_rollback that wants to convert_streaming_client_to_applier. wsrep_create_streaming_applier creates a new THD(). All while the other THD is being killed, so under LOCK_thd_kill and LOCK_thd_data. In particular, THD::init() takes LOCK_global_system_variables under LOCK_thd_kill. * updating @@wsrep_slave_threads takes LOCK_global_system_variables and LOCK_wsrep_cluster_config (in that order) and invokes wsrep_slave_threads_update() that takes LOCK_wsrep_slave_threads * wsrep_replication_process() takes LOCK_wsrep_slave_threads and invokes wsrep_close_applier(), that does thd->set_killed() which takes LOCK_thd_kill. et voilĂ . As a fix I copied a workaround from wsrep_cluster_address_update() to wsrep_slave_threads_update(). It seems to be safe: without mutexes a race condition is possible and a concurrent SET might change wsrep_slave_threads, but wsrep_slave_threads_update() always verifies if there's a need to do something, so it will not run twice in this case, it'll be a no-op. --- sql/wsrep_var.cc | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'sql/wsrep_var.cc') diff --git a/sql/wsrep_var.cc b/sql/wsrep_var.cc index dea388d30de..239daadc4f6 100644 --- a/sql/wsrep_var.cc +++ b/sql/wsrep_var.cc @@ -674,7 +674,11 @@ static void wsrep_slave_count_change_update () bool wsrep_slave_threads_update (sys_var *self, THD* thd, enum_var_type type) { + mysql_mutex_unlock(&LOCK_wsrep_cluster_config); + mysql_mutex_unlock(&LOCK_global_system_variables); mysql_mutex_lock(&LOCK_wsrep_slave_threads); + mysql_mutex_lock(&LOCK_global_system_variables); + mysql_mutex_lock(&LOCK_wsrep_cluster_config); bool res= false; wsrep_slave_count_change_update(); -- cgit v1.2.1 From 26965387230a9b13fb716344477d108bb87dea98 Mon Sep 17 00:00:00 2001 From: Sergei Golubchik Date: Fri, 12 Feb 2021 17:31:25 +0100 Subject: updating @@wsrep_cluster_address deadlocks wsrep_cluster_address_update() causes LOCK_wsrep_slave_threads to be locked under LOCK_wsrep_cluster_config, while normally the order should be the opposite. Fix: don't protect @@wsrep_cluster_address value with the LOCK_wsrep_cluster_config, LOCK_global_system_variables is enough. Only protect wsrep reinitialization with the LOCK_wsrep_cluster_config. And make it use a local copy of the global @@wsrep_cluster_address. Also, introduce a helper function that checks whether wsrep_cluster_address is set and also asserts that it can be safely read by the caller. --- sql/wsrep_var.cc | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'sql/wsrep_var.cc') diff --git a/sql/wsrep_var.cc b/sql/wsrep_var.cc index 239daadc4f6..35f14532e85 100644 --- a/sql/wsrep_var.cc +++ b/sql/wsrep_var.cc @@ -554,30 +554,31 @@ bool wsrep_cluster_address_update (sys_var *self, THD* thd, enum_var_type type) /* stop replication is heavy operation, and includes closing all client connections. Closing clients may need to get LOCK_global_system_variables at least in MariaDB. - - Note: releasing LOCK_global_system_variables may cause race condition, if - there can be several concurrent clients changing wsrep_provider */ + char *tmp= my_strdup(wsrep_cluster_address, MYF(MY_WME)); WSREP_DEBUG("wsrep_cluster_address_update: %s", wsrep_cluster_address); mysql_mutex_unlock(&LOCK_global_system_variables); + + mysql_mutex_lock(&LOCK_wsrep_cluster_config); wsrep_stop_replication(thd); - if (wsrep_start_replication()) + if (*tmp && wsrep_start_replication(tmp)) { wsrep_create_rollbacker(); WSREP_DEBUG("Cluster address update creating %ld applier threads running %lu", wsrep_slave_threads, wsrep_running_applier_threads); wsrep_create_appliers(wsrep_slave_threads); } - /* locking order to be enforced is: - 1. LOCK_global_system_variables - 2. LOCK_wsrep_cluster_config - => have to juggle mutexes to comply with this - */ - mysql_mutex_unlock(&LOCK_wsrep_cluster_config); + mysql_mutex_lock(&LOCK_global_system_variables); - mysql_mutex_lock(&LOCK_wsrep_cluster_config); + if (strcmp(tmp, wsrep_cluster_address)) + { + my_free((void*)wsrep_cluster_address); + wsrep_cluster_address= tmp; + } + else + my_free(tmp); return false; } @@ -674,11 +675,12 @@ static void wsrep_slave_count_change_update () bool wsrep_slave_threads_update (sys_var *self, THD* thd, enum_var_type type) { - mysql_mutex_unlock(&LOCK_wsrep_cluster_config); + if (!wsrep_cluster_address_exists()) + return false; + mysql_mutex_unlock(&LOCK_global_system_variables); mysql_mutex_lock(&LOCK_wsrep_slave_threads); mysql_mutex_lock(&LOCK_global_system_variables); - mysql_mutex_lock(&LOCK_wsrep_cluster_config); bool res= false; wsrep_slave_count_change_update(); -- cgit v1.2.1