From 5caff20216f47fc10540f7de14548cc80cd1c369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Fri, 22 Oct 2021 11:59:44 +0300 Subject: MDEV-26883 InnoDB hang due to table lock conflict In a stress test campaign of a 10.6-based branch by Matthias Leich, a deadlock between two InnoDB threads occurred, involving lock_sys.wait_mutex and a dict_table_t::lock_mutex. The cause of the hang is a latching order violation in lock_sys_t::cancel(). That function and the latching order violation were originally introduced in commit 8d16da14873d880b9b5121de1619b7cb5e0f7135 (MDEV-24789). lock_sys_t::cancel(): Invoke table->lock_mutex_trylock() in order to avoid a deadlock. If that fails, release lock_sys.wait_mutex, and acquire both latches. In that way, we will be obeying the latching order and no hangs will occur. This hang should mostly affect DDL operations. DML operations will acquire only IX or IS table locks, which are compatible with each other. --- storage/innobase/lock/lock0lock.cc | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 86c44d2e52f..33c827235be 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -5619,10 +5619,25 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock, bool check_victim) { resolve_table_lock: dict_table_t *table= lock->un_member.tab_lock.table; - table->lock_mutex_lock(); + if (!table->lock_mutex_trylock()) + { + /* The correct latching order is: + lock_sys.latch, table->lock_mutex_lock(), lock_sys.wait_mutex. + Thus, we must release lock_sys.wait_mutex for a blocking wait. */ + mysql_mutex_unlock(&lock_sys.wait_mutex); + table->lock_mutex_lock(); + mysql_mutex_lock(&lock_sys.wait_mutex); + lock= trx->lock.wait_lock; + if (!lock) + goto retreat; + else if (check_victim && trx->lock.was_chosen_as_deadlock_victim) + { + err= DB_DEADLOCK; + goto retreat; + } + } if (lock->is_waiting()) lock_cancel_waiting_and_release(lock); - table->lock_mutex_unlock(); /* Even if lock->is_waiting() did not hold above, we must return DB_LOCK_WAIT, or otherwise optimistic parallel replication could occasionally hang. Potentially affected tests: @@ -5630,6 +5645,8 @@ resolve_table_lock: rpl.rpl_parallel_optimistic_nobinlog rpl.rpl_parallel_optimistic_xa_lsu_off */ err= DB_LOCK_WAIT; +retreat: + table->lock_mutex_unlock(); } lock_sys.rd_unlock(); } -- cgit v1.2.1