From 5caff20216f47fc10540f7de14548cc80cd1c369 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= <marko.makela@mariadb.com>
Date: Fri, 22 Oct 2021 11:59:44 +0300
Subject: MDEV-26883 InnoDB hang due to table lock conflict

In a stress test campaign of a 10.6-based branch by Matthias Leich,
a deadlock between two InnoDB threads occurred, involving
lock_sys.wait_mutex and a dict_table_t::lock_mutex.

The cause of the hang is a latching order violation in
lock_sys_t::cancel(). That function and the latching order
violation were originally introduced in
commit 8d16da14873d880b9b5121de1619b7cb5e0f7135 (MDEV-24789).

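lock_sys_t::cancel(): Invoke table->lock_mutex_trylock() in order
to avoid a deadlock. If that fails, release lock_sys.wait_mutex and
then acquire table->lock_mutex followed by lock_sys.wait_mutex.
In that way, we will be obeying the latching order and no hangs
will occur. Because lock_sys.wait_mutex was released in between,
reload trx->lock.wait_lock and check the deadlock victim status
before cancelling the wait.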

This hang should mostly affect DDL operations. DML operations will
acquire only IX or IS table locks, which are compatible with each other.
---
 storage/innobase/lock/lock0lock.cc | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 86c44d2e52f..33c827235be 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -5619,10 +5619,25 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock, bool check_victim)
     {
 resolve_table_lock:
       dict_table_t *table= lock->un_member.tab_lock.table;
-      table->lock_mutex_lock();
+      if (!table->lock_mutex_trylock())
+      {
+        /* The correct latching order is:
+        lock_sys.latch, table->lock_mutex_lock(), lock_sys.wait_mutex.
+        Thus, we must release lock_sys.wait_mutex for a blocking wait. */
+        mysql_mutex_unlock(&lock_sys.wait_mutex);
+        table->lock_mutex_lock();
+        mysql_mutex_lock(&lock_sys.wait_mutex);
+        lock= trx->lock.wait_lock;
+        if (!lock)
+          goto retreat;
+        else if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+        {
+          err= DB_DEADLOCK;
+          goto retreat;
+        }
+      }
       if (lock->is_waiting())
         lock_cancel_waiting_and_release(lock);
-      table->lock_mutex_unlock();
       /* Even if lock->is_waiting() did not hold above, we must return
       DB_LOCK_WAIT, or otherwise optimistic parallel replication could
       occasionally hang. Potentially affected tests:
@@ -5630,6 +5645,8 @@ resolve_table_lock:
       rpl.rpl_parallel_optimistic_nobinlog
       rpl.rpl_parallel_optimistic_xa_lsu_off */
       err= DB_LOCK_WAIT;
+retreat:
+      table->lock_mutex_unlock();
     }
     lock_sys.rd_unlock();
   }
-- 
cgit v1.2.1