1 files changed, 168 insertions, 103 deletions
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 81525680a33..e733a6a1d03 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2020, MariaDB Corporation.
+Copyright (c) 2014, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -637,56 +637,82 @@ lock_rec_get_insert_intention(
 	return(lock->type_mode & LOCK_INSERT_INTENTION);
 }
 
+#ifdef UNIV_DEBUG
 #ifdef WITH_WSREP
-/** Check if both conflicting lock and other record lock are brute force
-(BF). This case is a bug so report lock information and wsrep state.
-@param[in]	lock_rec1	conflicting waiting record lock or NULL
-@param[in]	lock_rec2	other waiting record lock
-@param[in]	trx1		lock_rec1 can be NULL, trx
+/** Check whether both the transaction owning the conflicting lock and
+the transaction requesting the record lock are brute force (BF). If so,
+check whether this BF-BF wait is correct; if not, report it and assert.
+
+@param[in]	lock		other waiting record lock
+@param[in]	trx		trx requesting conflicting record lock
 */
-static void wsrep_assert_no_bf_bf_wait(
-	const lock_t* lock_rec1,
-	const lock_t* lock_rec2,
-	const trx_t* trx1)
+static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx)
 {
-	ut_ad(!lock_rec1 || lock_get_type_low(lock_rec1) == LOCK_REC);
-	ut_ad(lock_get_type_low(lock_rec2) == LOCK_REC);
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+	ut_ad(lock_mutex_own());
+	trx_t* lock_trx= lock->trx;
 
-	if (!trx1->is_wsrep() || !lock_rec2->trx->is_wsrep())
-		return;
-	if (UNIV_LIKELY(!wsrep_thd_is_BF(trx1->mysql_thd, FALSE)))
+	/* Note that we are holding lock_sys->mutex, thus we should
+	not acquire THD::LOCK_thd_data mutex below to avoid a mutex
+	ordering violation. */
+
+	if (!trx->is_wsrep() || !lock_trx->is_wsrep())
 		return;
-	if (UNIV_LIKELY(!wsrep_thd_is_BF(lock_rec2->trx->mysql_thd, FALSE)))
+	if (UNIV_LIKELY(!wsrep_thd_is_BF(trx->mysql_thd, FALSE))
+	    || UNIV_LIKELY(!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE)))
 		return;
 
-	mtr_t mtr;
+	ut_ad(trx->state == TRX_STATE_ACTIVE);
+
+	trx_mutex_enter(lock_trx);
+	const trx_state_t trx2_state= lock_trx->state;
+	trx_mutex_exit(lock_trx);
+
+	/* If the transaction is already committed in memory or
+	prepared, we should wait. A transaction is committed in memory
+	while holding the trx mutex but not lock_sys->mutex. Therefore,
+	we could end up here before the transaction has had time to call
+	lock_release(), which is protected by lock_sys->mutex. */
+	switch (trx2_state) {
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+	case TRX_STATE_PREPARED:
+		return;
+	case TRX_STATE_ACTIVE:
+		break;
+	default:
+		ut_ad("invalid state" == 0);
+	}
 
-	if (lock_rec1) {
-		ib::error() << "Waiting lock on table: "
-			    << lock_rec1->index->table->name
-			    << " index: "
-			    << lock_rec1->index->name()
-			    << " that has conflicting lock ";
-		lock_rec_print(stderr, lock_rec1, mtr);
+	/* If the BF-BF order is honored, i.e. the trx already holding
+	the record lock is ordered before this new lock request,
+	we can keep trx waiting for the lock. If the conflicting
+	transaction is already aborting or rolling back for replay,
+	we can also let the new transaction wait. */
+	if (wsrep_thd_order_before(lock_trx->mysql_thd, trx->mysql_thd)
+	    || wsrep_thd_is_aborting(lock_trx->mysql_thd)) {
+		return;
 	}
 
+	mtr_t mtr;
+
 	ib::error() << "Conflicting lock on table: "
-		    << lock_rec2->index->table->name
+		    << lock->index->table->name
 		    << " index: "
-		    << lock_rec2->index->name()
+		    << lock->index->name()
 		    << " that has lock ";
-	lock_rec_print(stderr, lock_rec2, mtr);
+	lock_rec_print(stderr, lock, mtr);
 
 	ib::error() << "WSREP state: ";
 
-	wsrep_report_bf_lock_wait(trx1->mysql_thd,
-				  trx1->id);
-	wsrep_report_bf_lock_wait(lock_rec2->trx->mysql_thd,
-				  lock_rec2->trx->id);
+	wsrep_report_bf_lock_wait(trx->mysql_thd,
+				  trx->id);
+	wsrep_report_bf_lock_wait(lock_trx->mysql_thd,
+				  lock_trx->id);
 	/* BF-BF wait is a bug */
 	ut_error;
 }
 #endif /* WITH_WSREP */
+#endif /* UNIV_DEBUG */
 
 /*********************************************************************//**
 Checks if a lock request for a new lock has to wait for request lock2.
@@ -714,6 +740,7 @@ lock_rec_has_to_wait(
 {
 	ut_ad(trx && lock2);
 	ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+	ut_ad(lock_mutex_own());
 
 	if (trx == lock2->trx
 	    || lock_mode_compatible(
@@ -794,9 +821,25 @@ lock_rec_has_to_wait(
 	}
 
 #ifdef WITH_WSREP
-	/* There should not be two conflicting locks that are
-	brute force. If there is it is a bug. */
-	wsrep_assert_no_bf_bf_wait(NULL, lock2, trx);
+		/* The new lock request comes from a transaction that is
+		using a unique key scan and is a wsrep high priority
+		transaction (brute force). If the conflicting transaction is
+		also a wsrep high priority transaction, we should avoid the
+		lock conflict because the ordering of these transactions is
+		already decided and the conflicting transaction will be
+		replayed later. Note that the thread holding the conflicting
+		lock can't be committed or rolled back while we hold
+		lock_sys->mutex. */
+		if (trx->is_wsrep_UK_scan()
+		    && wsrep_thd_is_BF(lock2->trx->mysql_thd, false)) {
+			return false;
+		}
+
+		/* We can very well let BF wait normally, as the other
+		BF will be replayed in case of conflict. For debug
+		builds we do additional sanity checks to catch
+		an unsupported BF wait, if any. */
+		ut_d(wsrep_assert_no_bf_bf_wait(lock2, trx));
 #endif /* WITH_WSREP */
 
 	return true;
@@ -1065,65 +1108,31 @@ lock_rec_other_has_expl_req(
 #endif /* UNIV_DEBUG */
 
 #ifdef WITH_WSREP
-static
-void
-wsrep_kill_victim(
-/*==============*/
-	const trx_t * const trx,
-	const lock_t *lock)
+static void wsrep_kill_victim(const trx_t * const trx, const lock_t *lock)
 {
 	ut_ad(lock_mutex_own());
-	ut_ad(trx_mutex_own(lock->trx));
+	ut_ad(trx->is_wsrep());
+	trx_t* lock_trx = lock->trx;
+	ut_ad(trx_mutex_own(lock_trx));
+	ut_ad(lock_trx != trx);
 
-	/* quit for native mysql */
-	if (!trx->is_wsrep()) return;
-
-	if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+	if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE))
 		return;
-	}
 
-	my_bool bf_other = wsrep_thd_is_BF(lock->trx->mysql_thd, FALSE);
-	mtr_t mtr;
+	if (lock_trx->state == TRX_STATE_COMMITTED_IN_MEMORY
+	    || lock_trx->lock.was_chosen_as_deadlock_victim)
+              return;
 
-	if ((!bf_other) ||
-		(wsrep_thd_order_before(
-			trx->mysql_thd, lock->trx->mysql_thd))) {
-
-		if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
-			if (UNIV_UNLIKELY(wsrep_debug)) {
-				ib::info() << "WSREP: BF victim waiting\n";
-			}
+	if (!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE)
+	    || wsrep_thd_order_before(trx->mysql_thd, lock_trx->mysql_thd)) {
+		if (lock_trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+			if (UNIV_UNLIKELY(wsrep_debug))
+				WSREP_INFO("BF victim waiting");
 			/* cannot release lock, until our lock
 			is in the queue*/
-		} else if (lock->trx != trx) {
-			if (wsrep_log_conflicts) {
-				ib::info() << "*** Priority TRANSACTION:";
-
-				trx_print_latched(stderr, trx, 3000);
-
-				if (bf_other) {
-					ib::info() << "*** Priority TRANSACTION:";
-				} else {
-					ib::info() << "*** Victim TRANSACTION:";
-				}
-                                trx_print_latched(stderr, lock->trx, 3000);
-
-				ib::info() << "*** WAITING FOR THIS LOCK TO BE GRANTED:";
-
-				if (lock_get_type(lock) == LOCK_REC) {
-					lock_rec_print(stderr, lock, mtr);
-				} else {
-					lock_table_print(stderr, lock);
-				}
-
-				ib::info() << " SQL1: "
-					   << wsrep_thd_query(trx->mysql_thd);
-				ib::info() << " SQL2: "
-					   << wsrep_thd_query(lock->trx->mysql_thd);
-			}
-
+		} else {
 			wsrep_innobase_kill_one_trx(trx->mysql_thd,
-						    lock->trx, true);
+						    lock_trx, true);
 		}
 	}
 }
@@ -1454,11 +1463,6 @@ lock_rec_create_low(
 
 			trx_mutex_exit(c_lock->trx);
 
-			if (UNIV_UNLIKELY(wsrep_debug)) {
-				wsrep_report_bf_lock_wait(trx->mysql_thd, trx->id);
-				wsrep_report_bf_lock_wait(c_lock->trx->mysql_thd, c_lock->trx->id);
-			}
-
 			/* have to bail out here to avoid lock_set_lock... */
 			return(lock);
 		}
@@ -2222,10 +2226,6 @@ static void lock_rec_dequeue_from_page(lock_t* in_lock)
 				/* Grant the lock */
 				ut_ad(lock->trx != in_lock->trx);
 				lock_grant(lock);
-#ifdef WITH_WSREP
-			} else {
-				wsrep_assert_no_bf_bf_wait(c, lock, c->trx);
-#endif /* WITH_WSREP */
 			}
 		}
 	} else {
@@ -4178,10 +4178,6 @@ released:
 				/* Grant the lock */
 				ut_ad(trx != lock->trx);
 				lock_grant(lock);
-#ifdef WITH_WSREP
-			} else {
-				wsrep_assert_no_bf_bf_wait(c, lock, c->trx);
-#endif /* WITH_WSREP */
 			}
 		}
 	} else {
@@ -4237,6 +4233,18 @@ lock_check_dict_lock(
 and release possible other transactions waiting because of these locks. */
 void lock_release(trx_t* trx)
 {
+#ifdef UNIV_DEBUG
+	std::set<table_id_t> to_evict;
+	if (innodb_evict_tables_on_commit_debug && !trx->is_recovered)
+# if 1 /* if dict_stats_exec_sql() were not playing dirty tricks */
+	if (!mutex_own(&dict_sys.mutex))
+# else /* this would be more proper way to do it */
+	if (!trx->dict_operation_lock_mode && !trx->dict_operation)
+# endif
+	for (const auto& p : trx->mod_tables)
+		if (!p.first->is_temporary())
+			to_evict.emplace(p.first->id);
+#endif
 	ulint		count = 0;
 	trx_id_t	max_trx_id = trx_sys.get_max_trx_id();
 
@@ -4285,6 +4293,25 @@ void lock_release(trx_t* trx)
 	}
 
 	lock_mutex_exit();
+
+#ifdef UNIV_DEBUG
+	if (to_evict.empty()) {
+		return;
+	}
+	mutex_enter(&dict_sys.mutex);
+	lock_mutex_enter();
+	for (table_id_t id : to_evict) {
+		if (dict_table_t *table = dict_table_open_on_id(
+			    id, TRUE, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)) {
+			if (!table->get_ref_count()
+			    && !UT_LIST_GET_LEN(table->locks)) {
+				dict_sys.remove(table, true);
+			}
+		}
+	}
+	lock_mutex_exit();
+	mutex_exit(&dict_sys.mutex);
+#endif
 }
 
 /* True if a lock mode is S or X */
@@ -4454,7 +4481,8 @@ static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
 			ut_ad(!page_rec_is_metadata(rec));
 
 			offsets = rec_get_offsets(
-				rec, lock->index, offsets, true,
+				rec, lock->index, offsets,
+				lock->index->n_core_fields,
 				ULINT_UNDEFINED, &heap);
 
 			putc(' ', file);
@@ -5000,8 +5028,8 @@ loop:
 			ut_ad(!lock_rec_get_nth_bit(lock, i)
 			      || page_rec_is_leaf(rec));
 			offsets = rec_get_offsets(rec, lock->index, offsets,
-						  true, ULINT_UNDEFINED,
-						  &heap);
+						  lock->index->n_core_fields,
+						  ULINT_UNDEFINED, &heap);
 
 			/* If this thread is holding the file space
 			latch (fil_space_t::latch), the following
@@ -5332,7 +5360,8 @@ lock_rec_insert_check_and_lock(
 		const rec_offs*	offsets;
 		rec_offs_init(offsets_);
 
-		offsets = rec_get_offsets(next_rec, index, offsets_, true,
+		offsets = rec_get_offsets(next_rec, index, offsets_,
+					  index->n_core_fields,
 					  ULINT_UNDEFINED, &heap);
 
 		ut_ad(lock_rec_queue_validate(
@@ -5640,6 +5669,19 @@ lock_sec_rec_modify_check_and_lock(
 
 	heap_no = page_rec_get_heap_no(rec);
 
+#ifdef WITH_WSREP
+	trx_t *trx= thr_get_trx(thr);
+	/* If the transaction scanning a unique secondary key is a wsrep
+	high priority thread (brute force), this scanning may involve
+	GAP-locking in the index. As this locking also happens when
+	applying replication events in high priority applier threads,
+	there is a probability of lock conflicts between two wsrep
+	high priority threads. To avoid this GAP-locking we mark here
+	that this transaction is using a unique key scan. */
+	if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
+		trx->wsrep_UK_scan= true;
+#endif /* WITH_WSREP */
+
 	/* Another transaction cannot have an implicit lock on the record,
 	because when we come here, we already have modified the clustered
 	index record, and this would not have been possible if another active
@@ -5648,6 +5690,10 @@ lock_sec_rec_modify_check_and_lock(
 	err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
 			    block, heap_no, index, thr);
 
+#ifdef WITH_WSREP
+	trx->wsrep_UK_scan= false;
+#endif /* WITH_WSREP */
+
 #ifdef UNIV_DEBUG
 	{
 		mem_heap_t*	heap		= NULL;
@@ -5655,7 +5701,8 @@ lock_sec_rec_modify_check_and_lock(
 		const rec_offs*	offsets;
 		rec_offs_init(offsets_);
 
-		offsets = rec_get_offsets(rec, index, offsets_, true,
+		offsets = rec_get_offsets(rec, index, offsets_,
+					  index->n_core_fields,
 					  ULINT_UNDEFINED, &heap);
 
 		ut_ad(lock_rec_queue_validate(
@@ -5739,9 +5786,26 @@ lock_sec_rec_read_check_and_lock(
 		return DB_SUCCESS;
 	}
 
+#ifdef WITH_WSREP
+	trx_t *trx= thr_get_trx(thr);
+	/* If the transaction scanning a unique secondary key is a wsrep
+	high priority thread (brute force), this scanning may involve
+	GAP-locking in the index. As this locking also happens when
+	applying replication events in high priority applier threads,
+	there is a probability of lock conflicts between two wsrep
+	high priority threads. To avoid this GAP-locking we mark here
+	that this transaction is using a unique key scan. */
+	if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
+		trx->wsrep_UK_scan= true;
+#endif /* WITH_WSREP */
+
 	err = lock_rec_lock(FALSE, ulint(mode) | gap_mode,
 			    block, heap_no, index, thr);
 
+#ifdef WITH_WSREP
+	trx->wsrep_UK_scan= false;
+#endif /* WITH_WSREP */
+
 	ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
 
 	return(err);
@@ -5850,7 +5914,7 @@ lock_clust_rec_read_check_and_lock_alt(
 	rec_offs_init(offsets_);
 
 	ut_ad(page_rec_is_leaf(rec));
-	offsets = rec_get_offsets(rec, index, offsets, true,
+	offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
 				  ULINT_UNDEFINED, &tmp_heap);
 	err = lock_clust_rec_read_check_and_lock(flags, block, rec, index,
 						 offsets, mode, gap_mode, thr);
@@ -6141,6 +6205,7 @@ lock_cancel_waiting_and_release(
 
 	ut_ad(lock_mutex_own());
 	ut_ad(trx_mutex_own(lock->trx));
+	ut_ad(lock->trx->state == TRX_STATE_ACTIVE);
 
 	lock->trx->lock.cancel = true;