Diffstat (limited to 'storage/innobase/buf/buf0buf.cc')
-rw-r--r-- | storage/innobase/buf/buf0buf.cc | 693
1 file changed, 321 insertions, 372 deletions
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 1e181872e87..fb911fc29f5 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -126,24 +126,15 @@ in the file along with the file page, resides in the control block. Buffer pool struct ------------------ -The buffer buf_pool contains a single mutex which protects all the +The buffer buf_pool contains several mutexes which protect all the control data structures of the buf_pool. The content of a buffer frame is protected by a separate read-write lock in its control block, though. -These locks can be locked and unlocked without owning the buf_pool.mutex. -The OS events in the buf_pool struct can be waited for without owning the -buf_pool.mutex. - -The buf_pool.mutex is a hot-spot in main memory, causing a lot of -memory bus traffic on multiprocessor systems when processors -alternately access the mutex. On our Pentium, the mutex is accessed -maybe every 10 microseconds. We gave up the solution to have mutexes -for each control block, for instance, because it seemed to be -complicated. - -A solution to reduce mutex contention of the buf_pool.mutex is to -create a separate mutex for the page hash table. On Pentium, -accessing the hash table takes 2 microseconds, about half -of the total buf_pool.mutex hold time. + +buf_pool.mutex protects the buf_pool.LRU list and buf_page_t::state; +buf_pool.free_list_mutex protects the free_list and withdraw list; +buf_pool.flush_state_mutex protects the flush state related data structures; +buf_pool.zip_free mutex protects the zip_free arrays; +buf_pool.zip_hash mutex protects the zip_hash hash and in_zip_hash flag. Control blocks -------------- @@ -158,16 +149,6 @@ The buffer frames have to be aligned so that the start memory address of a frame is divisible by the universal page size, which is a power of two. -We intend to make the buffer buf_pool size on-line reconfigurable, -that is, the buf_pool size can be changed without closing the database. -Then the database administarator may adjust it to be bigger -at night, for example. The control block array must -contain enough control blocks for the maximum buffer buf_pool size -which is used in the particular database. -If the buf_pool size is cut, we exploit the virtual memory mechanism of -the OS, and just refrain from using frames at high addresses. Then the OS -can swap them to disk. - The control blocks containing file pages are put to a hash table according to the file address of the page. 
We could speed up the access to an individual page by using @@ -1522,8 +1503,7 @@ bool buf_pool_t::create() n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit; const size_t chunk_size= srv_buf_pool_chunk_unit; - chunks= static_cast<buf_pool_t::chunk_t*>(ut_zalloc_nokey(n_chunks * - sizeof *chunks)); + chunks= static_cast<chunk_t*>(ut_zalloc_nokey(n_chunks * sizeof *chunks)); UT_LIST_INIT(free, &buf_page_t::list); curr_size= 0; auto chunk= chunks; @@ -1555,8 +1535,12 @@ bool buf_pool_t::create() while (++chunk < chunks + n_chunks); ut_ad(is_initialised()); - mutex_create(LATCH_ID_BUF_POOL, &mutex); + mutex_create(LATCH_ID_BUF_POOL_LRU_LIST, &mutex); + mutex_create(LATCH_ID_BUF_POOL_FREE_LIST, &free_list_mutex); + mutex_create(LATCH_ID_BUF_POOL_ZIP_FREE, &zip_free_mutex); + mutex_create(LATCH_ID_BUF_POOL_ZIP_HASH, &zip_hash_mutex); mutex_create(LATCH_ID_BUF_POOL_ZIP, &zip_mutex); + mutex_create(LATCH_ID_BUF_POOL_FLUSH_STATE, &flush_state_mutex); UT_LIST_INIT(LRU, &buf_page_t::LRU); UT_LIST_INIT(withdraw, &buf_page_t::list); @@ -1610,14 +1594,9 @@ bool buf_pool_t::create() io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) * OS_AIO_N_PENDING_IOS_PER_THREAD); - /* FIXME: remove some of these variables */ - srv_buf_pool_curr_size= curr_pool_size; - srv_buf_pool_old_size= srv_buf_pool_size; - srv_buf_pool_base_size= srv_buf_pool_size; - chunk_t::map_ref= chunk_t::map_reg; buf_LRU_old_ratio_update(100 * 3 / 8, false); - btr_search_sys_create(srv_buf_pool_curr_size / sizeof(void*) / 64); + btr_search_sys_create(curr_pool_size / sizeof(void*) / 64); ut_ad(is_initialised()); return false; } @@ -1630,6 +1609,10 @@ void buf_pool_t::close() return; mutex_free(&mutex); + mutex_free(&free_list_mutex); + mutex_free(&zip_free_mutex); + mutex_free(&zip_hash_mutex); + mutex_free(&flush_state_mutex); mutex_free(&zip_mutex); mutex_free(&flush_list_mutex); @@ -1807,21 +1790,18 @@ inline bool buf_pool_t::realloc(buf_block_t *block) new_block->page.id.page_no())); rw_lock_x_unlock(hash_lock); + mutex_exit(&block->mutex); mutex_exit(&new_block->mutex); /* free block */ buf_block_set_state(block, BUF_BLOCK_MEMORY); buf_LRU_block_free_non_file_page(block); - - mutex_exit(&block->mutex); } else { rw_lock_x_unlock(hash_lock); mutex_exit(&block->mutex); /* free new_block */ - mutex_enter(&new_block->mutex); buf_LRU_block_free_non_file_page(new_block); - mutex_exit(&new_block->mutex); } return(true); /* free_list was enough */ @@ -1858,21 +1838,24 @@ inline bool buf_pool_t::withdraw_blocks() { buf_block_t* block; ulint loop_count = 0; + ulint lru_len; ib::info() << "start to withdraw the last " << withdraw_target << " blocks"; /* Minimize zip_free[i] lists */ - mutex_enter(&mutex); buf_buddy_condense_free(); + + mutex_enter(&mutex); + lru_len = UT_LIST_GET_LEN(LRU); mutex_exit(&mutex); + mutex_enter(&free_list_mutex); while (UT_LIST_GET_LEN(withdraw) < withdraw_target) { /* try to withdraw from free_list */ ulint count1 = 0; - mutex_enter(&mutex); block = reinterpret_cast<buf_block_t*>( UT_LIST_GET_FIRST(free)); while (block != NULL @@ -1887,7 +1870,7 @@ inline bool buf_pool_t::withdraw_blocks() UT_LIST_GET_NEXT( list, &block->page)); - if (buf_pool.will_be_withdrawn(block->page)) { + if (will_be_withdrawn(block->page)) { /* This should be withdrawn */ UT_LIST_REMOVE(free, &block->page); UT_LIST_ADD_LAST(withdraw, &block->page); @@ -1897,7 +1880,6 @@ inline bool buf_pool_t::withdraw_blocks() block = next_block; } - mutex_exit(&mutex); /* reserve free_list length */ if (UT_LIST_GET_LEN(withdraw) < 
withdraw_target) { @@ -1905,15 +1887,12 @@ inline bool buf_pool_t::withdraw_blocks() flush_counters_t n; /* cap scan_depth with current LRU size. */ - mutex_enter(&mutex); - scan_depth = UT_LIST_GET_LEN(LRU); - mutex_exit(&mutex); - scan_depth = ut_min( ut_max(withdraw_target - UT_LIST_GET_LEN(withdraw), static_cast<ulint>(srv_LRU_scan_depth)), - scan_depth); + lru_len); + mutex_exit(&free_list_mutex); buf_flush_do_batch(BUF_FLUSH_LRU, scan_depth, 0, &n); buf_flush_wait_batch_end(BUF_FLUSH_LRU); @@ -1925,6 +1904,9 @@ inline bool buf_pool_t::withdraw_blocks() MONITOR_LRU_BATCH_FLUSH_PAGES, n.flushed); } + } else { + + mutex_exit(&free_list_mutex); } /* relocate blocks/buddies in withdrawn area */ @@ -1946,33 +1928,27 @@ inline bool buf_pool_t::withdraw_blocks() && will_be_withdrawn(bpage->zip.data) && buf_page_can_relocate(bpage)) { mutex_exit(block_mutex); - buf_pool_mutex_exit_forbid(); if (!buf_buddy_realloc( bpage->zip.data, page_zip_get_size(&bpage->zip))) { /* failed to allocate block */ - buf_pool_mutex_exit_allow(); break; } - buf_pool_mutex_exit_allow(); mutex_enter(block_mutex); count2++; } if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE - && buf_pool.will_be_withdrawn(*bpage)) { + && will_be_withdrawn(*bpage)) { if (buf_page_can_relocate(bpage)) { mutex_exit(block_mutex); - buf_pool_mutex_exit_forbid(); if (!realloc( reinterpret_cast<buf_block_t*>( bpage))) { /* failed to allocate block */ - buf_pool_mutex_exit_allow(); break; } - buf_pool_mutex_exit_allow(); count2++; } else { mutex_exit(block_mutex); @@ -1985,8 +1961,16 @@ inline bool buf_pool_t::withdraw_blocks() bpage = next_bpage; } + mutex_exit(&mutex); + if (++loop_count >= 10) { + ib::info() << "will retry to withdraw later"; + return true; + } + + mutex_enter(&free_list_mutex); + buf_resize_status( "withdrawing blocks. (" ULINTPF "/" ULINTPF ")", UT_LIST_GET_LEN(withdraw), @@ -1997,17 +1981,8 @@ inline bool buf_pool_t::withdraw_blocks() << " Tried to relocate " << count2 << " pages (" << UT_LIST_GET_LEN(withdraw) << "/" << withdraw_target << ")"; - - if (++loop_count >= 10) { - /* give up for now. - retried after user threads paused. 
*/ - - ib::info() << "will retry to withdraw later"; - - /* need retry later */ - return(true); - } } + mutex_exit(&free_list_mutex); /* confirm withdrawn enough */ for (const chunk_t* chunk = chunks + n_chunks_new, @@ -2019,9 +1994,13 @@ inline bool buf_pool_t::withdraw_blocks() } } + mutex_enter(&free_list_mutex); + ib::info() << "withdrawn target: " << UT_LIST_GET_LEN(withdraw) << " blocks"; + mutex_exit(&free_list_mutex); + /* retry is not needed */ ++withdraw_clock_; @@ -2033,6 +2012,7 @@ static void buf_pool_resize_hash() { hash_table_t* new_hash_table; + ut_ad(mutex_own(&buf_pool.zip_hash_mutex)); ut_ad(buf_pool.page_hash_old == NULL); /* recreate page_hash */ @@ -2118,21 +2098,27 @@ inline void buf_pool_t::resize() ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift; - buf_resize_status("Resizing buffer pool from " ULINTPF " to " + buf_resize_status("Resizing buffer pool to " ULINTPF " (unit=" ULINTPF ").", - srv_buf_pool_old_size, srv_buf_pool_size, + srv_buf_pool_size, srv_buf_pool_chunk_unit); - mutex_enter(&mutex); + // No locking needed to read, same thread updated those ut_ad(curr_size == old_size); ut_ad(n_chunks_new == n_chunks); +#ifdef UNIV_DEBUG + mutex_enter(&free_list_mutex); ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + mutex_exit(&free_list_mutex); + + mutex_enter(&flush_list_mutex); ut_ad(flush_rbt == NULL); + mutex_exit(&flush_list_mutex); +#endif n_chunks_new = (new_instance_size << srv_page_size_shift) / srv_buf_pool_chunk_unit; curr_size = n_chunks_new * chunks->size; - mutex_exit(&mutex); #ifdef BTR_CUR_HASH_ADAPT /* disable AHI if needed */ @@ -2267,8 +2253,18 @@ withdraw_retry: /* Indicate critical path */ resizing.store(true, std::memory_order_relaxed); + /* Acquire all buffer pool mutexes and hash table locks */ + /* TODO: while we certainly lock a lot here, it does not necessarily + buy us enough correctness. Exploits the fact that freed pages must + have no pointers to them from the buffer pool nor from any other thread + except for the freeing one to remove redundant locking. 
The same applies + to freshly allocated pages before any pointers to them are published.*/ mutex_enter(&mutex); hash_lock_x_all(page_hash); + mutex_enter(&zip_free_mutex); + mutex_enter(&free_list_mutex); + mutex_enter(&zip_hash_mutex); + mutex_enter(&flush_state_mutex); chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map()); /* add/delete chunks */ @@ -2399,14 +2395,14 @@ calc_buf_pool_size: read_ahead_area = ut_min( BUF_READ_AHEAD_PAGES, ut_2_power_up(curr_size / BUF_READ_AHEAD_PORTION)); + ulint old_pool_size = curr_pool_size; curr_pool_size = n_chunks * srv_buf_pool_chunk_unit; - srv_buf_pool_curr_size = curr_pool_size;/* FIXME: remove*/ old_size = curr_size; - innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size)); + innodb_set_buf_pool_size(buf_pool_size_align(curr_pool_size)); const bool new_size_too_diff - = srv_buf_pool_base_size > srv_buf_pool_size * 2 - || srv_buf_pool_base_size * 2 < srv_buf_pool_size; + = old_pool_size/2 > curr_pool_size + || old_pool_size < curr_pool_size/2; /* Normalize page_hash and zip_hash, if the new size is too different */ @@ -2416,8 +2412,12 @@ calc_buf_pool_size: ib::info() << "hash tables were resized"; } - hash_unlock_x_all(page_hash); mutex_exit(&mutex); + hash_unlock_x_all(page_hash); + mutex_exit(&zip_free_mutex); + mutex_exit(&free_list_mutex); + mutex_exit(&zip_hash_mutex); + mutex_exit(&flush_state_mutex); if (page_hash_old != NULL) { hash_table_free(page_hash_old); @@ -2430,8 +2430,6 @@ calc_buf_pool_size: /* Normalize other components, if the new size is too different */ if (!warning && new_size_too_diff) { - srv_buf_pool_base_size = srv_buf_pool_size; - buf_resize_status("Resizing also other hash tables."); /* normalize lock_sys */ @@ -2440,8 +2438,7 @@ calc_buf_pool_size: lock_sys.resize(srv_lock_table_size); /* normalize btr_search_sys */ - btr_search_sys_resize( - buf_pool_get_curr_size() / sizeof(void*) / 64); + btr_search_sys_resize(curr_pool_size / sizeof(void*) / 64); dict_sys.resize(); @@ -2455,13 +2452,8 @@ calc_buf_pool_size: /* normalize ibuf.max_size */ ibuf_max_size_update(srv_change_buffer_max_size); - if (srv_buf_pool_old_size != srv_buf_pool_size) { - - ib::info() << "Completed to resize buffer pool from " - << srv_buf_pool_old_size - << " to " << srv_buf_pool_size << "."; - srv_buf_pool_old_size = srv_buf_pool_size; - } + ib::info() << "Completed to resize buffer pool" + " to " << srv_buf_pool_size << "."; #ifdef BTR_CUR_HASH_ADAPT /* enable AHI if needed */ @@ -2494,19 +2486,9 @@ static void buf_resize_callback(void *) { DBUG_ENTER("buf_resize_callback"); ut_a(srv_shutdown_state == SRV_SHUTDOWN_NONE); - mutex_enter(&buf_pool.mutex); - const auto size= srv_buf_pool_size; - const bool work= srv_buf_pool_old_size != size; - mutex_exit(&buf_pool.mutex); - - if (work) - buf_pool.resize(); - else - { - std::ostringstream sout; - sout << "Size did not change: old size = new size = " << size; - buf_resize_status(sout.str().c_str()); - } + ut_a(srv_buf_pool_size_changing); + buf_pool.resize(); + srv_buf_pool_size_changing= false; DBUG_VOID_RETURN; } @@ -2526,18 +2508,17 @@ void buf_resize_shutdown() } -/********************************************************************//** -Relocate a buffer control block. Relocates the block on the LRU list +/** Relocate a buffer control block. Relocates the block on the LRU list and in buf_pool.page_hash. Does not relocate bpage->list. -The caller must take care of relocating bpage->list. */ +The caller must take care of relocating bpage->list. 
+@param[in,out] bpage control block being relocated, buf_page_get_state() + must be BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE +@param[in,out] dpage destination control block */ static void buf_relocate( -/*=========*/ - buf_page_t* bpage, /*!< in/out: control block being relocated; - buf_page_get_state(bpage) must be - BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */ - buf_page_t* dpage) /*!< in/out: destination control block */ + buf_page_t* bpage, + buf_page_t* dpage) { buf_page_t* b; @@ -2637,8 +2618,9 @@ bool buf_pool_watch_is_sentinel(const buf_page_t* bpage) } /** Add watch for the given page to be read in. Caller must have -appropriate hash_lock for the bpage. This function may release the -hash_lock and reacquire it. +appropriate hash_lock for the bpage and hold the LRU list mutex to avoid a race +condition with buf_LRU_free_page inserting the same page into the page hash. +This function may release the hash_lock and reacquire it. @param[in] page_id page id @param[in,out] hash_lock hash_lock currently latched @return NULL if watch set, block if the page is in the buffer pool */ @@ -2670,9 +2652,7 @@ page_found: } /* From this point this function becomes fairly heavy in terms - of latching. We acquire the buf_pool mutex as well as all the - hash_locks. buf_pool mutex is needed because any changes to - the page_hash must be covered by it and hash_locks are needed + of latching. We acquire all the hash_locks. They are needed because we don't want to read any stale information in buf_pool.watch[]. However, it is not in the critical code path as this function will be called only by the purge thread. */ @@ -2680,20 +2660,16 @@ page_found: /* To obey latching order first release the hash_lock. */ rw_lock_x_unlock(*hash_lock); - mutex_enter(&buf_pool.mutex); hash_lock_x_all(buf_pool.page_hash); /* We have to recheck that the page was not loaded or a watch set by some other purge thread. This is because of the small time window between when we release the - hash_lock to acquire buf_pool.mutex above. */ - + hash_lock to lock all the hash_locks. */ *hash_lock = buf_page_hash_lock_get(page_id); - bpage = buf_page_hash_get_low(page_id); - if (UNIV_LIKELY_NULL(bpage)) { - mutex_exit(&buf_pool.mutex); + if (bpage) { hash_unlock_x_all_but(buf_pool.page_hash, *hash_lock); goto page_found; } @@ -2714,11 +2690,6 @@ page_found: ut_ad(!bpage->in_page_hash); ut_ad(bpage->buf_fix_count == 0); - /* bpage is pointing to buf_pool.watch[], - which is protected by buf_pool.mutex. - Normally, buf_page_t objects are protected by - buf_block_t::mutex or buf_pool.zip_mutex or both. */ - bpage->state = BUF_BLOCK_ZIP_PAGE; bpage->id = page_id; bpage->buf_fix_count = 1; @@ -2727,7 +2698,6 @@ page_found: HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, page_id.fold(), bpage); - mutex_exit(&buf_pool.mutex); /* Once the sentinel is in the page_hash we can safely release all locks except just the relevant hash_lock */ @@ -2755,27 +2725,19 @@ page_found: } /** Remove the sentinel block for the watch before replacing it with a -real block. buf_page_watch_clear() or buf_page_watch_occurred() will notice +real block. buf_pool_watch_unset() or buf_pool_watch_occurred() will notice that the block has been replaced with the real block. @param[in,out] watch sentinel for watch @return reference count, to be added to the replacement block */ -static -void -buf_pool_watch_remove(buf_page_t* watch) +static void buf_pool_watch_remove(buf_page_t *watch) { -#ifdef UNIV_DEBUG - /* We must also own the appropriate hash_bucket mutex. 
*/ - rw_lock_t* hash_lock = buf_page_hash_lock_get(watch->id); - ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); -#endif /* UNIV_DEBUG */ - - ut_ad(mutex_own(&buf_pool.mutex)); - - HASH_DELETE(buf_page_t, hash, buf_pool.page_hash, watch->id.fold(), - watch); - ut_d(watch->in_page_hash = FALSE); - watch->buf_fix_count = 0; - watch->state = BUF_BLOCK_POOL_WATCH; + ut_ad(rw_lock_own(buf_page_hash_lock_get(watch->id), RW_LOCK_X)); + ut_ad(watch->state == BUF_BLOCK_ZIP_PAGE); + ut_ad(watch->in_page_hash); + HASH_DELETE(buf_page_t, hash, buf_pool.page_hash, watch->id.fold(), watch); + ut_d(watch->in_page_hash= FALSE); + watch->buf_fix_count= 0; + watch->state= BUF_BLOCK_POOL_WATCH; } /** Stop watching if the page has been read in. @@ -2783,27 +2745,17 @@ buf_pool_watch_set(same_page_id) must have returned NULL before. @param[in] page_id page id */ void buf_pool_watch_unset(const page_id_t page_id) { - buf_page_t* bpage; - /* We only need to have buf_pool.mutex in case where we end - up calling buf_pool_watch_remove but to obey latching order - we acquire it here before acquiring hash_lock. This should - not cause too much grief as this function is only ever - called from the purge thread. */ - mutex_enter(&buf_pool.mutex); - - rw_lock_t* hash_lock = buf_page_hash_lock_get(page_id); - rw_lock_x_lock(hash_lock); + rw_lock_t *hash_lock= buf_page_hash_lock_get(page_id); + rw_lock_x_lock(hash_lock); - /* The page must exist because buf_pool_watch_set() - increments buf_fix_count. */ - bpage = buf_page_hash_get_low(page_id); + /* The page must exist because buf_pool_watch_set() + increments buf_fix_count. */ + buf_page_t *bpage= buf_page_hash_get_low(page_id); - if (bpage->unfix() == 0 && buf_pool_watch_is_sentinel(bpage)) { - buf_pool_watch_remove(bpage); - } + if (bpage->unfix() == 0 && buf_pool_watch_is_sentinel(bpage)) + buf_pool_watch_remove(bpage); - mutex_exit(&buf_pool.mutex); - rw_lock_x_unlock(hash_lock); + rw_lock_x_unlock(hash_lock); } /** Check if the page has been read in. @@ -2832,8 +2784,7 @@ bool buf_pool_watch_occurred(const page_id_t page_id) return(ret); } -/********************************************************************//** -Moves a page to the start of the buffer pool LRU list. This high-level +/** Moves a page to the start of the buffer pool LRU list. This high-level function can be used to prevent an important page from slipping out of the buffer pool. @param[in,out] bpage buffer block of a file page */ @@ -2914,18 +2865,27 @@ static void buf_block_try_discard_uncompressed(const page_id_t page_id) { buf_page_t* bpage; - /* Since we need to acquire buf_pool mutex to discard - the uncompressed frame and because page_hash mutex resides - below buf_pool mutex in sync ordering therefore we must - first release the page_hash mutex. This means that the - block in question can move out of page_hash. Therefore - we need to check again if the block is still in page_hash. */ + /* Since we need to acquire buf_pool.mutex to discard + the uncompressed frame and because page_hash mutex resides below + buf_pool.mutex in sync ordering therefore we must first + release the page_hash mutex. This means that the block in question + can move out of page_hash. Therefore we need to check again if the + block is still in page_hash. 
*/ mutex_enter(&buf_pool.mutex); bpage = buf_page_hash_get(page_id); if (bpage) { - buf_LRU_free_page(bpage, false); + + BPageMutex* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + if (buf_LRU_free_page(bpage, false)) { + + return; + } + mutex_exit(block_mutex); } mutex_exit(&buf_pool.mutex); @@ -3212,22 +3172,12 @@ buf_wait_for_read( access the block (and check for IO state) after the block has been added to the page hashtable. */ - if (buf_block_get_io_fix(block) == BUF_IO_READ) { + if (buf_block_get_io_fix_unlocked(block) == BUF_IO_READ) { /* Wait until the read operation completes */ - - BPageMutex* mutex = buf_page_get_mutex(&block->page); - for (;;) { - buf_io_fix io_fix; - - mutex_enter(mutex); - - io_fix = buf_block_get_io_fix(block); - - mutex_exit(mutex); - - if (io_fix == BUF_IO_READ) { + if (buf_block_get_io_fix_unlocked(block) + == BUF_IO_READ) { /* Wait by temporaly s-latch */ rw_lock_s_lock(&block->lock); rw_lock_s_unlock(&block->lock); @@ -3271,6 +3221,7 @@ buf_page_get_gen( unsigned access_time; rw_lock_t* hash_lock; buf_block_t* fix_block; + BPageMutex* fix_mutex = NULL; ulint retries = 0; ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL)); @@ -3362,8 +3313,7 @@ loop: if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { rw_lock_x_lock(hash_lock); - /* If not own buf_pool_mutex, - page_hash can be changed. */ + /* page_hash can be changed. */ hash_lock = buf_page_hash_lock_x_confirm( hash_lock, page_id); @@ -3385,7 +3335,7 @@ loop: buf_flush_page() for the flush thread counterpart. */ - BPageMutex* fix_mutex + fix_mutex = buf_page_get_mutex( &fix_block->page); mutex_enter(fix_mutex); @@ -3501,7 +3451,7 @@ loop: for synchorization between user thread and flush thread, instead of block->lock. See buf_flush_page() for the flush thread counterpart. */ - BPageMutex* fix_mutex = buf_page_get_mutex( + fix_mutex = buf_page_get_mutex( &fix_block->page); mutex_enter(fix_mutex); fix_block->fix(); @@ -3522,11 +3472,8 @@ got_block: case BUF_PEEK_IF_IN_POOL: case BUF_EVICT_IF_IN_POOL: buf_page_t* fix_page = &fix_block->page; - BPageMutex* fix_mutex = buf_page_get_mutex(fix_page); - mutex_enter(fix_mutex); const bool must_read - = (buf_page_get_io_fix(fix_page) == BUF_IO_READ); - mutex_exit(fix_mutex); + = (buf_page_get_io_fix_unlocked(fix_page) == BUF_IO_READ); if (must_read) { /* The page is being read to buffer pool, @@ -3541,8 +3488,9 @@ got_block: switch (UNIV_EXPECT(buf_block_get_state(fix_block), BUF_BLOCK_FILE_PAGE)) { case BUF_BLOCK_FILE_PAGE: + ut_ad(fix_mutex != &buf_pool.zip_mutex); if (fsp_is_system_temporary(page_id.space()) - && buf_block_get_io_fix(block) != BUF_IO_NONE) { + && buf_block_get_io_fix_unlocked(block) != BUF_IO_NONE) { /* This suggests that the page is being flushed. Avoid returning reference to this page. Instead wait for the flush action to complete. */ @@ -3555,13 +3503,19 @@ got_block: evict_from_pool: ut_ad(!fix_block->page.oldest_modification); mutex_enter(&buf_pool.mutex); + fix_mutex + = buf_page_get_mutex( + &fix_block->page); + mutex_enter(fix_mutex); fix_block->unfix(); if (!buf_LRU_free_page(&fix_block->page, true)) { ut_ad(0); } + // buf_LRU_free_page frees the mutexes we locked. 
+ ut_ad(!mutex_own(fix_mutex)); + ut_ad(!mutex_own(&buf_pool.mutex)); - mutex_exit(&buf_pool.mutex); return(NULL); } break; @@ -3586,10 +3540,13 @@ evict_from_pool: } buf_page_t* bpage = &block->page; + /* MDEV-15053-TODO innodb.table_flags-16k fails on it + ut_ad(fix_mutex == &buf_pool.zip_mutex); */ + ut_ad(fix_mutex == &buf_pool.zip_mutex || !fix_mutex); /* Note: We have already buffer fixed this block. */ if (bpage->buf_fix_count > 1 - || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) { /* This condition often occurs when the buffer is not buffer-fixed, but I/O-fixed by @@ -3611,8 +3568,6 @@ evict_from_pool: mutex_enter(&buf_pool.mutex); - hash_lock = buf_page_hash_lock_get(page_id); - rw_lock_x_lock(hash_lock); /* Buffer-fixing prevents the page_hash from changing. */ @@ -3635,10 +3590,10 @@ evict_from_pool: This should be extremely unlikely, for example, if buf_page_get_zip() was invoked. */ - buf_LRU_block_free_non_file_page(block); mutex_exit(&buf_pool.mutex); rw_lock_x_unlock(hash_lock); buf_page_mutex_exit(block); + buf_LRU_block_free_non_file_page(block); /* Try again */ goto loop; @@ -3681,15 +3636,15 @@ evict_from_pool: /* Insert at the front of unzip_LRU list */ buf_unzip_LRU_add_block(block, FALSE); + mutex_exit(&buf_pool.mutex); + buf_block_set_io_fix(block, BUF_IO_READ); rw_lock_x_lock_inline(&block->lock, 0, file, line); UNIV_MEM_INVALID(bpage, sizeof *bpage); rw_lock_x_unlock(hash_lock); - buf_pool.n_pend_unzip++; mutex_exit(&buf_pool.zip_mutex); - mutex_exit(&buf_pool.mutex); access_time = buf_page_is_accessed(&block->page); @@ -3703,16 +3658,14 @@ evict_from_pool: buf_page_free_descriptor(bpage); /* Decompress the page while not holding - buf_pool.mutex or block->mutex. */ + any buf_pool or block->mutex. */ if (!buf_zip_decompress(block, TRUE)) { - mutex_enter(&buf_pool.mutex); buf_page_mutex_enter(fix_block); buf_block_set_io_fix(fix_block, BUF_IO_NONE); buf_page_mutex_exit(fix_block); --buf_pool.n_pend_unzip; - mutex_exit(&buf_pool.mutex); fix_block->unfix(); rw_lock_x_unlock(&fix_block->lock); @@ -3722,17 +3675,13 @@ evict_from_pool: return NULL; } - mutex_enter(&buf_pool.mutex); - buf_page_mutex_enter(fix_block); buf_block_set_io_fix(fix_block, BUF_IO_NONE); buf_page_mutex_exit(fix_block); - --buf_pool.n_pend_unzip; - - mutex_exit(&buf_pool.mutex); + buf_pool.n_pend_unzip++; rw_lock_x_unlock(&block->lock); @@ -3764,16 +3713,20 @@ evict_from_pool: relocated or enter or exit the buf_pool while we are holding the buf_pool.mutex. */ + fix_mutex = buf_page_get_mutex(&fix_block->page); + mutex_enter(fix_mutex); + if (buf_LRU_free_page(&fix_block->page, true)) { - mutex_exit(&buf_pool.mutex); + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + /* Hold LRU list mutex, see comment + in buf_pool_watch_set(). */ + mutex_enter(&buf_pool.mutex); + } /* page_hash can be changed. */ hash_lock = buf_page_hash_lock_get(page_id); rw_lock_x_lock(hash_lock); - - /* If not own buf_pool_mutex, - page_hash can be changed. */ hash_lock = buf_page_hash_lock_x_confirm( hash_lock, page_id); @@ -3783,6 +3736,7 @@ evict_from_pool: buffer pool in the first place. 
*/ block = (buf_block_t*) buf_pool_watch_set( page_id, &hash_lock); + mutex_exit(&buf_pool.mutex); } else { block = (buf_block_t*) buf_page_hash_get_low( page_id); @@ -3793,7 +3747,7 @@ evict_from_pool: if (block != NULL) { /* Either the page has been read in or a watch was set on that in the window - where we released the buf_pool::mutex + where we released the buf_pool.mutex and before we acquire the hash_lock above. Try again. */ guess = block; @@ -3804,21 +3758,19 @@ evict_from_pool: return(NULL); } - buf_page_mutex_enter(fix_block); - if (buf_flush_page_try(fix_block)) { guess = fix_block; goto loop; } + mutex_exit(&buf_pool.mutex); + buf_page_mutex_exit(fix_block); fix_block->fix(); /* Failed to evict the page; change it directly */ - - mutex_exit(&buf_pool.mutex); } #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ @@ -4081,16 +4033,16 @@ buf_page_try_get_func( ut_ad(!buf_pool_watch_is_sentinel(&block->page)); - buf_page_mutex_enter(block); + buf_block_buf_fix_inc(block, file, line); + rw_lock_s_unlock(hash_lock); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + buf_page_mutex_enter(block); ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_a(page_id == block->page.id); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - - buf_block_buf_fix_inc(block, file, line); buf_page_mutex_exit(block); +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ mtr_memo_type_t fix_type = MTR_MEMO_PAGE_S_FIX; success = rw_lock_s_lock_nowait(&block->lock, file, line); @@ -4148,7 +4100,8 @@ buf_page_init_low( HASH_INVALIDATE(bpage, hash); } -/** Inits a page to the buffer buf_pool. +/** Inits a page to the buffer buf_pool. The block pointer must be private to +the calling thread at the start of this function. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in,out] block block to init */ @@ -4157,8 +4110,7 @@ static void buf_page_init(const page_id_t page_id, ulint zip_size, { buf_page_t* hash_page; - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_mutex_own(block)); + ut_ad(!mutex_own(buf_page_get_mutex(&block->page))); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); ut_ad(rw_lock_own(buf_page_hash_lock_get(page_id), RW_LOCK_X)); @@ -4202,8 +4154,6 @@ static void buf_page_init(const page_id_t page_id, ulint zip_size, << " already found in the hash table: " << hash_page << ", " << block; - ut_d(buf_page_mutex_exit(block)); - ut_d(mutex_exit(&buf_pool.mutex)); ut_d(buf_pool.print()); ut_d(buf_LRU_print()); ut_d(buf_LRU_validate()); @@ -4247,12 +4197,9 @@ buf_page_init_for_read( bool unzip) { buf_block_t* block; - buf_page_t* bpage = NULL; - buf_page_t* watch_page; rw_lock_t* hash_lock; mtr_t mtr; - bool lru = false; - void* data; + void* data = NULL; *err = DB_SUCCESS; @@ -4281,20 +4228,41 @@ buf_page_init_for_read( ut_ad(block); } + buf_page_t* bpage = NULL; + if (block == NULL) { + bpage = buf_page_alloc_descriptor(); + } + + if (!block || zip_size) { + data = buf_buddy_alloc(zip_size); + } + mutex_enter(&buf_pool.mutex); hash_lock = buf_page_hash_lock_get(page_id); rw_lock_x_lock(hash_lock); + buf_page_t* watch_page; + watch_page = buf_page_hash_get_low(page_id); if (watch_page && !buf_pool_watch_is_sentinel(watch_page)) { /* The page is already in the buffer pool. 
*/ watch_page = NULL; + + mutex_exit(&buf_pool.mutex); + rw_lock_x_unlock(hash_lock); - if (block) { - buf_page_mutex_enter(block); + + if (bpage != NULL) { + buf_page_free_descriptor(bpage); + } + + if (data != NULL) { + buf_buddy_free(data, zip_size); + } + + if (block != NULL) { buf_LRU_block_free_non_file_page(block); - buf_page_mutex_exit(block); } bpage = NULL; @@ -4302,46 +4270,23 @@ buf_page_init_for_read( } if (block) { + ut_ad(!bpage); bpage = &block->page; - buf_page_mutex_enter(block); - buf_page_init(page_id, zip_size, block); + buf_page_mutex_enter(block); + /* Note: We are using the hash_lock for protection. This is safe because no other thread can lookup the block from the page hashtable yet. */ buf_page_set_io_fix(bpage, BUF_IO_READ); - rw_lock_x_unlock(hash_lock); - /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); - /* We set a pass-type x-lock on the frame because then - the same thread which called for the read operation - (and is running now at this point of code) can wait - for the read to complete by waiting for the x-lock on - the frame; if the x-lock were recursive, the same - thread would illegally get the x-lock before the page - read is completed. The x-lock is cleared by the - io-handler thread. */ - - rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); - if (zip_size) { - /* buf_pool.mutex may be released and - reacquired by buf_buddy_alloc(). Thus, we - must release block->mutex in order not to - break the latching order in the reacquisition - of buf_pool.mutex. We also must defer this - operation until after the block descriptor has - been added to buf_pool.LRU and - buf_pool.page_hash. */ - buf_page_mutex_exit(block); - data = buf_buddy_alloc(zip_size, &lru); - buf_page_mutex_enter(block); block->page.zip.data = (page_zip_t*) data; /* To maintain the invariant @@ -4353,41 +4298,27 @@ buf_page_init_for_read( buf_unzip_LRU_add_block(block, TRUE); } - buf_page_mutex_exit(block); - } else { - rw_lock_x_unlock(hash_lock); - - /* The compressed page must be allocated before the - control block (bpage), in order to avoid the - invocation of buf_buddy_relocate_block() on - uninitialized data. */ - data = buf_buddy_alloc(zip_size, &lru); - - rw_lock_x_lock(hash_lock); - - /* If buf_buddy_alloc() allocated storage from the LRU list, - it released and reacquired buf_pool.mutex. Thus, we must - check the page_hash again, as it may have been modified. */ - if (UNIV_UNLIKELY(lru)) { + mutex_exit(&buf_pool.mutex); watch_page = buf_page_hash_get_low(page_id); + /* We set a pass-type x-lock on the frame because then + the same thread which called for the read operation + (and is running now at this point of code) can wait + for the read to complete by waiting for the x-lock on + the frame; if the x-lock were recursive, the same + thread would illegally get the x-lock before the page + read is completed. The x-lock is cleared by the + io-handler thread. */ - if (UNIV_UNLIKELY(watch_page - && !buf_pool_watch_is_sentinel(watch_page))) { - - /* The block was added by some other thread. 
*/ - rw_lock_x_unlock(hash_lock); - watch_page = NULL; - buf_buddy_free(data, zip_size); + rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); - bpage = NULL; - goto func_exit; - } - } + rw_lock_x_unlock(hash_lock); - bpage = buf_page_alloc_descriptor(); + buf_page_mutex_exit(block); + } else { page_zip_des_init(&bpage->zip); page_zip_set_size(&bpage->zip, zip_size); + ut_ad(data); bpage->zip.data = (page_zip_t*) data; mutex_enter(&buf_pool.zip_mutex); @@ -4441,7 +4372,6 @@ buf_page_init_for_read( buf_pool.n_pend_reads++; func_exit: - mutex_exit(&buf_pool.mutex); if (mode == BUF_READ_IBUF_PAGES_ONLY) { @@ -4518,10 +4448,10 @@ buf_page_create( block = free_block; - buf_page_mutex_enter(block); - buf_page_init(page_id, zip_size, block); + buf_page_mutex_enter(block); + rw_lock_x_unlock(hash_lock); /* The block must be put to the LRU list */ @@ -4538,14 +4468,10 @@ buf_page_create( buf_page_set_io_fix(&block->page, BUF_IO_READ); rw_lock_x_lock(&block->lock); + mutex_exit(&buf_pool.mutex); buf_page_mutex_exit(block); - /* buf_pool.mutex may be released and reacquired by - buf_buddy_alloc(). Thus, we must release block->mutex - in order not to break the latching order in - the reacquisition of buf_pool.mutex. We also must - defer this operation until after the block descriptor - has been added to buf_pool.LRU and buf_pool.page_hash. */ block->page.zip.data = buf_buddy_alloc(zip_size); + mutex_enter(&buf_pool.mutex); buf_page_mutex_enter(block); /* To maintain the invariant @@ -4733,9 +4659,13 @@ buf_corrupt_page_release(buf_page_t* bpage, const fil_space_t* space) const ibool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); page_id_t old_page_id = bpage->id; + rw_lock_t* hash_lock = buf_page_hash_lock_get(bpage->id); /* First unfix and release lock on the bpage */ mutex_enter(&buf_pool.mutex); + + rw_lock_x_lock(hash_lock); + mutex_enter(buf_page_get_mutex(bpage)); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ); ut_ad(bpage->id.space() == space->id); @@ -4753,19 +4683,20 @@ buf_corrupt_page_release(buf_page_t* bpage, const fil_space_t* space) BUF_IO_READ); } - mutex_exit(buf_page_get_mutex(bpage)); - if (!srv_force_recovery) { buf_mark_space_corrupt(bpage, *space); } - /* After this point bpage can't be referenced. */ + /* The hash lock and block mutex will be released during the "free" */ buf_LRU_free_one_page(bpage, old_page_id); - ut_ad(buf_pool.n_pend_reads > 0); - buf_pool.n_pend_reads--; + ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X) + && !rw_lock_own(hash_lock, RW_LOCK_S)); mutex_exit(&buf_pool.mutex); + + ut_ad(buf_pool.n_pend_reads > 0); + buf_pool.n_pend_reads--; } /** Check if the encrypted page is corrupted for the full crc32 format. @@ -4877,6 +4808,8 @@ buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict) enum buf_io_fix io_type; const bool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + bool have_LRU_mutex = false; + ut_a(buf_page_in_file(bpage)); /* We do not need protect io_fix here by mutex to read @@ -4885,7 +4818,7 @@ buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict) ensures that this is the only thread that handles the i/o for this block. 
*/ - io_type = buf_page_get_io_fix(bpage); + io_type = buf_page_get_io_fix_unlocked(bpage); ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); ut_ad(!!bpage->zip.ssize == (bpage->zip.data != NULL)); ut_ad(uncompressed || bpage->zip.data); @@ -5071,19 +5004,40 @@ release_page: } } - BPageMutex* block_mutex = buf_page_get_mutex(bpage); + mutex_enter(&buf_pool.mutex); - mutex_enter(block_mutex); + + BPageMutex* page_mutex = buf_page_get_mutex(bpage); + mutex_enter(page_mutex); + + if (io_type == BUF_IO_WRITE + && ( +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + /* to keep consistency at buf_LRU_insert_zip_clean() */ + buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY || +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU || + buf_page_get_flush_type(bpage) == BUF_FLUSH_SINGLE_PAGE)) { + + have_LRU_mutex = true; /* optimistic */ + } else { + mutex_exit(&buf_pool.mutex); + } + /* Because this thread which does the unlocking is not the same that did the locking, we use a pass value != 0 in unlock, which simply removes the newest lock debug record, without checking the thread id. */ - buf_page_set_io_fix(bpage, BUF_IO_NONE); buf_page_monitor(bpage, io_type); if (io_type == BUF_IO_READ) { + + ut_ad(!have_LRU_mutex); + + buf_page_set_io_fix(bpage, BUF_IO_NONE); + /* NOTE that the call to ibuf may have moved the ownership of the x-latch to this OS thread: do not let this confuse you in debugging! */ @@ -5097,7 +5051,7 @@ release_page: BUF_IO_READ); } - mutex_exit(block_mutex); + mutex_exit(page_mutex); } else { /* Write means a flush operation: call the completion routine in the flush system */ @@ -5119,19 +5073,22 @@ release_page: by the caller explicitly. */ if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) { evict = true; + ut_ad(have_LRU_mutex); } - mutex_exit(block_mutex); - - if (evict) { - buf_LRU_free_page(bpage, true); + if (evict && buf_LRU_free_page(bpage, true)) { + have_LRU_mutex = false; + } else { + mutex_exit(buf_page_get_mutex(bpage)); + } + if (have_LRU_mutex) { + mutex_exit(&buf_pool.mutex); } } DBUG_PRINT("ib_buf", ("%s page %u:%u", io_type == BUF_IO_READ ? "read" : "wrote", bpage->id.space(), bpage->id.page_no())); - mutex_exit(&buf_pool.mutex); return DB_SUCCESS; } @@ -5161,7 +5118,9 @@ void buf_refresh_io_stats() All pages must be in a replaceable state (not modified or latched). 
*/ void buf_pool_invalidate() { - mutex_enter(&buf_pool.mutex); + ut_ad(!mutex_own(&buf_pool.mutex)); + + mutex_enter(&buf_pool.flush_state_mutex); for (unsigned i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) { @@ -5179,18 +5138,19 @@ void buf_pool_invalidate() if (buf_pool.n_flush[i] > 0) { buf_flush_t type = buf_flush_t(i); - mutex_exit(&buf_pool.mutex); + mutex_exit(&buf_pool.flush_state_mutex); buf_flush_wait_batch_end(type); - mutex_enter(&buf_pool.mutex); + mutex_enter(&buf_pool.flush_state_mutex); } } - ut_d(mutex_exit(&buf_pool.mutex)); + mutex_exit(&buf_pool.flush_state_mutex); ut_d(buf_pool.assert_all_freed()); - ut_d(mutex_enter(&buf_pool.mutex)); while (buf_LRU_scan_and_free_block(true)); + mutex_enter(&buf_pool.mutex); + ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0); ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); @@ -5198,9 +5158,10 @@ void buf_pool_invalidate() buf_pool.LRU_old = NULL; buf_pool.LRU_old_len = 0; + mutex_exit(&buf_pool.mutex); + memset(&buf_pool.stat, 0x00, sizeof(buf_pool.stat)); buf_refresh_io_stats(); - mutex_exit(&buf_pool.mutex); } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG @@ -5208,8 +5169,6 @@ void buf_pool_invalidate() void buf_pool_t::validate() { buf_page_t* b; - buf_pool_t::chunk_t* chunk; - ulint i; ulint n_lru_flush = 0; ulint n_page_flush = 0; ulint n_list_flush = 0; @@ -5218,22 +5177,23 @@ void buf_pool_t::validate() ulint n_free = 0; ulint n_zip = 0; - mutex_enter(&buf_pool.mutex); - hash_lock_x_all(buf_pool.page_hash); + mutex_enter(&mutex); + hash_lock_x_all(page_hash); + mutex_enter(&zip_mutex); + mutex_enter(&free_list_mutex); + mutex_enter(&flush_state_mutex); - chunk = buf_pool.chunks; + chunk_t* chunk = chunks; /* Check the uncompressed blocks. */ - for (i = buf_pool.n_chunks; i--; chunk++) { + for (ulint i = n_chunks; i--; chunk++) { ulint j; buf_block_t* block = chunk->blocks; for (j = chunk->size; j--; block++) { - buf_page_mutex_enter(block); - switch (buf_block_get_state(block)) { case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: @@ -5247,7 +5207,7 @@ void buf_pool_t::validate() ut_ad(buf_page_hash_get_low(block->page.id) == &block->page); - switch (buf_page_get_io_fix(&block->page)) { + switch (buf_page_get_io_fix_unlocked(&block->page)) { case BUF_IO_NONE: break; @@ -5255,20 +5215,8 @@ void buf_pool_t::validate() switch (buf_page_get_flush_type( &block->page)) { case BUF_FLUSH_LRU: - n_lru_flush++; - goto assert_s_latched; case BUF_FLUSH_SINGLE_PAGE: - n_page_flush++; -assert_s_latched: - ut_a(rw_lock_is_locked( - &block->lock, - RW_LOCK_S) - || rw_lock_is_locked( - &block->lock, - RW_LOCK_SX)); - break; case BUF_FLUSH_LIST: - n_list_flush++; break; default: ut_error; @@ -5295,16 +5243,12 @@ assert_s_latched: /* do nothing */ break; } - - buf_page_mutex_exit(block); } } - mutex_enter(&buf_pool.zip_mutex); - /* Check clean compressed-only blocks. */ - for (b = UT_LIST_GET_FIRST(buf_pool.zip_clean); b; + for (b = UT_LIST_GET_FIRST(zip_clean); b; b = UT_LIST_GET_NEXT(list, b)) { ut_ad(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); switch (buf_page_get_io_fix(b)) { @@ -5324,7 +5268,7 @@ assert_s_latched: } /* It is OK to read oldest_modification here because - we have acquired buf_pool.zip_mutex above which acts + we have acquired zip_mutex above which acts as the 'block->mutex' for these bpages. */ ut_ad(!b->oldest_modification); ut_ad(buf_page_hash_get_low(b->id) == b); @@ -5334,8 +5278,8 @@ assert_s_latched: /* Check dirty blocks. 
*/ - mutex_enter(&buf_pool.flush_list_mutex); - for (b = UT_LIST_GET_FIRST(buf_pool.flush_list); b; + mutex_enter(&flush_list_mutex); + for (b = UT_LIST_GET_FIRST(flush_list); b; b = UT_LIST_GET_NEXT(list, b)) { ut_ad(b->in_flush_list); ut_ad(b->oldest_modification); @@ -5345,7 +5289,9 @@ assert_s_latched: case BUF_BLOCK_ZIP_DIRTY: n_lru++; n_zip++; - switch (buf_page_get_io_fix(b)) { + /* fall through */ + case BUF_BLOCK_FILE_PAGE: + switch (buf_page_get_io_fix_unlocked(b)) { case BUF_IO_NONE: case BUF_IO_READ: case BUF_IO_PIN: @@ -5367,51 +5313,50 @@ assert_s_latched: break; } break; - case BUF_BLOCK_FILE_PAGE: - /* uncompressed page */ + case BUF_BLOCK_REMOVE_HASH: + /* We do not hold buf_pool.mutex here. */ break; case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_NOT_USED: case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: ut_error; break; } ut_ad(buf_page_hash_get_low(b->id) == b); } - ut_ad(UT_LIST_GET_LEN(buf_pool.flush_list) == n_flush); - - hash_unlock_x_all(buf_pool.page_hash); - mutex_exit(&buf_pool.flush_list_mutex); + ut_ad(UT_LIST_GET_LEN(flush_list) == n_flush); - mutex_exit(&buf_pool.zip_mutex); + hash_unlock_x_all(page_hash); + mutex_exit(&flush_list_mutex); + mutex_exit(&zip_mutex); - if (buf_pool.curr_size == buf_pool.old_size - && n_lru + n_free > buf_pool.curr_size + n_zip) { + if (curr_size == old_size + && n_lru + n_free > curr_size + n_zip) { ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free - << ", pool " << buf_pool.curr_size + << ", pool " << curr_size << " zip " << n_zip << ". Aborting..."; } - ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == n_lru); + ut_ad(UT_LIST_GET_LEN(LRU) == n_lru); - if (buf_pool.curr_size == buf_pool.old_size - && UT_LIST_GET_LEN(buf_pool.free) != n_free) { + mutex_exit(&mutex); + + if (curr_size == old_size + && UT_LIST_GET_LEN(free) > n_free) { ib::fatal() << "Free list len " - << UT_LIST_GET_LEN(buf_pool.free) + << UT_LIST_GET_LEN(free) << ", free blocks " << n_free << ". Aborting..."; } - ut_ad(buf_pool.n_flush[BUF_FLUSH_LIST] == n_list_flush); - ut_ad(buf_pool.n_flush[BUF_FLUSH_LRU] == n_lru_flush); - ut_ad(buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush); + mutex_exit(&free_list_mutex); - mutex_exit(&buf_pool.mutex); + ut_ad(this->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush); + mutex_exit(&flush_state_mutex); ut_d(buf_LRU_validate()); ut_d(buf_flush_validate()); @@ -5429,7 +5374,7 @@ void buf_pool_t::print() ulint j; index_id_t id; ulint n_found; - buf_pool_t::chunk_t* chunk; + chunk_t* chunk; dict_index_t* index; size = curr_size; @@ -5549,18 +5494,15 @@ ulint buf_get_latched_pages_number() continue; } - buf_page_mutex_enter(block); - if (block->page.buf_fix_count != 0 - || buf_page_get_io_fix(&block->page) + || buf_page_get_io_fix_unlocked(&block->page) != BUF_IO_NONE) { fixed_pages_number++; } - - buf_page_mutex_exit(block); } } + mutex_exit(&buf_pool.mutex); mutex_enter(&buf_pool.zip_mutex); /* Traverse the lists of clean and dirty compressed-only blocks. 
*/ @@ -5604,7 +5546,6 @@ ulint buf_get_latched_pages_number() mutex_exit(&buf_pool.flush_list_mutex); mutex_exit(&buf_pool.zip_mutex); - mutex_exit(&buf_pool.mutex); return(fixed_pages_number); } @@ -5618,6 +5559,8 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info) double time_elapsed; mutex_enter(&buf_pool.mutex); + mutex_enter(&buf_pool.free_list_mutex); + mutex_enter(&buf_pool.flush_state_mutex); mutex_enter(&buf_pool.flush_list_mutex); pool_info->pool_size = buf_pool.curr_size; @@ -5647,6 +5590,9 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info) + buf_pool.init_flush[BUF_FLUSH_SINGLE_PAGE]); mutex_exit(&buf_pool.flush_list_mutex); + mutex_exit(&buf_pool.flush_state_mutex); + mutex_exit(&buf_pool.free_list_mutex); + mutex_exit(&buf_pool.mutex); current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, @@ -5737,7 +5683,6 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info) pool_info->unzip_cur = buf_LRU_stat_cur.unzip; buf_refresh_io_stats(); - mutex_exit(&buf_pool.mutex); } /*********************************************************************//** @@ -5872,12 +5817,12 @@ ulint buf_pool_check_no_pending_io() { /* FIXME: use atomics, no mutex */ ulint pending_io = buf_pool.n_pend_reads; - mutex_enter(&buf_pool.mutex); + mutex_enter(&buf_pool.flush_state_mutex); pending_io += + buf_pool.n_flush[BUF_FLUSH_LRU] + buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE] + buf_pool.n_flush[BUF_FLUSH_LIST]; - mutex_exit(&buf_pool.mutex); + mutex_exit(&buf_pool.flush_state_mutex); return(pending_io); } @@ -5915,5 +5860,9 @@ buf_page_get_trim_length( ulint write_length) { return bpage->physical_size() - write_length; + ut_ad(mutex_own(&buf_pool.mutex)); + ut_ad(mutex_own(&buf_pool.free_list_mutex)); + ut_ad(mutex_own(&buf_pool.flush_state_mutex)); + ut_ad(mutex_own(&buf_pool.flush_list_mutex)); } #endif /* !UNIV_INNOCHECKSUM */ |