Diffstat (limited to 'storage/innobase/buf/buf0rea.cc')
-rw-r--r-- | storage/innobase/buf/buf0rea.cc | 81
1 file changed, 46 insertions, 35 deletions
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 4dceaf8f524..83956fb6c8a 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -58,9 +58,11 @@ buf_read_page_handle_error(
 	const bool	uncompressed = (buf_page_get_state(bpage)
 					== BUF_BLOCK_FILE_PAGE);
 	const page_id_t	old_page_id = bpage->id;
+	rw_lock_t *	hash_lock = buf_page_hash_lock_get(bpage->id);
 
 	/* First unfix and release lock on the bpage */
 	mutex_enter(&buf_pool.mutex);
+	rw_lock_x_lock(hash_lock);
 	mutex_enter(buf_page_get_mutex(bpage));
 
 	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
@@ -74,15 +76,16 @@ buf_read_page_handle_error(
 				BUF_IO_READ);
 	}
 
-	mutex_exit(buf_page_get_mutex(bpage));
-
-	/* remove the block from LRU list */
+	/* The hash lock and block mutex will be released during the "free" */
 	buf_LRU_free_one_page(bpage, old_page_id);
 
+	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X)
+	      && !rw_lock_own(hash_lock, RW_LOCK_S));
+
 	ut_ad(buf_pool.n_pend_reads > 0);
 	buf_pool.n_pend_reads--;
 
-	mutex_exit(&buf_pool.mutex);
+
 }
 
 /** Low-level function which reads a page asynchronously from a file to the
@@ -153,6 +156,7 @@ buf_read_page_low(
 		<< " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
 
 	ut_ad(buf_page_in_file(bpage));
+	ut_ad(!mutex_own(&buf_pool.mutex));
 
 	if (sync) {
 		thd_wait_begin(NULL, THD_WAIT_DISKIO);
@@ -282,11 +286,8 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
 		return(0);
 	}
 
-	mutex_enter(&buf_pool.mutex);
-
 	if (buf_pool.n_pend_reads
 	    > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
-		mutex_exit(&buf_pool.mutex);
 		return(0);
 	}
 
@@ -295,19 +296,23 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
 	that is, reside near the start of the LRU list. */
 
 	for (i = low; i < high; i++) {
-		if (const buf_page_t* bpage = buf_page_hash_get(
-			    page_id_t(page_id.space(), i))) {
-			if (buf_page_is_accessed(bpage)
-			    && buf_page_peek_if_young(bpage)
-			    && ++recent_blocks
-			    >= 5 + buf_pool.read_ahead_area / 8) {
-				mutex_exit(&buf_pool.mutex);
+		rw_lock_t*		hash_lock;
+		const buf_page_t*	bpage;
+
+		bpage = buf_page_hash_get_s_locked(page_id_t(page_id.space(), i), &hash_lock);
+
+		if (bpage
+		    && buf_page_is_accessed(bpage)
+		    && buf_page_peek_if_young(bpage)
+		    && ++recent_blocks >= 5 + buf_pool.read_ahead_area / 8) {
+			rw_lock_s_unlock(hash_lock);
 			goto read_ahead;
-			}
+		}
+		if (bpage) {
+			rw_lock_s_unlock(hash_lock);
 		}
 	}
 
-	mutex_exit(&buf_pool.mutex);
 	/* Do nothing */
 	return(0);
 
@@ -362,10 +367,13 @@ read_ahead:
 	return(count);
 }
 
-/** High-level function which reads a page asynchronously from a file to the
+/** High-level function which reads a page from a file to the
 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
 an exclusive lock on the buffer frame. The flag is cleared and the x-lock
-released by the i/o-handler thread.
+released by the buf_page_io_complete function.
+We use synchronous reads here, because in this case the page is used
+right after reading.
+
 @param[in]	page_id		page id
 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
 @retval DB_SUCCESS if the page was read and is not corrupted,
@@ -378,12 +386,6 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
 	ulint count;
 	dberr_t err = DB_SUCCESS;
 
-	/* We do synchronous IO because our AIO completion code
-	is sub-optimal. See buf_page_io_complete(), we have to
-	acquire the buffer pool mutex before acquiring the block
-	mutex, required for updating the page state. The acquire
-	of the buffer pool mutex becomes an expensive bottleneck. */
-
 	count = buf_read_page_low(
 		&err, true, BUF_READ_ANY_PAGE,
 		page_id, zip_size, false);
@@ -480,6 +482,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
 	buf_page_t*	bpage;
 	buf_frame_t*	frame;
 	buf_page_t*	pred_bpage = NULL;
+	unsigned	pred_bpage_is_accessed = 0;
 	ulint		pred_offset;
 	ulint		succ_offset;
 	int		asc_or_desc;
@@ -539,11 +542,8 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
 		return(0);
 	}
 
-	mutex_enter(&buf_pool.mutex);
-
 	if (buf_pool.n_pend_reads
 	    > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
-		mutex_exit(&buf_pool.mutex);
 		return(0);
 	}
 
@@ -565,8 +565,12 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
 
 	fail_count = 0;
 
+	rw_lock_t*	hash_lock;
+
 	for (i = low; i < high; i++) {
-		bpage = buf_page_hash_get(page_id_t(page_id.space(), i));
+		bpage = buf_page_hash_get_s_locked(
+			page_id_t(page_id.space(),
+				  i), &hash_lock);
 
 		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
 			/* Not accessed */
@@ -583,7 +587,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
 			a little against this. */
 			int res = ut_ulint_cmp(
 				buf_page_is_accessed(bpage),
-				buf_page_is_accessed(pred_bpage));
+				pred_bpage_is_accessed);
 			/* Accesses not in the right order */
 			if (res != 0 && res != asc_or_desc) {
 				fail_count++;
@@ -592,22 +596,29 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
 
 		if (fail_count > threshold) {
 			/* Too many failures: return */
-			mutex_exit(&buf_pool.mutex);
+			if (bpage) {
+				rw_lock_s_unlock(hash_lock);
+			}
 			return(0);
 		}
 
-		if (bpage && buf_page_is_accessed(bpage)) {
-			pred_bpage = bpage;
+		if (bpage) {
+			if (buf_page_is_accessed(bpage)) {
+				pred_bpage = bpage;
+				pred_bpage_is_accessed
+					= buf_page_is_accessed(bpage);
+			}
+
+			rw_lock_s_unlock(hash_lock);
 		}
 	}
 
 	/* If we got this far, we know that enough pages in the area
 	have been accessed in the right order: linear read-ahead can
 	be sensible */
 
-	bpage = buf_page_hash_get(page_id);
+	bpage = buf_page_hash_get_s_locked( page_id, &hash_lock);
 
 	if (bpage == NULL) {
-		mutex_exit(&buf_pool.mutex);
 		return(0);
 	}
@@ -633,7 +644,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
 	pred_offset = fil_page_get_prev(frame);
 	succ_offset = fil_page_get_next(frame);
 
-	mutex_exit(&buf_pool.mutex);
+	rw_lock_s_unlock(hash_lock);
 
 	if ((page_id.page_no() == low)
 	    && (succ_offset == page_id.page_no() + 1)) {
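The discipline this patch adopts is worth seeing in isolation: look a page up under only the shared (S) mode of its page-hash rw-lock instead of the global buf_pool.mutex, copy out any field needed afterwards, and release the latch before doing anything else. That is exactly why the patch introduces pred_bpage_is_accessed: once the S-latch protecting pred_bpage is dropped, the block may be evicted and must not be dereferenced, so its "accessed" stamp is cached in a plain local. Below is a minimal standalone C++17 sketch of that pattern, not InnoDB code; PageHash, accessed_stamp(), looks_linear() and the failure threshold are illustrative stand-ins for buf_pool.page_hash, buf_page_is_accessed() and the linear read-ahead heuristic.

// Minimal sketch (not InnoDB source) of the locking pattern the patch
// moves to: lookups take only the shared mode of a hash rw-lock, and any
// field needed after the lookup is copied out before the lock is released.

#include <cstdint>
#include <iostream>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>

struct Page {
    uint32_t accessed = 0;  // 0 = never accessed, else a logical access time
};

class PageHash {
public:
    void put(uint64_t page_no, uint32_t accessed) {
        std::unique_lock<std::shared_mutex> x(latch_);  // like rw_lock_x_lock()
        map_[page_no].accessed = accessed;
    }

    // Analogue of buf_page_hash_get_s_locked() + buf_page_is_accessed() +
    // rw_lock_s_unlock(): look the page up under the shared latch and copy
    // the field out, so no Page* ever escapes the critical section.
    uint32_t accessed_stamp(uint64_t page_no) const {
        std::shared_lock<std::shared_mutex> s(latch_);  // like rw_lock_s_lock()
        auto it = map_.find(page_no);
        return it == map_.end() ? 0 : it->second.accessed;
    }

private:
    mutable std::shared_mutex latch_;
    std::unordered_map<uint64_t, Page> map_;
};

// Mirrors the shape of the patched buf_read_ahead_linear() loop: the
// predecessor's stamp lives in a local (the patch's pred_bpage_is_accessed),
// so comparing neighbours never requires holding two page latches at once,
// or any pointer to an unlatched page.
bool looks_linear(const PageHash& hash, uint64_t low, uint64_t high) {
    uint32_t pred_accessed = 0;
    uint64_t fail_count = 0;
    for (uint64_t i = low; i < high; i++) {
        const uint32_t accessed = hash.accessed_stamp(i);
        if (accessed == 0) {
            fail_count++;              // page absent or never accessed
        } else {
            if (pred_accessed != 0 && accessed < pred_accessed) {
                fail_count++;          // accessed, but in the wrong order
            }
            pred_accessed = accessed;  // copied out; valid after the S-latch is gone
        }
    }
    return fail_count <= (high - low) / 4;  // arbitrary threshold for the demo
}

int main() {
    PageHash hash;
    for (uint64_t i = 0; i < 8; i++) {
        hash.put(i, uint32_t(i + 1));  // ascending access times
    }
    std::cout << looks_linear(hash, 0, 8) << '\n';  // prints 1
}

The property mirrored from the patch is that accessed_stamp() never lets a Page pointer escape its critical section: callers compare plain integers, so the scan holds at most one bucket latch at a time and the global mutex disappears from the read-ahead path entirely.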