diff options
author | Marko Mäkelä <marko.makela@mariadb.com> | 2020-06-11 17:30:33 +0300 |
---|---|---|
committer | Marko Mäkelä <marko.makela@mariadb.com> | 2020-06-11 17:30:33 +0300 |
commit | 757e756d6eeddf124c11a7a82a649bc6425ca355 (patch) | |
tree | 926e1fd6c90183238c5622321485ef0780856423 | |
parent | 0af1b0bd213db385fa9073e0936d59f73bceddf6 (diff) | |
download | mariadb-git-757e756d6eeddf124c11a7a82a649bc6425ca355.tar.gz |
MDEV-22850 Reduce buf_pool.page_hash latch contention
For reads, the buf_pool.page_hash is protected by buf_pool.mutex or
by the hash_lock. There is no need to compute or acquire hash_lock
if we are not modifying the buf_pool.page_hash.
However, the buf_pool.page_hash latch must be held exclusively
when changing buf_page_t::in_file(), or if we desire to prevent
buf_page_t::can_relocate() or buf_page_t::buf_fix_count()
from changing.
rw_lock_lock_word_decr(): Add a comment that explains the polling logic.
buf_page_t::set_state(): When in_file() is to be changed, assert that
an exclusive buf_pool.page_hash latch is being held. Unfortunately
we cannot assert this for set_state(BUF_BLOCK_REMOVE_HASH) because
set_corrupt_id() may already have been called.
buf_LRU_free_page(): Check buf_page_t::can_relocate() before
acquiring the hash_lock.
buf_block_t::initialise(): Initialize also page.buf_fix_count().
buf_page_create(): Initialize buf_fix_count while not holding
any mutex or hash_lock. Acquire the hash_lock only for the
duration of inserting the block to the buf_pool.page_hash.
buf_LRU_old_init(), buf_LRU_add_block(),
buf_page_t::belongs_to_unzip_LRU(): Do not assert buf_page_t::in_file(),
because buf_page_create() will invoke buf_LRU_add_block()
before acquiring hash_lock and buf_page_t::set_state().
buf_pool_t::validate(): Rely on the buf_pool.mutex and do not
unnecessarily acquire any buf_pool.page_hash latches.
buf_page_init_for_read(): Clarify that we must acquire the hash_lock
upfront in order to prevent a race with buf_pool_t::watch_remove().
-rw-r--r-- | storage/innobase/buf/buf0buf.cc | 28 | ||||
-rw-r--r-- | storage/innobase/buf/buf0lru.cc | 12 | ||||
-rw-r--r-- | storage/innobase/buf/buf0rea.cc | 8 | ||||
-rw-r--r-- | storage/innobase/include/buf0buf.h | 42 | ||||
-rw-r--r-- | storage/innobase/include/sync0rw.ic | 23 |
5 files changed, 88 insertions, 25 deletions
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index e17460a8cee..47b4eac0ed2 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -3776,15 +3776,16 @@ buf_page_try_get_func( } /** Initialize the block. -@param page_id page id -@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ -void buf_block_t::initialise(const page_id_t page_id, ulint zip_size) +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param fix initial buf_fix_count() */ +void buf_block_t::initialise(const page_id_t page_id, ulint zip_size, + uint32_t fix) { ut_ad(page.state() != BUF_BLOCK_FILE_PAGE); buf_block_init_low(this); lock_hash_val= lock_rec_hash(page_id.space(), page_id.page_no()); - page.init(); - page.id_= page_id; + page.init(page_id, fix); page_zip_set_size(&page.zip, zip_size); } @@ -3803,11 +3804,9 @@ buf_page_create(const page_id_t page_id, ulint zip_size, mtr_t *mtr) ut_ad(page_id.space() != 0 || !zip_size); buf_block_t *free_block= buf_LRU_get_free_block(false); - free_block->initialise(page_id, zip_size); + free_block->initialise(page_id, zip_size, 1); - rw_lock_t *hash_lock= buf_pool.hash_lock_get(page_id); mutex_enter(&buf_pool.mutex); - rw_lock_x_lock(hash_lock); buf_block_t *block= reinterpret_cast<buf_block_t*> (buf_pool.page_hash_get_low(page_id)); @@ -3816,7 +3815,6 @@ buf_page_create(const page_id_t page_id, ulint zip_size, mtr_t *mtr) !buf_pool.watch_is_sentinel(block->page)) { /* Page can be found in buf_pool */ - rw_lock_x_unlock(hash_lock); buf_LRU_block_free_non_file_page(free_block); mutex_exit(&buf_pool.mutex); @@ -3846,11 +3844,17 @@ buf_page_create(const page_id_t page_id, ulint zip_size, mtr_t *mtr) page_id.space(), page_id.page_no())); block= free_block; - buf_block_buf_fix_inc(block, __FILE__, __LINE__); + + /* Duplicate buf_block_buf_fix_inc_func() */ + ut_ad(block->page.buf_fix_count() == 1); + ut_ad(fsp_is_system_temporary(page_id.space()) || + 
rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__)); /* The block must be put to the LRU list */ - block->page.set_state(BUF_BLOCK_FILE_PAGE); buf_LRU_add_block(&block->page, false); + rw_lock_t *hash_lock= buf_pool.hash_lock_get(page_id); + rw_lock_x_lock(hash_lock); + block->page.set_state(BUF_BLOCK_FILE_PAGE); ut_d(block->page.in_page_hash= true); HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, page_id.fold(), &block->page); @@ -4401,7 +4405,6 @@ void buf_pool_t::validate() ulint n_zip = 0; mutex_enter(&mutex); - page_hash_lock_all(); chunk_t* chunk = chunks; @@ -4492,7 +4495,6 @@ void buf_pool_t::validate() ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing); - page_hash_unlock_all(); mutex_exit(&flush_list_mutex); if (curr_size == old_size diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 293aba71320..885397f2f6c 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -940,7 +940,6 @@ static void buf_LRU_old_init() bpage = UT_LIST_GET_PREV(LRU, bpage)) { ut_ad(bpage->in_LRU_list); - ut_ad(bpage->in_file()); /* This loop temporarily violates the assertions of buf_page_t::set_old(). */ @@ -1066,7 +1065,6 @@ buf_LRU_add_block( the start, regardless of this parameter */ { ut_ad(mutex_own(&buf_pool.mutex)); - ut_a(bpage->in_file()); ut_ad(!bpage->in_LRU_list); if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) { @@ -1153,10 +1151,18 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ut_ad(bpage->in_file()); ut_ad(bpage->in_LRU_list); + /* First, perform a quick check before we acquire hash_lock. */ + if (!bpage->can_relocate()) { + return false; + } + + /* We must hold an exclusive hash_lock to prevent + bpage->can_relocate() from changing due to a concurrent + execution of buf_page_get_low(). 
*/ rw_lock_t* hash_lock = buf_pool.hash_lock_get(id); rw_lock_x_lock(hash_lock); - if (!bpage->can_relocate()) { + if (UNIV_UNLIKELY(!bpage->can_relocate())) { /* Do not free buffer fixed and I/O-fixed blocks. */ goto func_exit; } diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index e0322dee2da..9c3b347a236 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -121,6 +121,8 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id, mutex_enter(&buf_pool.mutex); + /* We must acquire hash_lock this early to prevent + a race condition with buf_pool_t::watch_remove() */ rw_lock_t *hash_lock= buf_pool.hash_lock_get(page_id); rw_lock_x_lock(hash_lock); @@ -151,8 +153,8 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id, buf_pool.watch_remove(hash_page); } - block->page.set_state(BUF_BLOCK_FILE_PAGE); block->page.set_io_fix(BUF_IO_READ); + block->page.set_state(BUF_BLOCK_FILE_PAGE); ut_ad(!block->page.in_page_hash); ut_d(block->page.in_page_hash= true); HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, page_id.fold(), bpage); @@ -202,8 +204,8 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id, { /* The block was added by some other thread. 
*/ rw_lock_x_unlock(hash_lock); - buf_buddy_free(data, zip_size); - goto func_exit; + buf_buddy_free(data, zip_size); + goto func_exit; } } diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 522441259dd..825eb7631fe 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -995,6 +995,14 @@ public: buf_fix_count_= buf_fix_count; } + /** Initialize some more fields */ + void init(page_id_t id, uint32_t buf_fix_count= 0) + { + init(); + id_= id; + buf_fix_count_= buf_fix_count; + } + public: const page_id_t &id() const { return id_; } buf_page_state state() const { return state_; } @@ -1010,8 +1018,7 @@ public: /** @return if this belongs to buf_pool.unzip_LRU */ bool belongs_to_unzip_LRU() const { - ut_ad(in_file()); - return zip.data && state() == BUF_BLOCK_FILE_PAGE; + return zip.data && state() != BUF_BLOCK_ZIP_PAGE; } inline void add_buf_fix_count(uint32_t count); @@ -1277,9 +1284,10 @@ struct buf_block_t{ ulint zip_size() const { return page.zip_size(); } /** Initialize the block. - @param page_id page id - @param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ - void initialise(const page_id_t page_id, ulint zip_size); + @param page_id page identifier + @param zip_size ROW_FORMAT=COMPRESSED page size, or 0 + @param fix initial buf_fix_count() */ + void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0); }; /**********************************************************************//** @@ -2134,6 +2142,29 @@ inline void buf_page_t::set_buf_fix_count(uint32_t count) inline void buf_page_t::set_state(buf_page_state state) { ut_ad(mutex_own(&buf_pool.mutex)); +#ifdef UNIV_DEBUG + switch (state) { + case BUF_BLOCK_REMOVE_HASH: + /* buf_pool_t::corrupted_evict() invokes set_corrupt_id() + before buf_LRU_free_one_page(), so we cannot assert that + we are holding the hash_lock. 
*/ + break; + case BUF_BLOCK_MEMORY: + if (!in_file()) break; + /* fall through */ + case BUF_BLOCK_FILE_PAGE: + ut_ad(rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X)); + break; + case BUF_BLOCK_NOT_USED: + if (!in_file()) break; + /* fall through */ + case BUF_BLOCK_ZIP_PAGE: + ut_ad((this >= &buf_pool.watch[0] && + this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]) || + rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X)); + break; + } +#endif state_= state; } @@ -2212,7 +2243,6 @@ inline bool buf_page_t::is_old() const /** Set whether a block is old in buf_pool.LRU */ inline void buf_page_t::set_old(bool old) { - ut_ad(in_file()); ut_ad(mutex_own(&buf_pool.mutex)); ut_ad(in_LRU_list); diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic index 18a6c69ece4..7fcac01e5ba 100644 --- a/storage/innobase/include/sync0rw.ic +++ b/storage/innobase/include/sync0rw.ic @@ -220,8 +220,31 @@ rw_lock_lock_word_decr( return(true); } + + /* Note that lock_copy was reloaded above. We will + keep trying if a spurious conflict occurred, typically + caused by concurrent executions of + rw_lock_s_lock(). */ + +#if 1 /* FIXME: MDEV-22871 Spurious contention between rw_lock_s_lock() */ + + /* When the number of concurrently executing threads + exceeds the number of available processor cores, + multiple buf_pool.page_hash S-latch requests would + conflict here, mostly in buf_page_get_low(). We should + implement a simpler rw-lock where the S-latch + acquisition would be a simple fetch_add(1) followed by + either an optional load() loop to wait for the X-latch + to be released, or a fetch_sub(1) and a retry. + + For now, we work around the problem with a delay in + this loop. It helped a little on some systems and was + reducing performance on others. */ (void) LF_BACKOFF(); +#endif } + + /* A real conflict was detected. */ return(false); } |