summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Marko Mäkelä <marko.makela@mariadb.com> 2021-09-09 09:05:26 +0300
committer Marko Mäkelä <marko.makela@mariadb.com> 2021-09-09 09:05:26 +0300
commit 6e3bd663d0fc669ffa6ab4c5f5aa57a1244ac555 (patch)
tree 2d275b7bedafefeeed71b2d77afbe2bb16dcaad9
parent eb2f2c1e5f5989a225154f6a7eb3487fb722f65e (diff)
download mariadb-git-bb-10.5-MDEV-25776.tar.gz
MDEV-25776 Race conditions in buf_pool.page_hash around buf_pool.watch (branch bb-10.5-MDEV-25776)
Any modification of buf_pool.page_hash is supposed to be protected by buf_pool.mutex and page_hash_latch::write_lock(). The buffer pool watch mechanism of the InnoDB change buffer was violating that ever since commit b1ab211dee599eabd9a5b886fafa3adea29ae041 (MDEV-15053).

buf_pool_t::watch_set(): Extend the critical section of buf_pool.mutex.

buf_pool_t::watch_unset(): Define non-inline, because calls are infrequent and this function became larger. If we have to detach a sentinel from page_hash, do it while holding both the mutex and the exclusive hash latch.

buf_pool_t::watch_remove(): Assert that the mutex is being held.

buf_page_init_for_read(): Remove some work-arounds for previously encountered race conditions related to buf_pool.watch.
-rw-r--r-- storage/innobase/buf/buf0buf.cc 50
-rw-r--r-- storage/innobase/buf/buf0rea.cc 18
-rw-r--r-- storage/innobase/include/buf0buf.h 28
3 files changed, 53 insertions, 43 deletions
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index f2f443dd4b0..113da7746fa 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -2393,14 +2393,10 @@ retry:
w->id_= id;
*hash_lock= page_hash.lock_get(fold);
- (*hash_lock)->write_lock();
- mysql_mutex_unlock(&mutex);
buf_page_t *bpage= page_hash_get_low(id, fold);
if (UNIV_LIKELY_NULL(bpage))
{
- (*hash_lock)->write_unlock();
- mysql_mutex_lock(&mutex);
w->set_state(BUF_BLOCK_NOT_USED);
*hash_lock= page_hash.lock_get(fold);
(*hash_lock)->write_lock();
@@ -2408,11 +2404,13 @@ retry:
goto retry;
}
+ (*hash_lock)->write_lock();
ut_ad(!w->buf_fix_count_);
w->buf_fix_count_= 1;
ut_ad(!w->in_page_hash);
- ut_d(w->in_page_hash= true); /* Not holding buf_pool.mutex here! */
+ ut_d(w->in_page_hash= true);
HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+ mysql_mutex_unlock(&mutex);
return nullptr;
}
@@ -2421,6 +2419,48 @@ retry:
return nullptr;
}
+/** Stop watching whether a page has been read in.
+watch_set(id) must have returned nullptr before.
+@param id page identifier */
+void buf_pool_t::watch_unset(const page_id_t id)
+{
+ mysql_mutex_assert_not_owner(&mutex);
+ const ulint fold= id.fold();
+ page_hash_latch *hash_lock= page_hash.lock<true>(fold);
+ /* The page must exist because watch_set() increments buf_fix_count. */
+ buf_page_t *w= page_hash_get_low(id, fold);
+ const auto buf_fix_count= w->buf_fix_count();
+ ut_ad(buf_fix_count);
+ const bool must_remove= buf_fix_count == 1 && watch_is_sentinel(*w);
+ ut_ad(w->in_page_hash);
+ if (!must_remove)
+ w->unfix();
+ hash_lock->write_unlock();
+
+ if (must_remove)
+ {
+ const auto old= w;
+ /* The following is based on buf_pool_t::watch_remove(). */
+ mysql_mutex_lock(&mutex);
+ w= page_hash_get_low(id, fold);
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+ if (w->unfix() == 0 && w == old)
+ {
+ ut_ad(w->in_page_hash);
+ ut_d(w->in_page_hash= false);
+ HASH_DELETE(buf_page_t, hash, &page_hash, fold, w);
+ // Now that the watch is detached from page_hash, release it to watch[].
+ ut_ad(w->id_ == id);
+ ut_ad(!w->buf_fix_count());
+ ut_ad(w->state() == BUF_BLOCK_ZIP_PAGE);
+ w->set_state(BUF_BLOCK_NOT_USED);
+ }
+ hash_lock->write_unlock();
+ mysql_mutex_unlock(&mutex);
+ }
+}
+
/** Mark the page status as FREED for the given tablespace id and
page number. If the page is not in buffer pool then ignore it.
@param[in,out] space tablespace
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 2303ef625e7..7bc16f4918f 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -53,6 +53,7 @@ that the block has been replaced with the real block.
@param watch sentinel */
inline void buf_pool_t::watch_remove(buf_page_t *watch)
{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(hash_lock_get(watch->id())->is_write_locked());
ut_a(watch_is_sentinel(*watch));
if (watch->buf_fix_count())
@@ -123,16 +124,10 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
mysql_mutex_lock(&buf_pool.mutex);
- /* We must acquire hash_lock this early to prevent
- a race condition with buf_pool_t::watch_remove() */
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
-
buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
{
/* The page is already in the buffer pool. */
- hash_lock->write_unlock();
if (block)
{
rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
@@ -146,6 +141,9 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
bpage= &block->page;
/* Insert into the hash table of file pages */
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+
if (hash_page)
{
/* Preserve the reference count. */
@@ -184,8 +182,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
}
else
{
- hash_lock->write_unlock();
-
/* The compressed page must be allocated before the
control block (bpage), in order to avoid the
invocation of buf_buddy_relocate_block() on
@@ -193,8 +189,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
bool lru= false;
void *data= buf_buddy_alloc(zip_size, &lru);
- hash_lock->write_lock();
-
/* If buf_buddy_alloc() allocated storage from the LRU list,
it released and reacquired buf_pool.mutex. Thus, we must
check the page_hash again, as it may have been modified. */
@@ -205,7 +199,6 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
{
/* The block was added by some other thread. */
- hash_lock->write_unlock();
buf_buddy_free(data, zip_size);
goto func_exit;
}
@@ -219,6 +212,9 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);
+ page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
+ hash_lock->write_lock();
+
if (hash_page)
{
/* Preserve the reference count. It can be 0 if
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 5a118df48e1..d1928196989 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -1739,33 +1739,7 @@ public:
/** Stop watching whether a page has been read in.
watch_set(id) must have returned nullptr before.
@param id page identifier */
- void watch_unset(const page_id_t id)
- {
- const ulint fold= id.fold();
- page_hash_latch *hash_lock= page_hash.lock<true>(fold);
- /* The page must exist because watch_set() increments buf_fix_count. */
- buf_page_t *watch= page_hash_get_low(id, fold);
- if (watch->unfix() == 0 && watch_is_sentinel(*watch))
- {
- /* The following is based on watch_remove(). */
- ut_ad(watch->in_page_hash);
- ut_d(watch->in_page_hash= false);
- HASH_DELETE(buf_page_t, hash, &page_hash, fold, watch);
- hash_lock->write_unlock();
- // Now that the watch is detached from page_hash, release it to watch[].
- mysql_mutex_lock(&mutex);
- /* It is possible that watch_remove() already removed the watch. */
- if (watch->id_ == id)
- {
- ut_ad(!watch->buf_fix_count());
- ut_ad(watch->state() == BUF_BLOCK_ZIP_PAGE);
- watch->set_state(BUF_BLOCK_NOT_USED);
- }
- mysql_mutex_unlock(&mutex);
- }
- else
- hash_lock->write_unlock();
- }
+ void watch_unset(const page_id_t id);
/** Remove the sentinel block for the watch before replacing it with a
real block. watch_unset() or watch_occurred() will notice