-rw-r--r--  storage/innobase/btr/btr0btr.cc | 1
-rw-r--r--  storage/innobase/btr/btr0bulk.cc | 3
-rw-r--r--  storage/innobase/btr/btr0cur.cc | 33
-rw-r--r--  storage/innobase/btr/btr0sea.cc | 50
-rw-r--r--  storage/innobase/buf/buf0block_hint.cc | 11
-rw-r--r--  storage/innobase/buf/buf0buddy.cc | 18
-rw-r--r--  storage/innobase/buf/buf0buf.cc | 752
-rw-r--r--  storage/innobase/buf/buf0dblwr.cc | 3
-rw-r--r--  storage/innobase/buf/buf0flu.cc | 317
-rw-r--r--  storage/innobase/buf/buf0lru.cc | 74
-rw-r--r--  storage/innobase/buf/buf0rea.cc | 138
-rw-r--r--  storage/innobase/dict/dict0crea.cc | 2
-rw-r--r--  storage/innobase/dict/dict0dict.cc | 2
-rw-r--r--  storage/innobase/fsp/fsp0fsp.cc | 4
-rw-r--r--  storage/innobase/gis/gis0sea.cc | 3
-rw-r--r--  storage/innobase/handler/ha_innodb.cc | 16
-rw-r--r--  storage/innobase/ibuf/ibuf0ibuf.cc | 9
-rw-r--r--  storage/innobase/include/btr0sea.h | 12
-rw-r--r--  storage/innobase/include/buf0buf.h | 374
-rw-r--r--  storage/innobase/include/buf0buf.ic | 67
-rw-r--r--  storage/innobase/include/buf0flu.h | 20
-rw-r--r--  storage/innobase/include/buf0lru.h | 17
-rw-r--r--  storage/innobase/include/buf0types.h | 69
-rw-r--r--  storage/innobase/include/dict0mem.h | 8
-rw-r--r--  storage/innobase/include/hash0hash.h | 12
-rw-r--r--  storage/innobase/include/lock0lock.h | 188
-rw-r--r--  storage/innobase/include/lock0priv.h | 7
-rw-r--r--  storage/innobase/include/lock0priv.ic | 7
-rw-r--r--  storage/innobase/include/os0file.h | 6
-rw-r--r--  storage/innobase/include/rw_lock.h | 18
-rw-r--r--  storage/innobase/include/srv0srv.h | 2
-rw-r--r--  storage/innobase/include/srw_lock.h | 74
-rw-r--r--  storage/innobase/include/transactional_lock_guard.h | 167
-rw-r--r--  storage/innobase/include/trx0trx.h | 3
-rw-r--r--  storage/innobase/include/ut0new.h | 2
-rw-r--r--  storage/innobase/lock/lock0lock.cc | 301
-rw-r--r--  storage/innobase/log/log0log.cc | 11
-rw-r--r--  storage/innobase/log/log0recv.cc | 7
-rw-r--r--  storage/innobase/mtr/mtr0mtr.cc | 2
-rw-r--r--  storage/innobase/os/os0file.cc | 6
-rw-r--r--  storage/innobase/row/row0import.cc | 2
-rw-r--r--  storage/innobase/row/row0ins.cc | 3
-rw-r--r--  storage/innobase/srv/srv0srv.cc | 19
-rw-r--r--  storage/innobase/srv/srv0start.cc | 145
-rw-r--r--  storage/innobase/sync/srw_lock.cc | 71
-rw-r--r--  storage/innobase/trx/trx0purge.cc | 56
-rw-r--r--  storage/innobase/trx/trx0rec.cc | 22
-rw-r--r--  storage/innobase/trx/trx0trx.cc | 23
48 files changed, 1827 insertions, 1330 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 02aa89361c5..e02df95b641 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -1069,6 +1069,7 @@ top_loop:
/** Clear the index tree and reinitialize the root page, in the
rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
@param thr query thread */
+TRANSACTIONAL_TARGET
void dict_index_t::clear(que_thr_t *thr)
{
mtr_t mtr;
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
index 13e0e3a0eff..046291158a8 100644
--- a/storage/innobase/btr/btr0bulk.cc
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -856,9 +856,10 @@ PageBulk::latch()
ut_ad(m_block->page.buf_fix_count());
- /* In case the block is S-latched by page_cleaner. */
+ /* In case the block is U-latched by page_cleaner. */
if (!buf_page_optimistic_get(RW_X_LATCH, m_block, m_modify_clock,
&m_mtr)) {
+ /* FIXME: avoid another lookup */
m_block = buf_page_get_gen(page_id_t(m_index->table->space_id,
m_page_no),
0, RW_X_LATCH,
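
A side note on the m_modify_clock argument in the hunk above: buf_page_optimistic_get() succeeds only when the block's modify clock still matches the value that was sampled while the page was previously latched; if it does not, the code falls back to a regular buf_page_get_gen() lookup (the FIXME about the extra lookup). The following toy sketch illustrates that version-counter idea; the names and types are made up for the example and are not InnoDB's actual members.

#include <cstdint>

/* A block exposes a version counter that is bumped whenever the page
may have been evicted, relocated or otherwise restructured. */
struct block_stub
{
  uint64_t modify_clock= 0;
  void invalidate() { ++modify_clock; }    /* done under exclusive latch */
};

/* Sample the clock while the page is latched and buffer-fixed... */
inline uint64_t save_modify_clock(const block_stub &b)
{ return b.modify_clock; }

/* ...and later, after re-latching, trust the cached pointer only if
nothing invalidated the block in between. */
inline bool optimistic_relatch_ok(const block_stub &b, uint64_t saved)
{ return b.modify_clock == saved; }
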
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index 32bdf9a8a51..435b62a7493 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -1214,6 +1214,7 @@ If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the
search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
+TRANSACTIONAL_TARGET
dberr_t
btr_cur_search_to_nth_level_func(
dict_index_t* index, /*!< in: index */
@@ -1630,6 +1631,9 @@ retry_page_get:
ut_ad(cursor->thr);
switch (btr_op) {
+ default:
+ ut_error;
+ break;
case BTR_INSERT_OP:
case BTR_INSERT_IGNORE_UNIQUE_OP:
ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
@@ -1662,6 +1666,8 @@ retry_page_get:
case BTR_DELETE_OP:
ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
ut_ad(!dict_index_is_spatial(index));
+ auto& chain = buf_pool.page_hash.cell_get(
+ page_id.fold());
if (!row_purge_poss_sec(cursor->purge_node,
index, tuple)) {
@@ -1676,15 +1682,12 @@ retry_page_get:
cursor->flag = BTR_CUR_DELETE_IBUF;
} else {
/* The purge could not be buffered. */
- buf_pool.watch_unset(page_id);
+ buf_pool.watch_unset(page_id, chain);
break;
}
- buf_pool.watch_unset(page_id);
+ buf_pool.watch_unset(page_id, chain);
goto func_exit;
-
- default:
- ut_error;
}
/* Insert to the insert/delete buffer did not succeed, we
@@ -1992,16 +1995,15 @@ retry_page_get:
&& mode != PAGE_CUR_RTREE_INSERT
&& mode != PAGE_CUR_RTREE_LOCATE
&& mode >= PAGE_CUR_CONTAIN) {
- trx_t* trx = thr_get_trx(cursor->thr);
lock_prdt_t prdt;
- lock_sys.rd_lock(SRW_LOCK_CALL);
- trx->mutex_lock();
- lock_init_prdt_from_mbr(
- &prdt, &cursor->rtr_info->mbr, mode,
- trx->lock.lock_heap);
- lock_sys.rd_unlock();
- trx->mutex_unlock();
+ {
+ trx_t* trx = thr_get_trx(cursor->thr);
+ TMLockTrxGuard g{TMLockTrxArgs(*trx)};
+ lock_init_prdt_from_mbr(
+ &prdt, &cursor->rtr_info->mbr, mode,
+ trx->lock.lock_heap);
+ }
if (rw_latch == RW_NO_LATCH && height != 0) {
block->lock.s_lock();
@@ -6743,11 +6745,10 @@ static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr)
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
mtr->commit();
- const ulint fold= page_id.fold();
-
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
mysql_mutex_lock(&buf_pool.mutex);
- if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
+ if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain))
if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data)
/* Attempt to deallocate the redundant copy of the uncompressed page
if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
index b95fbbe694a..a59a54676ed 100644
--- a/storage/innobase/btr/btr0sea.cc
+++ b/storage/innobase/btr/btr0sea.cc
@@ -1007,6 +1007,7 @@ both have sensible values.
or NULL
@param[in] mtr mini transaction
@return whether the search succeeded */
+TRANSACTIONAL_TARGET
bool
btr_search_guess_on_hash(
dict_index_t* index,
@@ -1090,26 +1091,34 @@ fail:
buf_block_t* block = buf_pool.block_from_ahi(rec);
if (!ahi_latch) {
- page_hash_latch* hash_lock = buf_pool.hash_lock_get(
- block->page.id());
- hash_lock->read_lock();
-
- if (block->page.state() == BUF_BLOCK_REMOVE_HASH) {
- /* Another thread is just freeing the block
- from the LRU list of the buffer pool: do not
- try to access this page. */
- hash_lock->read_unlock();
- goto fail;
+ buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get(
+ block->page.id().fold());
+ bool fail;
+ {
+ transactional_shared_lock_guard<page_hash_latch> g{
+ buf_pool.page_hash.lock_get(chain)};
+
+ switch (block->page.state()) {
+ case BUF_BLOCK_REMOVE_HASH:
+ /* Another thread is just freeing the block
+ from the LRU list of the buffer pool: do not
+ try to access this page. */
+ goto fail;
+ case BUF_BLOCK_FILE_PAGE:
+ break;
+ default:
+#ifndef NO_ELISION
+ xend();
+#endif
+ ut_error;
+ }
+
+ block->fix();
+ fail = index != block->index
+ && index_id == block->index->id;
}
- const bool fail = index != block->index
- && index_id == block->index->id;
ut_a(!fail || block->index->freed());
- ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
- DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED);
-
- buf_block_buf_fix_inc(block);
- hash_lock->read_unlock();
block->page.set_accessed();
buf_page_make_young_if_needed(&block->page);
@@ -1136,6 +1145,8 @@ got_no_latch:
if (UNIV_UNLIKELY(fail)) {
goto fail_and_release_page;
}
+
+ DBUG_ASSERT(block->page.status != buf_page_t::FREED);
} else if (UNIV_UNLIKELY(index != block->index
&& index_id == block->index->id)) {
ut_a(block->index->freed());
@@ -2209,8 +2220,9 @@ btr_search_hash_table_validate(ulint hash_table_id)
assertion and the comment below) */
const page_id_t id(block->page.id());
if (const buf_page_t* hash_page
- = buf_pool.page_hash_get_low(
- id, id.fold())) {
+ = buf_pool.page_hash.get(
+ id, buf_pool.page_hash.cell_get(
+ id.fold()))) {
ut_ad(hash_page == &block->page);
goto state_ok;
}
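
The hunks above are the first call sites of transactional_shared_lock_guard and of the xend() fallback; together with include/transactional_lock_guard.h in the diffstat, they elide the page_hash latch inside a hardware memory transaction when the CPU supports it and fall back to a plain shared lock otherwise. As orientation only, a minimal lock-elision guard along those lines could look like the sketch below; the Latch interface (is_locked(), lock_shared(), unlock_shared()), the __RTM__ gating and the use of raw intrinsics are assumptions for the sketch, not the actual MariaDB implementation, which wraps the intrinsics and adds retry logic.

#ifdef __RTM__
# include <immintrin.h>   /* _xbegin(), _xend(), _xabort() */
#endif

template<typename Latch>
class elided_shared_guard
{
  Latch &latch;
  bool elided= false;
public:
  explicit elided_shared_guard(Latch &l) : latch(l)
  {
#ifdef __RTM__
    if (_xbegin() == _XBEGIN_STARTED)
    {
      /* Reading the latch word adds it to the transaction's read set:
      a concurrent writer aborts this transaction instead of racing. */
      if (!l.is_locked())
      {
        elided= true;
        return;
      }
      _xabort(0);   /* latch is held: abort and fall back to locking */
    }
#endif
    latch.lock_shared();
  }
  ~elided_shared_guard()
  {
#ifdef __RTM__
    if (elided)
    {
      _xend();      /* commit the memory transaction */
      return;
    }
#endif
    latch.unlock_shared();
  }
};

The explicit xend() before ut_error in the switch above fits the same model: presumably a fatal error must not be raised while a memory transaction is still open, because the transaction would simply be rolled back.
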
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
index f9581bc7b5d..00c968511b3 100644
--- a/storage/innobase/buf/buf0block_hint.cc
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2020, MariaDB Corporation.
+Copyright (c) 2020, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License, version 2.0, as published by the
@@ -28,6 +28,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "buf0block_hint.h"
namespace buf {
+TRANSACTIONAL_TARGET
void Block_hint::buffer_fix_block_if_still_valid()
{
/* To check if m_block belongs to the current buf_pool, we must
@@ -46,14 +47,14 @@ void Block_hint::buffer_fix_block_if_still_valid()
validate m_block->state() to ensure that the block is not being freed. */
if (m_block)
{
- const ulint fold= m_page_id.fold();
- page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+ auto &cell= buf_pool.page_hash.cell_get(m_page_id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(cell)};
if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
m_block->page.state() == BUF_BLOCK_FILE_PAGE)
- buf_block_buf_fix_inc(m_block);
+ m_block->fix();
else
clear();
- hash_lock->read_unlock();
}
}
} // namespace buf
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
index f822adc3389..6f4b4554518 100644
--- a/storage/innobase/buf/buf0buddy.cc
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -499,9 +499,10 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
ut_ad(space != BUF_BUDDY_STAMP_FREE);
const page_id_t page_id(space, offset);
- const ulint fold= page_id.fold();
+ /* FIXME: we are computing this while holding buf_pool.mutex */
+ auto &cell= buf_pool.page_hash.cell_get(page_id.fold());
- bpage = buf_pool.page_hash_get_low(page_id, fold);
+ bpage = buf_pool.page_hash.get(page_id, cell);
if (!bpage || bpage->zip.data != src) {
/* The block has probably been freshly
@@ -546,8 +547,11 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
return false;
}
- page_hash_latch *hash_lock = buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
+ page_hash_latch &hash_lock = buf_pool.page_hash.lock_get(cell);
+ /* It does not make sense to use transactional_lock_guard here,
+ because the memcpy() of 1024 to 16384 bytes would likely make the
+ memory transaction too large. */
+ hash_lock.lock();
if (bpage->can_relocate()) {
/* Relocate the compressed page. */
@@ -558,7 +562,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
memcpy(dst, src, size);
bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
- hash_lock->write_unlock();
+ hash_lock.unlock();
buf_buddy_mem_invalid(
reinterpret_cast<buf_buddy_free_t*>(src), i);
@@ -569,7 +573,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
return(true);
}
- hash_lock->write_unlock();
+ hash_lock.unlock();
return(false);
}
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 4ec6a61ccb9..4fb0ac37309 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -277,6 +277,7 @@ the read requests for the whole area.
*/
#ifndef UNIV_INNOCHECKSUM
+# ifdef SUX_LOCK_GENERIC
void page_hash_latch::read_lock_wait()
{
/* First, try busy spinning for a while. */
@@ -309,6 +310,7 @@ void page_hash_latch::write_lock_wait()
std::this_thread::yield();
while (!write_lock_poll());
}
+# endif
constexpr std::chrono::microseconds WAIT_FOR_READ(100);
constexpr int WAIT_FOR_WRITE= 100;
@@ -1145,7 +1147,7 @@ void buf_pool_t::page_hash_table::create(ulint n)
const size_t size= pad(n_cells) * sizeof *array;
void* v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
memset(v, 0, size);
- array= static_cast<hash_cell_t*>(v);
+ array= static_cast<hash_chain*>(v);
}
/** Create the buffer pool.
@@ -1334,9 +1336,14 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
return(false); /* free list was not enough */
}
- const page_id_t id(block->page.id());
- page_hash_latch* hash_lock = hash_lock_get(id);
- hash_lock->write_lock();
+ const page_id_t id{block->page.id()};
+ hash_chain& chain = page_hash.cell_get(id.fold());
+ page_hash_latch& hash_lock = page_hash.lock_get(chain);
+ /* It does not make sense to use transactional_lock_guard
+ here, because copying innodb_page_size (4096 to 65536) bytes
+ as well as other changes would likely make the memory
+ transaction too large. */
+ hash_lock.lock();
if (block->page.can_relocate()) {
memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
@@ -1380,14 +1387,10 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
}
/* relocate page_hash */
- ut_ad(block->page.in_page_hash);
- ut_ad(new_block->page.in_page_hash);
- const ulint fold = id.fold();
- ut_ad(&block->page == page_hash_get_low(id, fold));
- ut_d(block->page.in_page_hash = false);
- HASH_REPLACE(buf_page_t, hash, &page_hash, fold,
- &block->page, &new_block->page);
-
+ hash_chain& chain = page_hash.cell_get(id.fold());
+ ut_ad(&block->page == page_hash.get(id, chain));
+ buf_pool.page_hash.replace(chain, &block->page,
+ &new_block->page);
buf_block_modify_clock_inc(block);
static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
@@ -1422,7 +1425,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
new_block = block;
}
- hash_lock->write_unlock();
+ hash_lock.unlock();
buf_LRU_block_free_non_file_page(new_block);
return(true); /* free_list was enough */
}
@@ -1505,7 +1508,7 @@ inline bool buf_pool_t::withdraw_blocks()
std::max<ulint>(withdraw_target
- UT_LIST_GET_LEN(withdraw),
srv_LRU_scan_depth));
- buf_flush_wait_batch_end_acquiring_mutex(true);
+ buf_flush_wait_LRU_batch_end_acquiring_mutex();
}
/* relocate blocks/buddies in withdrawn area */
@@ -1597,7 +1600,7 @@ inline void buf_pool_t::page_hash_table::write_lock_all()
{
for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
{
- reinterpret_cast<page_hash_latch&>(array[n]).write_lock();
+ reinterpret_cast<page_hash_latch&>(array[n]).lock();
if (!n)
break;
}
@@ -1608,7 +1611,7 @@ inline void buf_pool_t::page_hash_table::write_unlock_all()
{
for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
{
- reinterpret_cast<page_hash_latch&>(array[n]).write_unlock();
+ reinterpret_cast<page_hash_latch&>(array[n]).unlock();
if (!n)
break;
}
@@ -1743,6 +1746,8 @@ withdraw_retry:
{found, withdraw_started, my_hrtime_coarse()};
withdraw_started = current_time;
+ /* This is going to exceed the maximum size of a
+ memory transaction. */
LockMutexGuard g{SRW_LOCK_CALL};
trx_sys.trx_list.for_each(f);
}
@@ -2047,13 +2052,14 @@ The caller must relocate bpage->list.
@param dpage destination control block */
static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
{
- const ulint fold= bpage->id().fold();
+ const page_id_t id= bpage->id();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
+ ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked());
ut_a(bpage->io_fix() == BUF_IO_NONE);
ut_a(!bpage->buf_fix_count());
- ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold));
+ ut_ad(bpage == buf_pool.page_hash.get(id, chain));
ut_ad(!buf_pool.watch_is_sentinel(*bpage));
ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
@@ -2088,29 +2094,24 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
ut_d(CheckInLRUList::validate());
- /* relocate buf_pool.page_hash */
- ut_ad(bpage->in_page_hash);
- ut_ad(dpage->in_page_hash);
- ut_d(bpage->in_page_hash= false);
- HASH_REPLACE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage, dpage);
+ buf_pool.page_hash.replace(chain, bpage, dpage);
}
/** Register a watch for a page identifier. The caller must hold an
exclusive page hash latch. The *hash_lock may be released,
relocated, and reacquired.
@param id page identifier
-@param hash_lock exclusively held page_hash latch
+@param chain hash table chain with exclusively held page_hash
@return a buffer pool block corresponding to id
@retval nullptr if the block was not present, and a watch was installed */
inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
- page_hash_latch **hash_lock)
+ buf_pool_t::hash_chain &chain)
{
- const ulint fold= id.fold();
- ut_ad(*hash_lock == page_hash.lock_get(fold));
- ut_ad((*hash_lock)->is_write_locked());
+ ut_ad(&chain == &page_hash.cell_get(id.fold()));
+ ut_ad(page_hash.lock_get(chain).is_write_locked());
retry:
- if (buf_page_t *bpage= page_hash_get_low(id, fold))
+ if (buf_page_t *bpage= page_hash.get(id, chain))
{
if (!watch_is_sentinel(*bpage))
/* The page was loaded meanwhile. */
@@ -2120,7 +2121,7 @@ retry:
return nullptr;
}
- (*hash_lock)->write_unlock();
+ page_hash.lock_get(chain).unlock();
/* Allocate a watch[] and then try to insert it into the page_hash. */
mysql_mutex_lock(&mutex);
@@ -2140,28 +2141,23 @@ retry:
ut_ad(!w->buf_fix_count());
/* w is pointing to watch[], which is protected by mutex.
Normally, buf_page_t::id for objects that are reachable by
- page_hash_get_low(id, fold) are protected by hash_lock. */
+ page_hash.get(id, chain) are protected by hash_lock. */
w->set_state(BUF_BLOCK_ZIP_PAGE);
w->id_= id;
- *hash_lock= page_hash.lock_get(fold);
-
- buf_page_t *bpage= page_hash_get_low(id, fold);
+ buf_page_t *bpage= page_hash.get(id, chain);
if (UNIV_LIKELY_NULL(bpage))
{
w->set_state(BUF_BLOCK_NOT_USED);
- *hash_lock= page_hash.lock_get(fold);
- (*hash_lock)->write_lock();
+ page_hash.lock_get(chain).lock();
mysql_mutex_unlock(&mutex);
goto retry;
}
- (*hash_lock)->write_lock();
+ page_hash.lock_get(chain).lock();
ut_ad(!w->buf_fix_count_);
w->buf_fix_count_= 1;
- ut_ad(!w->in_page_hash);
- ut_d(w->in_page_hash= true);
- HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+ buf_pool.page_hash.append(chain, w);
mysql_mutex_unlock(&mutex);
return nullptr;
}
@@ -2173,50 +2169,57 @@ retry:
/** Stop watching whether a page has been read in.
watch_set(id) must have returned nullptr before.
-@param id page identifier */
-void buf_pool_t::watch_unset(const page_id_t id)
+@param id page identifier
+@param chain unlocked hash table chain */
+TRANSACTIONAL_TARGET
+void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain)
{
mysql_mutex_assert_not_owner(&mutex);
- const ulint fold= id.fold();
- page_hash_latch *hash_lock= page_hash.lock<true>(fold);
- /* The page must exist because watch_set() increments buf_fix_count. */
- buf_page_t *w= page_hash_get_low(id, fold);
- const auto buf_fix_count= w->buf_fix_count();
- ut_ad(buf_fix_count);
- const bool must_remove= buf_fix_count == 1 && watch_is_sentinel(*w);
- ut_ad(w->in_page_hash);
- if (!must_remove)
- w->unfix();
- hash_lock->write_unlock();
-
- if (must_remove)
+ buf_page_t *w;
{
- const auto old= w;
- /* The following is based on buf_pool_t::watch_remove(). */
- mysql_mutex_lock(&mutex);
- w= page_hash_get_low(id, fold);
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
+ transactional_lock_guard<page_hash_latch> g{page_hash.lock_get(chain)};
+ /* The page must exist because watch_set() increments buf_fix_count. */
+ w= page_hash.get(id, chain);
+ const auto buf_fix_count= w->buf_fix_count();
+ ut_ad(buf_fix_count);
+ ut_ad(w->in_page_hash);
+ if (buf_fix_count != 1 || !watch_is_sentinel(*w))
+ {
+ w->unfix();
+ w= nullptr;
+ }
+ }
+
+ if (!w)
+ return;
+
+ const auto old= w;
+ /* The following is based on buf_pool_t::watch_remove(). */
+ mysql_mutex_lock(&mutex);
+ w= page_hash.get(id, chain);
+
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
if (w->unfix() == 0 && w == old)
{
- ut_ad(w->in_page_hash);
- ut_d(w->in_page_hash= false);
- HASH_DELETE(buf_page_t, hash, &page_hash, fold, w);
- // Now that the watch is detached from page_hash, release it to watch[].
+ page_hash.remove(chain, w);
+ // Now that w is detached from page_hash, release it to watch[].
ut_ad(w->id_ == id);
ut_ad(!w->buf_fix_count());
ut_ad(w->state() == BUF_BLOCK_ZIP_PAGE);
w->set_state(BUF_BLOCK_NOT_USED);
}
- hash_lock->write_unlock();
- mysql_mutex_unlock(&mutex);
}
+
+ mysql_mutex_unlock(&mutex);
}
/** Mark the page status as FREED for the given tablespace and page number.
@param[in,out] space tablespace
@param[in] page page number
@param[in,out] mtr mini-transaction */
+TRANSACTIONAL_TARGET
void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
{
ut_ad(mtr);
@@ -2231,28 +2234,23 @@ void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
++buf_pool.stat.n_page_gets;
const page_id_t page_id(space->id, page);
- const ulint fold= page_id.fold();
- page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
- if (buf_block_t *block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash_get_low(page_id, fold)))
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ buf_block_t *block;
{
- if (block->page.state() != BUF_BLOCK_FILE_PAGE)
- /* FIXME: convert, but avoid buf_zip_decompress() */;
- else
- {
- buf_block_buf_fix_inc(block);
- ut_ad(block->page.buf_fix_count());
- hash_lock->read_unlock();
-
- mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
- block->lock.x_lock();
-
- block->page.status= buf_page_t::FREED;
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash.get(page_id, chain));
+ if (!block || block->page.state() != BUF_BLOCK_FILE_PAGE)
+ /* FIXME: convert ROW_FORMAT=COMPRESSED, without buf_zip_decompress() */
return;
- }
+ block->fix();
}
+ ut_ad(block->page.buf_fix_count());
- hash_lock->read_unlock();
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ block->lock.x_lock();
+ block->page.status= buf_page_t::FREED;
}
/** Get read access to a compressed page (usually of type
@@ -2265,80 +2263,48 @@ the same set of mutexes or latches.
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size
@return pointer to the block */
+TRANSACTIONAL_TARGET
buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
{
ut_ad(zip_size);
ut_ad(ut_is_2pow(zip_size));
++buf_pool.stat.n_page_gets;
- bool discard_attempted= false;
- const ulint fold= page_id.fold();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
buf_page_t *bpage;
- page_hash_latch *hash_lock;
- for (;;)
- {
lookup:
- bpage= buf_pool.page_hash_get_locked<false>(page_id, fold, &hash_lock);
- if (bpage)
- break;
-
- dberr_t err= buf_read_page(page_id, zip_size);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ for (bool discard_attempted= false;;)
+ {
{
- ib::error() << "Reading compressed page " << page_id
- << " failed with error: " << err;
- goto err_exit;
- }
+ transactional_shared_lock_guard<page_hash_latch> g{hash_lock};
+ bpage= buf_pool.page_hash.get(page_id, chain);
+ if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ goto must_read_page;
-#ifdef UNIV_DEBUG
- if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
-#endif /* UNIV_DEBUG */
- }
+ ut_ad(bpage->in_file());
+ ut_ad(page_id == bpage->id());
- ut_ad(hash_lock->is_read_locked());
+ if (!bpage->zip.data)
+ /* There is no ROW_FORMAT=COMPRESSED page. */
+ return nullptr;
- if (!bpage->zip.data)
- {
- /* There is no compressed page. */
-err_exit:
- hash_lock->read_unlock();
- return nullptr;
- }
-
- ut_ad(!buf_pool.watch_is_sentinel(*bpage));
-
- switch (bpage->state()) {
- case BUF_BLOCK_FILE_PAGE:
- /* Discard the uncompressed page frame if possible. */
- if (!discard_attempted)
- {
- discard_attempted= true;
- hash_lock->read_unlock();
- mysql_mutex_lock(&buf_pool.mutex);
- if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
- buf_LRU_free_page(bpage, false);
- mysql_mutex_unlock(&buf_pool.mutex);
- goto lookup;
+ if (discard_attempted || bpage->state() == BUF_BLOCK_ZIP_PAGE)
+ {
+ bpage->fix();
+ break;
+ }
}
- /* fall through */
- case BUF_BLOCK_ZIP_PAGE:
- bpage->fix();
- goto got_block;
- default:
- break;
- }
- ut_error;
- goto err_exit;
-
-got_block:
- bool must_read= bpage->io_fix() == BUF_IO_READ;
- hash_lock->read_unlock();
+ discard_attempted= true;
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain))
+ buf_LRU_free_page(bpage, false);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
DBUG_ASSERT(bpage->status != buf_page_t::FREED);
-
bpage->set_accessed();
buf_page_make_young_if_needed(bpage);
@@ -2348,12 +2314,19 @@ got_block:
ut_ad(bpage->buf_fix_count());
ut_ad(bpage->in_file());
- if (must_read)
- /* Let us wait until the read operation completes */
- while (bpage->io_fix() == BUF_IO_READ)
- std::this_thread::sleep_for(WAIT_FOR_READ);
-
+ /* Let us wait until the read operation completes */
+ while (bpage->io_fix() == BUF_IO_READ)
+ std::this_thread::sleep_for(WAIT_FOR_READ);
return bpage;
+
+must_read_page:
+ if (dberr_t err= buf_read_page(page_id, zip_size))
+ {
+ ib::error() << "Reading compressed page " << page_id
+ << " failed with error: " << err;
+ return nullptr;
+ }
+ goto lookup;
}
/********************************************************************//**
@@ -2505,6 +2478,7 @@ while reading the page from file
then it makes sure that it does merging of change buffer changes while
reading the page from file.
@return pointer to the block or NULL */
+TRANSACTIONAL_TARGET
buf_block_t*
buf_page_get_low(
const page_id_t page_id,
@@ -2516,10 +2490,8 @@ buf_page_get_low(
dberr_t* err,
bool allow_ibuf_merge)
{
- buf_block_t* block;
unsigned access_time;
ulint retries = 0;
- const ulint fold = page_id.fold();
ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
ut_ad(!mtr || mtr->is_active());
@@ -2570,156 +2542,141 @@ buf_page_get_low(
|| ibuf_page_low(page_id, zip_size, FALSE, NULL));
++buf_pool.stat.n_page_gets;
-loop:
- buf_block_t* fix_block;
- block = guess;
- page_hash_latch* hash_lock = buf_pool.page_hash.lock<false>(fold);
+ auto& chain= buf_pool.page_hash.cell_get(page_id.fold());
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+loop:
+ buf_block_t* block = guess;
if (block) {
-
- /* If the guess is a compressed page descriptor that
- has been allocated by buf_page_alloc_descriptor(),
- it may have been freed by buf_relocate(). */
-
- if (!buf_pool.is_uncompressed(block)
- || page_id != block->page.id()
- || block->page.state() != BUF_BLOCK_FILE_PAGE) {
- /* Our guess was bogus or things have changed
- since. */
- guess = nullptr;
- goto lookup;
- } else {
+ transactional_shared_lock_guard<page_hash_latch> g{hash_lock};
+ if (buf_pool.is_uncompressed(block)
+ && page_id == block->page.id()
+ && block->page.state() == BUF_BLOCK_FILE_PAGE) {
ut_ad(!block->page.in_zip_hash);
+ block->fix();
+ goto got_block;
}
- } else {
-lookup:
- block = reinterpret_cast<buf_block_t*>(
- buf_pool.page_hash_get_low(page_id, fold));
}
- if (!block || buf_pool.watch_is_sentinel(block->page)) {
- hash_lock->read_unlock();
- block = nullptr;
+ guess = nullptr;
+
+ /* A memory transaction would frequently be aborted here. */
+ hash_lock.lock_shared();
+ block = reinterpret_cast<buf_block_t*>(
+ buf_pool.page_hash.get(page_id, chain));
+ if (UNIV_LIKELY(block
+ && !buf_pool.watch_is_sentinel(block->page))) {
+ block->fix();
+ hash_lock.unlock_shared();
+ goto got_block;
}
+ hash_lock.unlock_shared();
- if (UNIV_UNLIKELY(!block)) {
- /* Page not in buf_pool: needs to be read from file */
- if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
- hash_lock = buf_pool.page_hash.lock<true>(fold);
-
- if (buf_page_t *bpage= buf_pool.watch_set(
- page_id, &hash_lock)) {
- /* We can release hash_lock after we
- increment the fix count to make
- sure that no state change takes place. */
- bpage->fix();
- hash_lock->write_unlock();
- block = reinterpret_cast<buf_block_t*>(bpage);
- fix_block = block;
- goto got_block;
- }
-
- hash_lock->write_unlock();
+ /* Page not in buf_pool: needs to be read from file */
+ switch (mode) {
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_EVICT_IF_IN_POOL:
+ return nullptr;
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ /* We cannot easily use a memory transaction here. */
+ hash_lock.lock();
+ block = reinterpret_cast<buf_block_t*>
+ (buf_pool.watch_set(page_id, chain));
+ if (block) {
+ /* buffer-fixing prevents block->page.state()
+ changes */
+ block->fix();
}
+ hash_lock.unlock();
- switch (mode) {
- case BUF_GET_IF_IN_POOL:
- case BUF_GET_IF_IN_POOL_OR_WATCH:
- case BUF_PEEK_IF_IN_POOL:
- case BUF_EVICT_IF_IN_POOL:
- return(NULL);
+ if (block) {
+ goto got_block;
}
- /* The call path is buf_read_page() ->
- buf_read_page_low() (fil_space_t::io()) ->
- buf_page_read_complete() ->
- buf_decrypt_after_read(). Here fil_space_t* is used
- and we decrypt -> buf_page_check_corrupt() where page
- checksums are compared. Decryption, decompression as
- well as error handling takes place at a lower level.
- Here we only need to know whether the page really is
- corrupted, or if an encrypted page with a valid
- checksum cannot be decypted. */
-
- dberr_t local_err = buf_read_page(page_id, zip_size);
-
- if (local_err == DB_SUCCESS) {
- buf_read_ahead_random(page_id, zip_size,
- ibuf_inside(mtr));
-
- retries = 0;
- } else if (mode == BUF_GET_POSSIBLY_FREED) {
+ return nullptr;
+ }
+
+ /* The call path is buf_read_page() ->
+ buf_read_page_low() (fil_space_t::io()) ->
+ buf_page_read_complete() ->
+ buf_decrypt_after_read(). Here fil_space_t* is used
+ and we decrypt -> buf_page_check_corrupt() where page
+ checksums are compared. Decryption, decompression as
+ well as error handling takes place at a lower level.
+ Here we only need to know whether the page really is
+ corrupted, or if an encrypted page with a valid
+	checksum cannot be decrypted. */
+
+ if (dberr_t local_err = buf_read_page(page_id, zip_size)) {
+ if (mode == BUF_GET_POSSIBLY_FREED) {
if (err) {
*err = local_err;
}
- return NULL;
+ return nullptr;
} else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
++retries;
-
- DBUG_EXECUTE_IF(
- "innodb_page_corruption_retries",
- retries = BUF_PAGE_READ_MAX_RETRIES;
- );
+ DBUG_EXECUTE_IF("innodb_page_corruption_retries",
+ retries = BUF_PAGE_READ_MAX_RETRIES;);
} else {
if (err) {
*err = local_err;
}
-
- /* Pages whose encryption key is unavailable or used
- key, encryption algorithm or encryption method is
- incorrect are marked as encrypted in
+ /* Pages whose encryption key is unavailable or the
+ configured key, encryption algorithm or encryption
+ method are incorrect are marked as encrypted in
buf_page_check_corrupt(). Unencrypted page could be
corrupted in a way where the key_id field is
nonzero. There is no checksum on field
FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. */
- if (local_err == DB_DECRYPTION_FAILED) {
- return (NULL);
- }
-
- if (local_err == DB_PAGE_CORRUPTED
- && srv_force_recovery) {
- return NULL;
+ switch (local_err) {
+ case DB_PAGE_CORRUPTED:
+ if (!srv_force_recovery) {
+ break;
+ }
+ /* fall through */
+ case DB_DECRYPTION_FAILED:
+ return nullptr;
+ default:
+ break;
}
/* Try to set table as corrupted instead of
asserting. */
if (page_id.space() == TRX_SYS_SPACE) {
} else if (page_id.space() == SRV_TMP_SPACE_ID) {
- } else if (fil_space_t* space= fil_space_t::get(
- page_id.space())) {
+ } else if (fil_space_t* space
+ = fil_space_t::get(page_id.space())) {
bool set = dict_set_corrupted_by_space(space);
space->release();
if (set) {
- return NULL;
+ return nullptr;
}
}
if (local_err == DB_IO_ERROR) {
- return NULL;
+ return nullptr;
}
ib::fatal() << "Unable to read page " << page_id
- << " into the buffer pool after "
- << BUF_PAGE_READ_MAX_RETRIES
- << ". The most probable cause"
+ << " into the buffer pool after "
+ << BUF_PAGE_READ_MAX_RETRIES
+ << ". The most probable cause"
" of this error may be that the"
" table has been corrupted."
" See https://mariadb.com/kb/en/library/innodb-recovery-modes/";
}
-
-#ifdef UNIV_DEBUG
- if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
-#endif /* UNIV_DEBUG */
- goto loop;
} else {
- fix_block = block;
+ buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr));
+ retries = 0;
}
- fix_block->fix();
- hash_lock->read_unlock();
+ ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
+ goto loop;
got_block:
+ ut_ad(!block->page.in_zip_hash);
switch (mode) {
default:
ut_ad(block->zip_size() == zip_size);
@@ -2727,23 +2684,23 @@ got_block:
case BUF_GET_IF_IN_POOL:
case BUF_PEEK_IF_IN_POOL:
case BUF_EVICT_IF_IN_POOL:
- if (fix_block->page.io_fix() == BUF_IO_READ) {
+ if (block->page.io_fix() == BUF_IO_READ) {
/* The page is being read to buffer pool,
but we cannot wait around for the read to
complete. */
- fix_block->unfix();
+ block->unfix();
return(NULL);
}
}
- switch (UNIV_EXPECT(fix_block->page.state(), BUF_BLOCK_FILE_PAGE)) {
+ switch (UNIV_EXPECT(block->page.state(), BUF_BLOCK_FILE_PAGE)) {
case BUF_BLOCK_FILE_PAGE:
if (fsp_is_system_temporary(page_id.space())
&& block->page.io_fix() != BUF_IO_NONE) {
/* This suggests that the page is being flushed.
Avoid returning reference to this page.
Instead wait for the flush action to complete. */
- fix_block->unfix();
+ block->unfix();
std::this_thread::sleep_for(
std::chrono::microseconds(WAIT_FOR_WRITE));
goto loop;
@@ -2751,11 +2708,11 @@ got_block:
if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
evict_from_pool:
- ut_ad(!fix_block->page.oldest_modification());
+ ut_ad(!block->page.oldest_modification());
mysql_mutex_lock(&buf_pool.mutex);
- fix_block->unfix();
+ block->unfix();
- if (!buf_LRU_free_page(&fix_block->page, true)) {
+ if (!buf_LRU_free_page(&block->page, true)) {
ut_ad(0);
}
@@ -2778,7 +2735,7 @@ evict_from_pool:
adaptive hash index. There cannot be an
adaptive hash index for a compressed-only
page, so do not bother decompressing the page. */
- fix_block->unfix();
+ block->unfix();
return(NULL);
}
@@ -2792,7 +2749,7 @@ evict_from_pool:
/* This condition often occurs when the buffer
is not buffer-fixed, but I/O-fixed by
buf_page_init_for_read(). */
- fix_block->unfix();
+ block->unfix();
/* The block is buffer-fixed or I/O-fixed.
Try again later. */
@@ -2805,18 +2762,21 @@ evict_from_pool:
or relocated while we are attempting to allocate an
uncompressed page. */
- block = buf_LRU_get_free_block(false);
- buf_block_init_low(block);
+ buf_block_t *new_block = buf_LRU_get_free_block(false);
+ buf_block_init_low(new_block);
mysql_mutex_lock(&buf_pool.mutex);
- hash_lock = buf_pool.page_hash.lock_get(fold);
+ page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain);
- hash_lock->write_lock();
+ /* It does not make sense to use
+ transactional_lock_guard here, because buf_relocate()
+ would likely make a memory transaction too large. */
+ hash_lock.lock();
/* Buffer-fixing prevents the page_hash from changing. */
- ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold));
+ ut_ad(bpage == buf_pool.page_hash.get(page_id, chain));
- fix_block->unfix(); /* hash_lock protects us after this */
+ block->unfix(); /* hash_lock protects us after this */
if (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE) {
/* The block was buffer-fixed or I/O-fixed while
@@ -2825,15 +2785,15 @@ evict_from_pool:
This should be extremely unlikely, for example,
if buf_page_get_zip() was invoked. */
- hash_lock->write_unlock();
- buf_LRU_block_free_non_file_page(block);
+ hash_lock.unlock();
+ buf_LRU_block_free_non_file_page(new_block);
mysql_mutex_unlock(&buf_pool.mutex);
/* Try again */
goto loop;
}
- fix_block = block;
+ block = new_block;
/* Move the compressed page from bpage to block,
and uncompress it. */
@@ -2864,7 +2824,7 @@ evict_from_pool:
MEM_UNDEFINED(bpage, sizeof *bpage);
mysql_mutex_unlock(&buf_pool.mutex);
- hash_lock->write_unlock();
+ hash_lock.unlock();
buf_pool.n_pend_unzip++;
access_time = block->page.is_accessed();
@@ -2880,9 +2840,9 @@ evict_from_pool:
buf_pool.mutex. */
if (!buf_zip_decompress(block, false)) {
- fix_block->lock.x_unlock();
- fix_block->page.io_unfix();
- fix_block->unfix();
+ block->lock.x_unlock();
+ block->page.io_unfix();
+ block->unfix();
--buf_pool.n_pend_unzip;
if (err) {
@@ -2891,16 +2851,14 @@ evict_from_pool:
return NULL;
}
+ block->page.io_unfix();
block->lock.x_unlock();
- fix_block->page.io_unfix();
--buf_pool.n_pend_unzip;
- break;
}
- ut_ad(block == fix_block);
- ut_ad(fix_block->page.buf_fix_count());
+ ut_ad(block->page.buf_fix_count());
- ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
re_evict:
@@ -2913,49 +2871,35 @@ re_evict:
mysql_mutex_lock(&buf_pool.mutex);
- fix_block->unfix();
+ block->unfix();
/* Blocks cannot be relocated or enter or exit the
buf_pool while we are holding the buf_pool.mutex. */
- const bool evicted = buf_LRU_free_page(&fix_block->page, true);
+ const bool evicted = buf_LRU_free_page(&block->page, true);
space->release();
if (evicted) {
- hash_lock = buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
+ page_hash_latch& hash_lock
+ = buf_pool.page_hash.lock_get(chain);
+ hash_lock.lock();
mysql_mutex_unlock(&buf_pool.mutex);
/* We may set the watch, as it would have
been set if the page were not in the
buffer pool in the first place. */
block= reinterpret_cast<buf_block_t*>(
mode == BUF_GET_IF_IN_POOL_OR_WATCH
- ? buf_pool.watch_set(page_id, &hash_lock)
- : buf_pool.page_hash_get_low(page_id, fold));
- hash_lock->write_unlock();
-
- if (block != NULL) {
- /* Either the page has been read in or
- a watch was set on that in the window
- where we released the buf_pool.mutex
- and before we acquire the hash_lock
- above. Try again. */
- guess = block;
-
- goto loop;
- }
-
+ ? buf_pool.watch_set(page_id, chain)
+ : buf_pool.page_hash.get(page_id, chain));
+ hash_lock.unlock();
return(NULL);
}
- fix_block->fix();
+ block->fix();
mysql_mutex_unlock(&buf_pool.mutex);
- buf_flush_list();
- buf_flush_wait_batch_end_acquiring_mutex(false);
- while (buf_flush_list_space(space));
- os_aio_wait_until_no_pending_writes();
+ buf_flush_sync();
- if (fix_block->page.buf_fix_count() == 1
- && !fix_block->page.oldest_modification()) {
+ if (block->page.buf_fix_count() == 1
+ && !block->page.oldest_modification()) {
goto re_evict;
}
@@ -2963,7 +2907,7 @@ re_evict:
}
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
- ut_ad(fix_block->page.buf_fix_count());
+ ut_ad(block->page.buf_fix_count());
/* While tablespace is reinited the indexes are already freed but the
blocks related to it still resides in buffer pool. Trying to remove
@@ -2974,25 +2918,25 @@ re_evict:
"btr_search_drop_page_hash_when_freed". */
ut_ad(mode == BUF_GET_POSSIBLY_FREED
|| mode == BUF_PEEK_IF_IN_POOL
- || fix_block->page.status != buf_page_t::FREED);
+ || block->page.status != buf_page_t::FREED);
- const bool not_first_access = fix_block->page.set_accessed();
+ const bool not_first_access = block->page.set_accessed();
if (mode != BUF_PEEK_IF_IN_POOL) {
- buf_page_make_young_if_needed(&fix_block->page);
+ buf_page_make_young_if_needed(&block->page);
}
#ifdef UNIV_DEBUG
if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
#endif /* UNIV_DEBUG */
- ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
/* We have to wait here because the IO_READ state was set
under the protection of the hash_lock and not block->lock. */
- buf_wait_for_read(fix_block);
+ buf_wait_for_read(block);
- if (fix_block->page.id() != page_id) {
- buf_block_buf_fix_dec(fix_block);
+ if (block->page.id() != page_id) {
+ buf_block_buf_fix_dec(block);
if (err) {
*err = DB_PAGE_CORRUPTED;
@@ -3001,27 +2945,27 @@ re_evict:
return NULL;
}
- if (fix_block->page.status != buf_page_t::FREED
+ if (block->page.status != buf_page_t::FREED
&& allow_ibuf_merge
- && fil_page_get_type(fix_block->frame) == FIL_PAGE_INDEX
- && page_is_leaf(fix_block->frame)) {
- fix_block->lock.x_lock();
+ && fil_page_get_type(block->frame) == FIL_PAGE_INDEX
+ && page_is_leaf(block->frame)) {
+ block->lock.x_lock();
- if (fix_block->page.ibuf_exist) {
- fix_block->page.ibuf_exist = false;
- ibuf_merge_or_delete_for_page(fix_block, page_id,
+ if (block->page.ibuf_exist) {
+ block->page.ibuf_exist = false;
+ ibuf_merge_or_delete_for_page(block, page_id,
zip_size);
}
if (rw_latch == RW_X_LATCH) {
- mtr->memo_push(fix_block, MTR_MEMO_PAGE_X_FIX);
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
} else {
- fix_block->lock.x_unlock();
+ block->lock.x_unlock();
goto get_latch;
}
} else {
get_latch:
- mtr->page_lock(fix_block, rw_latch);
+ mtr->page_lock(block, rw_latch);
}
if (!not_first_access && mode != BUF_PEEK_IF_IN_POOL) {
@@ -3031,7 +2975,7 @@ get_latch:
buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr));
}
- return(fix_block);
+ return block;
}
/** Get access to a database page. Buffered redo log may be applied.
@@ -3092,6 +3036,7 @@ buf_page_get_gen(
This is the general function used to get optimistic access to a database
page.
@return TRUE if success */
+TRANSACTIONAL_TARGET
ibool
buf_page_optimistic_get(
/*====================*/
@@ -3107,26 +3052,26 @@ buf_page_optimistic_get(
ut_ad(mtr->is_active());
ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
- if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
- || block->page.io_fix() != BUF_IO_NONE)) {
+ if (have_transactional_memory) {
+ } else if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
+ || block->page.io_fix() != BUF_IO_NONE)) {
return FALSE;
}
- const page_id_t id(block->page.id());
+ const page_id_t id{block->page.id()};
+ buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get(id.fold());
- page_hash_latch *hash_lock = buf_pool.hash_lock_get(id);
- hash_lock->read_lock();
-
- if (UNIV_UNLIKELY(id != block->page.id()
- || block->page.state() != BUF_BLOCK_FILE_PAGE
- || block->page.io_fix() != BUF_IO_NONE)) {
- hash_lock->read_unlock();
- return(FALSE);
+ {
+ transactional_shared_lock_guard<page_hash_latch> g{
+ buf_pool.page_hash.lock_get(chain)};
+ if (UNIV_UNLIKELY(id != block->page.id()
+ || block->page.state() != BUF_BLOCK_FILE_PAGE
+ || block->page.io_fix() != BUF_IO_NONE)) {
+ return FALSE;
+ }
+ block->fix();
}
- buf_block_buf_fix_inc(block);
- hash_lock->read_unlock();
-
block->page.set_accessed();
buf_page_make_young_if_needed(&block->page);
@@ -3187,30 +3132,27 @@ Suitable for using when holding the lock_sys latches (as it avoids deadlock).
@param[in,out] mtr mini-transaction
@return the block
@retval nullptr if an S-latch cannot be granted immediately */
+TRANSACTIONAL_TARGET
buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr)
{
ut_ad(mtr);
ut_ad(mtr->is_active());
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ buf_block_t *block;
- page_hash_latch *hash_lock;
- buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id,
- page_id.fold(),
- &hash_lock);
- if (!bpage)
- return nullptr;
- if (bpage->state() != BUF_BLOCK_FILE_PAGE)
{
- hash_lock->read_unlock();
- return nullptr;
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash.get(page_id, chain));
+ if (!block || block->page.state() != BUF_BLOCK_FILE_PAGE)
+ return nullptr;
+ block->fix();
}
- buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
- buf_block_buf_fix_inc(block);
- hash_lock->read_unlock();
-
if (!block->lock.s_lock_try())
{
- buf_block_buf_fix_dec(block);
+ block->unfix();
return nullptr;
}
@@ -3219,9 +3161,9 @@ buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr)
#ifdef UNIV_DEBUG
if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
#endif /* UNIV_DEBUG */
- ut_ad(bpage->buf_fix_count());
- ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
- ut_ad(bpage->id() == page_id);
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.id() == page_id);
++buf_pool.stat.n_page_gets;
return block;
@@ -3240,6 +3182,7 @@ void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
page_zip_set_size(&page.zip, zip_size);
}
+TRANSACTIONAL_TARGET
static buf_block_t* buf_page_create_low(page_id_t page_id, ulint zip_size,
mtr_t *mtr, buf_block_t *free_block)
{
@@ -3248,12 +3191,12 @@ static buf_block_t* buf_page_create_low(page_id_t page_id, ulint zip_size,
free_block->initialise(page_id, zip_size, 1);
- const ulint fold= page_id.fold();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+loop:
mysql_mutex_lock(&buf_pool.mutex);
-loop:
buf_block_t *block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash_get_low(page_id, fold));
+ (buf_pool.page_hash.get(page_id, chain));
if (block && block->page.in_file() &&
!buf_pool.watch_is_sentinel(block->page))
@@ -3269,15 +3212,12 @@ loop:
if (!mtr->have_x_latch(*block))
{
buf_block_buf_fix_inc(block);
- while (!block->lock.x_lock_try())
+ if (!block->lock.x_lock_try())
{
- /* Wait for buf_page_write_complete() to release block->lock.
- We must not hold buf_pool.mutex while waiting. */
- timespec abstime;
- set_timespec_nsec(abstime, 1000000);
- my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
- &abstime);
- }
+ mysql_mutex_unlock(&buf_pool.mutex);
+ block->lock.x_lock();
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
}
else
@@ -3292,16 +3232,17 @@ loop:
#endif
break;
case BUF_BLOCK_ZIP_PAGE:
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+ /* It does not make sense to use transactional_lock_guard here,
+ because buf_relocate() would likely make the memory transaction
+ too large. */
+ hash_lock.lock();
if (block->page.io_fix() != BUF_IO_NONE)
{
- hash_lock->write_unlock();
+ hash_lock.unlock();
/* Wait for buf_page_write_complete() to release the I/O fix. */
- timespec abstime;
- set_timespec_nsec(abstime, 1000000);
- my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
- &abstime);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ os_aio_wait_until_no_pending_writes();
goto loop;
}
@@ -3313,7 +3254,7 @@ loop:
free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
buf_unzip_LRU_add_block(free_block, FALSE);
- hash_lock->write_unlock();
+ hash_lock.unlock();
buf_page_free_descriptor(&block->page);
block= free_block;
buf_block_buf_fix_inc(block);
@@ -3349,25 +3290,20 @@ loop:
/* The block must be put to the LRU list */
buf_LRU_add_block(&block->page, false);
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
- block->page.set_state(BUF_BLOCK_FILE_PAGE);
- ut_d(block->page.in_page_hash= true);
- HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page);
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ block->page.set_state(BUF_BLOCK_FILE_PAGE);
+ buf_pool.page_hash.append(chain, &block->page);
+ block->lock.x_lock();
+ if (UNIV_UNLIKELY(zip_size))
+ /* Prevent race conditions during buf_buddy_alloc(), which may
+ release and reacquire buf_pool.mutex, by IO-fixing and X-latching. */
+ block->page.set_io_fix(BUF_IO_READ);
+ }
- block->lock.x_lock();
if (UNIV_UNLIKELY(zip_size))
{
- /* Prevent race conditions during buf_buddy_alloc(), which may
- release and reacquire buf_pool.mutex, by IO-fixing and X-latching
- the block. */
- block->page.set_io_fix(BUF_IO_READ);
- hash_lock->write_unlock();
-
- /* buf_pool.mutex may be released and reacquired by
- buf_buddy_alloc(). We must defer this operation until
- after the block descriptor has been added to
- buf_pool.LRU and buf_pool.page_hash. */
block->page.zip.data= buf_buddy_alloc(zip_size);
/* To maintain the invariant block->in_unzip_LRU_list ==
@@ -3378,8 +3314,6 @@ loop:
block->page.set_io_fix(BUF_IO_NONE);
}
- else
- hash_lock->write_unlock();
mysql_mutex_unlock(&buf_pool.mutex);
@@ -3562,32 +3496,6 @@ static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
}
}
-/** Release and evict a corrupted page.
-@param bpage page that was being read */
-ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage)
-{
- const page_id_t id(bpage->id());
- page_hash_latch *hash_lock= hash_lock_get(id);
-
- mysql_mutex_lock(&mutex);
- hash_lock->write_lock();
-
- ut_ad(bpage->io_fix() == BUF_IO_READ);
- ut_ad(!bpage->oldest_modification());
- bpage->set_corrupt_id();
-
- if (bpage->state() == BUF_BLOCK_FILE_PAGE)
- reinterpret_cast<buf_block_t*>(bpage)->lock.x_unlock(true);
- bpage->io_unfix();
-
- /* remove from LRU and page_hash */
- buf_LRU_free_one_page(bpage, id, hash_lock);
- mysql_mutex_unlock(&mutex);
-
- ut_d(auto n=) n_pend_reads--;
- ut_ad(n > 0);
-}
-
/** Mark a table corrupted.
@param[in] bpage Corrupted page
@param[in] node data file
@@ -3889,9 +3797,6 @@ void buf_pool_invalidate()
{
mysql_mutex_lock(&buf_pool.mutex);
- buf_flush_wait_batch_end(true);
- buf_flush_wait_batch_end(false);
-
/* It is possible that a write batch that has been posted
earlier is still not complete. For buffer pool invalidation to
proceed we must ensure there is NO write activity happening. */
@@ -3953,7 +3858,8 @@ void buf_pool_t::validate()
case BUF_BLOCK_FILE_PAGE:
const page_id_t id = block->page.id();
- ut_ad(page_hash_get_low(id, id.fold())
+ ut_ad(page_hash.get(id, page_hash.cell_get(
+ id.fold()))
== &block->page);
n_lru++;
break;
@@ -3986,7 +3892,7 @@ void buf_pool_t::validate()
break;
}
const page_id_t id = b->id();
- ut_ad(page_hash_get_low(id, id.fold()) == b);
+ ut_ad(page_hash.get(id, page_hash.cell_get(id.fold())) == b);
}
ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
@@ -4039,6 +3945,8 @@ void buf_pool_t::print()
counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
+ size_t pending_writes = os_aio_pending_writes();
+
mysql_mutex_lock(&mutex);
mysql_mutex_lock(&flush_list_mutex);
@@ -4051,7 +3959,7 @@ void buf_pool_t::print()
<< ", n pending decompressions=" << n_pend_unzip
<< ", n pending reads=" << n_pend_reads
<< ", n pending flush LRU=" << n_flush_LRU_
- << " list=" << n_flush_list_
+ << " list=" << pending_writes
<< ", pages made young=" << stat.n_pages_made_young
<< ", not young=" << stat.n_pages_not_made_young
<< ", pages read=" << stat.n_pages_read
@@ -4169,7 +4077,7 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_;
- pool_info->n_pending_flush_list = buf_pool.n_flush_list_;
+ pool_info->n_pending_flush_list = os_aio_pending_writes();
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
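
Throughout buf0buf.cc above, page_hash_get_low(id, fold) and hash_lock_get(id) are replaced by a two-step pattern: resolve the hash chain once with page_hash.cell_get(id.fold()), then pass that chain to page_hash.get() and page_hash.lock_get() as needed, ideally computing the fold outside any critical section (hence the FIXME comments about cell_get() under buf_pool.mutex). A stripped-down sketch of the lookup side of that interface follows; the types are simplified assumptions for illustration only.

#include <cstddef>
#include <cstdint>

struct page_id_stub { uint64_t raw; uint64_t fold() const { return raw; } };

struct hashed_page { page_id_stub id; hashed_page *hash_next; };

struct hash_chain { hashed_page *first; };  /* one cell = one collision chain */

struct page_hash_stub
{
  hash_chain *array;
  size_t n_cells;

  /* map a fold value to its chain; cheap, and needs no latch by itself */
  hash_chain &cell_get(uint64_t fold) const { return array[fold % n_cells]; }

  /* search a single chain; the caller holds (or elides) the latch covering
  this chain, typically obtained via lock_get(chain) in the real code */
  static hashed_page *get(const page_id_stub id, const hash_chain &chain)
  {
    for (hashed_page *p= chain.first; p; p= p->hash_next)
      if (p->id.raw == id.raw)
        return p;
    return nullptr;
  }
};

A call site then reads as: auto &chain= hash.cell_get(id.fold()); followed by hash.get(id, chain) under the appropriate guard, so the page identifier is hashed exactly once per lookup.
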
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
index 447fba38323..5e73687ad50 100644
--- a/storage/innobase/buf/buf0dblwr.cc
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -212,8 +212,7 @@ too_small:
trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
mtr.commit();
- /* Flush the modified pages to disk and make a checkpoint */
- log_make_checkpoint();
+ buf_flush_wait_flushed(mtr.commit_lsn());
/* Remove doublewrite pages from LRU */
buf_pool_invalidate();
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index db546e287b4..f4608c3071d 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -229,10 +229,10 @@ void buf_flush_remove_pages(ulint id)
{
const page_id_t first(id, 0), end(id + 1, 0);
ut_ad(id);
- mysql_mutex_lock(&buf_pool.mutex);
for (;;)
{
+ mysql_mutex_lock(&buf_pool.mutex);
bool deferred= false;
mysql_mutex_lock(&buf_pool.flush_list_mutex);
@@ -255,18 +255,14 @@ void buf_flush_remove_pages(ulint id)
bpage= prev;
}
+ mysql_mutex_unlock(&buf_pool.mutex);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (!deferred)
- break;
+ return;
- mysql_mutex_unlock(&buf_pool.mutex);
- std::this_thread::yield();
- mysql_mutex_lock(&buf_pool.mutex);
- buf_flush_wait_batch_end(false);
+ os_aio_wait_until_no_pending_writes();
}
-
- mysql_mutex_unlock(&buf_pool.mutex);
}
/*******************************************************************//**
@@ -325,6 +321,32 @@ buf_flush_relocate_on_flush_list(
ut_d(buf_flush_validate_low());
}
+/** Increment a counter in a race-condition prone way. */
+TPOOL_SUPPRESS_TSAN static inline void inc_n_pages_written()
+{ buf_pool.stat.n_pages_written++; }
+
+/** Note that a block is no longer dirty, while not removing
+it from buf_pool.flush_list */
+inline void buf_page_t::write_complete(bool temporary)
+{
+ ut_ad(io_fix() == BUF_IO_WRITE);
+ ut_ad(temporary == fsp_is_system_temporary(id().space()));
+ if (temporary)
+ {
+ ut_ad(oldest_modification() == 2);
+ oldest_modification_= 0;
+ }
+ else
+ {
+ /* We use release memory order to guarantee that callers of
+ oldest_modification_acquire() will observe the block as
+ being detached from buf_pool.flush_list, after reading the value 0. */
+ ut_ad(oldest_modification() > 2);
+ oldest_modification_.store(1, std::memory_order_release);
+ }
+ io_fix_= BUF_IO_NONE;
+}
+
/** Complete write of a file page from buf_pool.
@param request write request */
void buf_page_write_complete(const IORequest &request)
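
The write_complete() hunk above relies on C++ release/acquire ordering, rather than buf_pool.mutex, to publish the "no longer dirty" state to readers of oldest_modification_acquire(). Below is a self-contained toy illustration of that pairing; the names and the two-value encoding are stand-ins (the real buf_page_t distinguishes more states in oldest_modification_, as the 0/1/2/>2 cases above show).

#include <atomic>
#include <cassert>
#include <thread>

struct page_stub
{
  std::atomic<uint32_t> oldest_modification{3};  /* dirty, on flush_list */
  bool on_flush_list= true;                      /* plain, non-atomic data */

  /* writer side, e.g. the page cleaner after the write completes */
  void write_complete()
  {
    on_flush_list= false;          /* ordinary store ...                   */
    oldest_modification.store(1, std::memory_order_release);
    /* ... published by the release store: any thread that observes 1 with
    acquire semantics is guaranteed to also see on_flush_list == false */
  }

  /* reader side, e.g. a thread polling the acquire accessor */
  bool observed_detached()
  {
    if (oldest_modification.load(std::memory_order_acquire) <= 1)
    {
      assert(!on_flush_list);      /* no data race: happens-before holds */
      return true;
    }
    return false;
  }
};

int main()
{
  page_stub p;
  std::thread writer([&] { p.write_complete(); });
  while (!p.observed_detached()) {}
  writer.join();
}
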
@@ -363,28 +385,28 @@ void buf_page_write_complete(const IORequest &request)
buf_page_monitor(bpage, BUF_IO_WRITE);
DBUG_PRINT("ib_buf", ("write page %u:%u",
bpage->id().space(), bpage->id().page_no()));
- const bool temp= fsp_is_system_temporary(bpage->id().space());
- mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
- buf_pool.stat.n_pages_written++;
- /* While we do not need any mutex for clearing oldest_modification
- here, we hope that it will be in the same cache line with io_fix,
- whose changes must be protected by buf_pool.mutex. */
- ut_ad(temp || bpage->oldest_modification() > 2);
- bpage->clear_oldest_modification(temp);
- ut_ad(bpage->io_fix() == BUF_IO_WRITE);
- bpage->set_io_fix(BUF_IO_NONE);
-
- if (bpage->state() == BUF_BLOCK_FILE_PAGE)
- reinterpret_cast<buf_block_t*>(bpage)->lock.u_unlock(true);
if (request.is_LRU())
{
+ mysql_mutex_lock(&buf_pool.mutex);
+ buf_pool.stat.n_pages_written++;
+ /* Releasing the io_fix and page latch must be protected by
+ buf_pool.mutex, because we do not want any thread to access the
+ block before we have freed it. */
+ bpage->write_complete(fsp_is_system_temporary(bpage->id().space()));
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+ reinterpret_cast<buf_block_t*>(bpage)->lock.u_unlock(true);
+
buf_LRU_free_page(bpage, true);
ut_ad(buf_pool.n_flush_LRU_);
- if (!--buf_pool.n_flush_LRU_)
+ const auto n_LRU_left= --buf_pool.n_flush_LRU_;
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ if (!n_LRU_left)
{
pthread_cond_broadcast(&buf_pool.done_flush_LRU);
pthread_cond_signal(&buf_pool.done_free);
@@ -392,13 +414,23 @@ void buf_page_write_complete(const IORequest &request)
}
else
{
- ut_ad(!temp);
- ut_ad(buf_pool.n_flush_list_);
- if (!--buf_pool.n_flush_list_)
- pthread_cond_broadcast(&buf_pool.done_flush_list);
+ ut_ad(bpage->oldest_modification() > 2);
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+ if (UNIV_LIKELY(bpage->state() == BUF_BLOCK_FILE_PAGE))
+ {
+ /* In normal checkpoint flushing, we may elide buf_pool.mutex. */
+ bpage->write_complete(false);
+ reinterpret_cast<buf_block_t*>(bpage)->lock.u_unlock(true);
+ inc_n_pages_written();
+ }
+ else
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ bpage->write_complete(false);
+ buf_pool.stat.n_pages_written++;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
}
-
- mysql_mutex_unlock(&buf_pool.mutex);
}
/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
@@ -788,7 +820,6 @@ buf_pool.mutex must be held.
static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
{
ut_ad(bpage->in_file());
- ut_ad(bpage->ready_for_flush());
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
(space == fil_system.temp_space));
ut_ad(space->referenced());
@@ -797,12 +828,20 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
block_lock *rw_lock;
if (bpage->state() != BUF_BLOCK_FILE_PAGE)
+ {
+ ut_ad(bpage->ready_for_flush());
rw_lock= nullptr;
+ }
else
{
rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
if (!rw_lock->u_lock_try(true))
return false;
+ if (!bpage->ready_for_flush() || bpage->oldest_modification() < 2)
+ {
+ rw_lock->u_unlock(true);
+ return false;
+ }
}
bpage->set_io_fix(BUF_IO_WRITE);
@@ -817,8 +856,6 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
{
if (lru)
buf_pool.n_flush_LRU_++;
- else
- buf_pool.n_flush_list_++;
buf_flush_page_count++;
}
@@ -842,8 +879,7 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
: oldest_modification > 2);
ut_ad(bpage->state() ==
(rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE));
- ut_ad(ULINT_UNDEFINED >
- (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_));
+ ut_ad(!lru || ULINT_UNDEFINED > buf_pool.n_flush_LRU_);
mysql_mutex_unlock(&buf_pool.mutex);
buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
@@ -947,7 +983,9 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(fold == id.fold());
- buf_page_t *bpage= buf_pool.page_hash_get_low(id, fold);
+ /* FIXME: cell_get() is being invoked while holding buf_pool.mutex */
+ const buf_page_t *bpage=
+ buf_pool.page_hash.get(id, buf_pool.page_hash.cell_get(fold));
if (!bpage || buf_pool.watch_is_sentinel(*bpage))
return false;
@@ -1107,9 +1145,10 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
id_fold= id.fold();
}
+ const buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id_fold);
mysql_mutex_lock(&buf_pool.mutex);
- if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold))
+ if (buf_page_t *bpage= buf_pool.page_hash.get(id, chain))
{
ut_ad(bpage->in_file());
/* We avoid flushing 'non-old' blocks in an LRU flush,
@@ -1367,6 +1406,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
ulint scanned= 0;
mysql_mutex_assert_owner(&buf_pool.mutex);
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
? 0 : srv_flush_neighbors;
@@ -1377,7 +1417,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
/* Start from the end of the list looking for a suitable block to be
flushed. */
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
@@ -1457,7 +1496,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
}
buf_pool.flush_hp.set(nullptr);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (space)
space->release();
@@ -1467,32 +1505,21 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
scanned);
- if (count)
- MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
- MONITOR_FLUSH_BATCH_COUNT,
- MONITOR_FLUSH_BATCH_PAGES,
- count);
- mysql_mutex_assert_owner(&buf_pool.mutex);
return count;
}
-/** Wait until a flush batch ends.
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
-void buf_flush_wait_batch_end(bool lru)
+/** Wait until an LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end()
{
- const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_;
-
- if (n_flush)
+ if (buf_pool.n_flush_LRU())
{
- auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list;
tpool::tpool_wait_begin();
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
do
- my_cond_wait(cond, &buf_pool.mutex.m_mutex);
- while (n_flush);
+ my_cond_wait(&buf_pool.done_flush_LRU, &buf_pool.mutex.m_mutex);
+ while (buf_pool.n_flush_LRU());
tpool::tpool_wait_end();
thd_wait_end(nullptr);
- pthread_cond_broadcast(cond);
}
}
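Aside: the loop above is the standard condition-variable idiom of re-checking the predicate under the mutex after every wakeup. Here is a minimal standalone sketch of the same pattern using std::condition_variable; the counter and names are simplified stand-ins for buf_pool.mutex, done_flush_LRU and n_flush_LRU_, not the InnoDB code itself.

  #include <condition_variable>
  #include <mutex>

  static std::mutex m;                  // stands in for buf_pool.mutex
  static std::condition_variable done;  // stands in for done_flush_LRU
  static unsigned n_flush_LRU = 0;      // pages being written by an LRU batch

  // Waiter side, as in buf_flush_wait_LRU_batch_end(): the predicate is
  // re-checked after every wakeup, so spurious wakeups are harmless.
  static void wait_LRU_batch_end()
  {
    std::unique_lock<std::mutex> lk{m};
    done.wait(lk, [] { return n_flush_LRU == 0; });
  }

  // Completion side, as in buf_page_write_complete(): decrement the counter
  // and broadcast once the whole batch has finished.
  static void one_page_written()
  {
    std::lock_guard<std::mutex> g{m};
    if (n_flush_LRU != 0 && --n_flush_LRU == 0)
      done.notify_all();
  }

  int main()
  {
    one_page_written();     // nothing in flight: counter stays at 0
    wait_LRU_batch_end();   // returns immediately since n_flush_LRU == 0
    return 0;
  }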
@@ -1501,38 +1528,47 @@ void buf_flush_wait_batch_end(bool lru)
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
-ulint buf_flush_list(ulint max_n, lsn_t lsn)
+static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX)
{
ut_ad(lsn);
- if (buf_pool.n_flush_list())
- return 0;
-
mysql_mutex_lock(&buf_pool.mutex);
- const bool running= buf_pool.n_flush_list_ != 0;
- /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
- while not holding buf_pool.flush_list_mutex */
- if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (buf_pool.flush_list_active)
{
- if (!running)
- pthread_cond_broadcast(&buf_pool.done_flush_list);
+nothing_to_do:
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
mysql_mutex_unlock(&buf_pool.mutex);
return 0;
}
-
- buf_pool.n_flush_list_++;
+ if (!buf_pool.get_oldest_modification(0))
+ {
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ goto nothing_to_do;
+ }
+ buf_pool.flush_list_active= true;
const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
- const ulint n_flushing= --buf_pool.n_flush_list_;
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ os_aio_wait_until_no_pending_writes();
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.flush_list_active= false;
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
buf_pool.try_LRU_scan= true;
mysql_mutex_unlock(&buf_pool.mutex);
- if (!n_flushing)
- pthread_cond_broadcast(&buf_pool.done_flush_list);
-
buf_dblwr.flush_buffered_writes();
+ if (n_flushed)
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ n_flushed);
+
DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
return n_flushed;
}
@@ -1796,6 +1832,24 @@ ATTRIBUTE_COLD void log_make_checkpoint()
while (!log_checkpoint());
}
+/** Wait for all dirty pages up to an LSN to be written out.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+static void buf_flush_wait(lsn_t lsn)
+{
+ ut_ad(lsn <= log_sys.get_lsn());
+
+ while (buf_pool.get_oldest_modification(lsn) < lsn)
+ {
+ if (buf_flush_sync_lsn < lsn)
+ buf_flush_sync_lsn= lsn;
+ pthread_cond_signal(&buf_pool.do_flush_list);
+ tpool::tpool_wait_begin();
+ my_cond_wait(&buf_pool.done_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ tpool::tpool_wait_end();
+ }
+}
+
/** Wait until all persistent pages are flushed up to a limit.
@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
@@ -1812,50 +1866,31 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
{
+ MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
- if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)
- ut_d(|| innodb_page_cleaner_disabled_debug))
+ if (UNIV_UNLIKELY(!buf_page_cleaner_is_active))
{
do
{
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
- buf_flush_wait_batch_end_acquiring_mutex(false);
if (n_pages)
{
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
MONITOR_FLUSH_SYNC_COUNT,
MONITOR_FLUSH_SYNC_PAGES, n_pages);
}
- MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
}
while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
-
- goto try_checkpoint;
}
+ else
#endif
- if (buf_flush_sync_lsn < sync_lsn)
- {
- buf_flush_sync_lsn= sync_lsn;
- pthread_cond_signal(&buf_pool.do_flush_list);
- }
-
- do
- {
- tpool::tpool_wait_begin();
- thd_wait_begin(nullptr, THD_WAIT_DISKIO);
- my_cond_wait(&buf_pool.done_flush_list,
- &buf_pool.flush_list_mutex.m_mutex);
- thd_wait_end(nullptr);
- tpool::tpool_wait_end();
-
- MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
- }
- while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
+ buf_flush_wait(sync_lsn);
+ thd_wait_end(nullptr);
}
-try_checkpoint:
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
@@ -1894,13 +1929,13 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
}
}
-/** Wait for pending flushes to complete. */
-void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
+/** Wait for pending LRU flush to complete. */
+void buf_flush_wait_LRU_batch_end_acquiring_mutex()
{
- if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list())
+ if (buf_pool.n_flush_LRU())
{
mysql_mutex_lock(&buf_pool.mutex);
- buf_flush_wait_batch_end(lru);
+ buf_flush_wait_LRU_batch_end();
mysql_mutex_unlock(&buf_pool.mutex);
}
}
@@ -1923,10 +1958,6 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
MONITOR_FLUSH_SYNC_PAGES, n_flushed);
}
- /* Attempt to perform a log checkpoint upon completing each batch. */
- if (recv_recovery_is_on())
- recv_sys.apply(true);
-
switch (srv_file_flush_method) {
case SRV_NOSYNC:
case SRV_O_DIRECT_NO_FSYNC:
@@ -1943,7 +1974,8 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
mysql_mutex_unlock(&log_sys.flush_order_mutex);
const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
- if (checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+ if (!recv_recovery_is_on() &&
+ checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
{
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
log_checkpoint_low(checkpoint_lsn, newest_lsn);
@@ -1967,7 +1999,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
else if (measure >= buf_flush_async_lsn)
buf_flush_async_lsn= 0;
- /* wake up buf_flush_wait_flushed() */
+ /* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
lsn= std::max(lsn, target);
@@ -2225,7 +2257,7 @@ furious_flush:
if (UNIV_UNLIKELY(lsn_limit != 0))
{
buf_flush_sync_lsn= 0;
- /* wake up buf_flush_wait_flushed() */
+ /* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
}
unemployed:
@@ -2295,8 +2327,6 @@ unemployed:
if (UNIV_UNLIKELY(lsn_limit != 0))
{
n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
- /* wake up buf_flush_wait_flushed() */
- pthread_cond_broadcast(&buf_pool.done_flush_list);
goto try_checkpoint;
}
else if (idle_flush || !srv_adaptive_flushing)
@@ -2368,8 +2398,8 @@ next:
if (srv_fast_shutdown != 2)
{
- buf_flush_wait_batch_end_acquiring_mutex(true);
- buf_flush_wait_batch_end_acquiring_mutex(false);
+ buf_flush_wait_LRU_batch_end_acquiring_mutex();
+ os_aio_wait_until_no_pending_writes();
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
@@ -2400,15 +2430,6 @@ ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
std::thread(buf_flush_page_cleaner).detach();
}
-/** @return the number of dirty pages in the buffer pool */
-static ulint buf_flush_list_length()
-{
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- return len;
-}
-
/** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool()
{
@@ -2418,39 +2439,59 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool()
service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
"Waiting to flush the buffer pool");
- while (buf_pool.n_flush_list() || buf_flush_list_length())
+ os_aio_wait_until_no_pending_writes();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ while (buf_pool.get_oldest_modification(0))
{
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
buf_flush_list(srv_max_io_capacity);
- timespec abstime;
-
- if (buf_pool.n_flush_list())
- {
- service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
- "Waiting to flush " ULINTPF " pages",
- buf_flush_list_length());
- set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
- mysql_mutex_lock(&buf_pool.mutex);
- while (buf_pool.n_flush_list_)
- my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
- &abstime);
- mysql_mutex_unlock(&buf_pool.mutex);
- }
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Waiting to flush " ULINTPF " pages",
+ UT_LIST_GET_LEN(buf_pool.flush_list));
}
- ut_ad(!buf_pool.any_io_pending());
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ ut_ad(!buf_pool.some_io_pending());
+ ut_ad(!os_aio_pending_writes());
+}
+
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn)
+{
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_wait(lsn);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ thd_wait_end(nullptr);
}
/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync()
{
+ if (recv_recovery_is_on())
+ recv_sys.apply(true);
+
+ thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
for (;;)
{
- const ulint n_flushed= buf_flush_list(srv_max_io_capacity);
- buf_flush_wait_batch_end_acquiring_mutex(false);
- if (!n_flushed && !buf_flush_list_length())
- return;
+ const lsn_t lsn= log_sys.get_lsn();
+ buf_flush_wait(lsn);
+ if (lsn == log_sys.get_lsn())
+ break;
}
+
+ /* Wait for the checkpoint. */
+ while (buf_flush_sync_lsn)
+ my_cond_wait(&buf_pool.done_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ thd_wait_end(nullptr);
}
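Aside: buf_flush_sync() above re-reads the log LSN after each wait so that pages logged concurrently are also covered. A small self-contained sketch of that "repeat until the LSN stops moving" idea; get_lsn() and wait_flushed() here are hypothetical stand-ins, not the real log_sys/buf_flush_wait interfaces.

  #include <cstdint>

  typedef uint64_t lsn_t;

  // Hypothetical stand-ins for the end of the redo log and for waiting
  // until all pages older than a given LSN have been written out.
  static lsn_t current_lsn = 100;
  static lsn_t flushed_lsn = 0;

  static lsn_t get_lsn() { return current_lsn; }

  static void wait_flushed(lsn_t lsn)
  {
    if (flushed_lsn < lsn)
      flushed_lsn = lsn;              // pretend the page cleaner caught up
  }

  int main()
  {
    for (;;)
    {
      const lsn_t lsn = get_lsn();    // snapshot the end of the log
      wait_flushed(lsn);              // flush everything up to that point
      if (lsn == get_lsn())           // nothing new was logged meanwhile
        break;                        // clean up to 'lsn'
    }
    return flushed_lsn >= current_lsn ? 0 : 1;
  }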
#ifdef UNIV_DEBUG
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index a5b3cc72fc7..77d367074b0 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -113,7 +113,7 @@ the object will be freed.
@param bpage buffer block
@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here)
+@param chain locked buf_pool.page_hash chain (will be released here)
@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
If a compressed page is freed other compressed pages may be relocated.
@@ -122,7 +122,8 @@ caller needs to free the page to the free list
@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
this case the block is already returned to the buddy allocator. */
static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock, bool zip);
+ buf_pool_t::hash_chain &chain,
+ bool zip);
/** Free a block to buf_pool */
static void buf_LRU_block_free_hashed_page(buf_block_t *block)
@@ -807,9 +808,11 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
/* We must hold an exclusive hash_lock to prevent
bpage->can_relocate() from changing due to a concurrent
execution of buf_page_get_low(). */
- const ulint fold = id.fold();
- page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
+ buf_pool_t::hash_chain& chain= buf_pool.page_hash.cell_get(id.fold());
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+ /* We cannot use transactional_lock_guard here,
+ because buf_buddy_relocate() in buf_buddy_free() could get stuck. */
+ hash_lock.lock();
lsn_t oldest_modification = bpage->oldest_modification_acquire();
if (UNIV_UNLIKELY(!bpage->can_relocate())) {
@@ -839,7 +842,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
} else if (oldest_modification
&& bpage->state() != BUF_BLOCK_FILE_PAGE) {
func_exit:
- hash_lock->write_unlock();
+ hash_lock.unlock();
return(false);
} else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
@@ -859,7 +862,7 @@ func_exit:
ut_ad(bpage->can_relocate());
- if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) {
+ if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip)) {
ut_ad(!b);
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
return(true);
@@ -875,7 +878,7 @@ func_exit:
if (UNIV_LIKELY_NULL(b)) {
buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
- ut_ad(!buf_pool.page_hash_get_low(id, fold));
+ ut_ad(!buf_pool.page_hash.get(id, chain));
ut_ad(b->zip_size());
/* The field in_LRU_list of
@@ -894,8 +897,10 @@ func_exit:
ut_ad(!b->in_zip_hash);
ut_ad(b->in_LRU_list);
ut_ad(b->in_page_hash);
+ ut_d(b->in_page_hash = false);
+ b->hash = nullptr;
- HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, b);
+ buf_pool.page_hash.append(chain, b);
/* Insert b where bpage was in the LRU list. */
if (prev_b) {
@@ -951,9 +956,10 @@ func_exit:
decompressing the block while we release
hash_lock. */
b->set_io_fix(BUF_IO_PIN);
- hash_lock->write_unlock();
+ goto release;
} else if (!zip) {
- hash_lock->write_unlock();
+release:
+ hash_lock.unlock();
}
buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
@@ -1063,7 +1069,7 @@ the object will be freed.
@param bpage buffer block
@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here)
+@param chain locked buf_pool.page_hash chain (will be released here)
@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
If a compressed page is freed other compressed pages may be relocated.
@@ -1072,10 +1078,11 @@ caller needs to free the page to the free list
@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
this case the block is already returned to the buddy allocator. */
static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock, bool zip)
+ buf_pool_t::hash_chain &chain,
+ bool zip)
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(hash_lock->is_write_locked());
+ ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked());
ut_a(bpage->io_fix() == BUF_IO_NONE);
ut_a(!bpage->buf_fix_count());
@@ -1155,7 +1162,8 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
}
ut_ad(!bpage->in_zip_hash);
- HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, id.fold(), bpage);
+ buf_pool.page_hash.remove(chain, bpage);
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
switch (bpage->state()) {
case BUF_BLOCK_ZIP_PAGE:
@@ -1165,7 +1173,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
ut_a(bpage->zip.ssize);
ut_ad(!bpage->oldest_modification());
- hash_lock->write_unlock();
+ hash_lock.unlock();
buf_pool_mutex_exit_forbid();
buf_buddy_free(bpage->zip.data, bpage->zip_size());
@@ -1209,7 +1217,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
and by the time we'll release it in the caller we'd
have inserted the compressed only descriptor in the
page_hash. */
- hash_lock->write_unlock();
+ hash_lock.unlock();
if (bpage->zip.data) {
/* Free the compressed page. */
@@ -1240,20 +1248,38 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
return(false);
}
-/** Remove one page from LRU list and put it to free list.
-@param bpage file page to be freed
-@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here) */
-void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock)
+/** Release and evict a corrupted page.
+@param bpage page that was being read */
+ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage)
{
+ const page_id_t id(bpage->id());
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+
+ mysql_mutex_lock(&mutex);
+ hash_lock.lock();
+
+ ut_ad(bpage->io_fix() == BUF_IO_READ);
+ ut_ad(!bpage->oldest_modification());
+ bpage->set_corrupt_id();
+ bpage->io_unfix();
+
+ if (bpage->state() == BUF_BLOCK_FILE_PAGE)
+ reinterpret_cast<buf_block_t*>(bpage)->lock.x_unlock(true);
+
while (bpage->buf_fix_count())
/* Wait for other threads to release the fix count
before releasing the bpage from LRU list. */
(void) LF_BACKOFF();
- if (buf_LRU_block_remove_hashed(bpage, id, hash_lock, true))
+ /* remove from LRU and page_hash */
+ if (buf_LRU_block_remove_hashed(bpage, id, chain, true))
buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
+
+ mysql_mutex_unlock(&mutex);
+
+ ut_d(auto n=) n_pend_reads--;
+ ut_ad(n > 0);
}
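Aside: corrupted_evict() above busy-waits for the buffer-fix count to drop, using LF_BACKOFF() to avoid hammering the cache line. A minimal sketch of the same wait with a plain yield-based backoff; the counter is a simplified stand-in for bpage->buf_fix_count().

  #include <atomic>
  #include <cstdint>
  #include <thread>

  static std::atomic<uint32_t> buf_fix_count{0};  // stand-in for the page fix count

  // Wait until no other thread holds a fix on the page, as in
  // buf_pool_t::corrupted_evict(); yield to the scheduler instead of
  // spinning hot (the real code uses LF_BACKOFF()).
  static void wait_until_unfixed()
  {
    while (buf_fix_count.load(std::memory_order_acquire))
      std::this_thread::yield();
  }

  int main()
  {
    wait_until_unfixed();   // returns immediately: the count is already 0
    return 0;
  }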
/** Update buf_pool.LRU_old_ratio.
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index 13e1a35f08a..080f87adb0f 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -50,17 +50,17 @@ i/o-fixed buffer blocks */
/** Remove the sentinel block for the watch before replacing it with a
real block. watch_unset() or watch_occurred() will notice
that the block has been replaced with the real block.
-@param watch sentinel */
-inline void buf_pool_t::watch_remove(buf_page_t *watch)
+@param watch sentinel
+@param chain locked hash table chain */
+inline void buf_pool_t::watch_remove(buf_page_t *watch,
+ buf_pool_t::hash_chain &chain)
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(hash_lock_get(watch->id())->is_write_locked());
+ ut_ad(page_hash.lock_get(chain).is_write_locked());
ut_a(watch_is_sentinel(*watch));
if (watch->buf_fix_count())
{
- ut_ad(watch->in_page_hash);
- ut_d(watch->in_page_hash= false);
- HASH_DELETE(buf_page_t, hash, &page_hash, watch->id().fold(), watch);
+ page_hash.remove(chain, watch);
watch->set_buf_fix_count(0);
}
ut_ad(!watch->in_page_hash);
@@ -83,6 +83,7 @@ and the lock released later.
requested (for ROW_FORMAT=COMPRESSED)
@return pointer to the block
@retval NULL in case of an error */
+TRANSACTIONAL_TARGET
static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
ulint zip_size, bool unzip)
{
@@ -114,11 +115,11 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
block->lock.x_lock(true);
}
- const ulint fold= page_id.fold();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
mysql_mutex_lock(&buf_pool.mutex);
- buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
+ buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain);
if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
{
/* The page is already in the buffer pool. */
@@ -135,27 +136,26 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
bpage= &block->page;
/* Insert into the hash table of file pages */
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
-
- if (hash_page)
{
- /* Preserve the reference count. */
- auto buf_fix_count= hash_page->buf_fix_count();
- ut_a(buf_fix_count > 0);
- block->page.add_buf_fix_count(buf_fix_count);
- buf_pool.watch_remove(hash_page);
- }
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
- block->page.set_io_fix(BUF_IO_READ);
- block->page.set_state(BUF_BLOCK_FILE_PAGE);
- ut_ad(!block->page.in_page_hash);
- ut_d(block->page.in_page_hash= true);
- HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
- hash_lock->write_unlock();
+ if (hash_page)
+ {
+ /* Preserve the reference count. */
+ auto buf_fix_count= hash_page->buf_fix_count();
+ ut_a(buf_fix_count > 0);
+ block->page.add_buf_fix_count(buf_fix_count);
+ buf_pool.watch_remove(hash_page, chain);
+ }
+
+ block->page.set_io_fix(BUF_IO_READ);
+ block->page.set_state(BUF_BLOCK_FILE_PAGE);
+ buf_pool.page_hash.append(chain, &block->page);
+ }
/* The block must be put to the LRU list, to the old blocks */
- buf_LRU_add_block(bpage, true/* to old blocks */);
+ buf_LRU_add_block(&block->page, true/* to old blocks */);
if (UNIV_UNLIKELY(zip_size))
{
@@ -188,7 +188,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
check the page_hash again, as it may have been modified. */
if (UNIV_UNLIKELY(lru))
{
- hash_page= buf_pool.page_hash_get_low(page_id, fold);
+ hash_page= buf_pool.page_hash.get(page_id, chain);
if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
{
@@ -206,23 +206,22 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
-
- if (hash_page)
{
- /* Preserve the reference count. It can be 0 if
- buf_pool_t::watch_unset() is executing concurrently,
- waiting for buf_pool.mutex, which we are holding. */
- bpage->add_buf_fix_count(hash_page->buf_fix_count());
- buf_pool.watch_remove(hash_page);
- }
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
- ut_ad(!bpage->in_page_hash);
- ut_d(bpage->in_page_hash= true);
- HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
- bpage->set_io_fix(BUF_IO_READ);
- hash_lock->write_unlock();
+ if (hash_page)
+ {
+ /* Preserve the reference count. It can be 0 if
+ buf_pool_t::watch_unset() is executing concurrently,
+ waiting for buf_pool.mutex, which we are holding. */
+ bpage->add_buf_fix_count(hash_page->buf_fix_count());
+ buf_pool.watch_remove(hash_page, chain);
+ }
+
+ buf_pool.page_hash.append(chain, bpage);
+ bpage->set_io_fix(BUF_IO_READ);
+ }
/* The block must be put to the LRU list, to the old blocks.
The zip size is already set into the page zip */
@@ -375,6 +374,7 @@ wants to access
@return number of page read requests issued; NOTE that if we read ibuf
pages, it may happen that the page at the given page number does not
get read even if we return a positive value! */
+TRANSACTIONAL_TARGET
ulint
buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
{
@@ -408,13 +408,12 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
for (page_id_t i= low; i < high; ++i)
{
- const ulint fold= i.fold();
- page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
- const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold);
- bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage);
- hash_lock->read_unlock();
- if (found && !--count)
- goto read_ahead;
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ if (const buf_page_t *bpage= buf_pool.page_hash.get(i, chain))
+ if (bpage->is_accessed() && buf_page_peek_if_young(bpage) && !--count)
+ goto read_ahead;
}
no_read_ahead:
@@ -556,6 +555,7 @@ which could result in a deadlock if the OS does not support asynchronous io.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] ibuf whether we are inside the ibuf routine
@return number of page read requests issued */
+TRANSACTIONAL_TARGET
ulint
buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
{
@@ -608,9 +608,19 @@ fail:
unsigned prev_accessed= 0;
for (page_id_t i= low; i != high_1; ++i)
{
- const ulint fold= i.fold();
- page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
- const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ const buf_page_t* bpage= buf_pool.page_hash.get(i, chain);
+ if (!bpage)
+ {
+ if (i == page_id)
+ goto fail;
+failed:
+ if (--count)
+ continue;
+ goto fail;
+ }
if (i == page_id)
{
/* Read the natural predecessor and successor page addresses from
@@ -618,12 +628,6 @@ fail:
on the page, we do not acquire an s-latch on the page, this is to
prevent deadlocks. The hash_lock is only protecting the
buf_pool.page_hash for page i, not the bpage contents itself. */
- if (!bpage)
- {
-hard_fail:
- hash_lock->read_unlock();
- goto fail;
- }
const byte *f;
switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
case BUF_BLOCK_FILE_PAGE:
@@ -633,38 +637,31 @@ hard_fail:
f= bpage->zip.data;
break;
default:
- goto hard_fail;
+ ut_ad("invalid state" == 0);
+ goto fail;
}
uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
if (prev == FIL_NULL || next == FIL_NULL)
- goto hard_fail;
+ goto fail;
page_id_t id= page_id;
if (descending && next - 1 == page_id.page_no())
id.set_page_no(prev);
else if (!descending && prev + 1 == page_id.page_no())
id.set_page_no(next);
else
- goto hard_fail; /* Successor or predecessor not in the right order */
+ goto fail; /* Successor or predecessor not in the right order */
new_low= id - (id.page_no() % buf_read_ahead_area);
new_high_1= new_low + (buf_read_ahead_area - 1);
if (id != new_low && id != new_high_1)
/* This is not a border page of the area: return */
- goto hard_fail;
+ goto fail;
if (new_high_1.page_no() > space->last_page_number())
/* The area is not whole */
- goto hard_fail;
- }
- else if (!bpage)
- {
-failed:
- hash_lock->read_unlock();
- if (--count)
- continue;
- goto fail;
+ goto fail;
}
const unsigned accessed= bpage->is_accessed();
@@ -681,7 +678,6 @@ failed:
prev_accessed= accessed;
if (fail)
goto failed;
- hash_lock->read_unlock();
}
/* If we got this far, read-ahead can be sensible: do it */
diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc
index 667a64f907a..c88227dbade 100644
--- a/storage/innobase/dict/dict0crea.cc
+++ b/storage/innobase/dict/dict0crea.cc
@@ -1381,6 +1381,8 @@ dberr_t dict_sys_t::create_or_check_sys_tables()
trx_start_for_ddl(trx);
{
+ /* Do not bother with transactional memory; this is only
+ executed at startup, with no conflicts present. */
LockMutexGuard g{SRW_LOCK_CALL};
trx->mutex_lock();
lock_table_create(dict_sys.sys_tables, LOCK_X, trx);
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index 6074398afd3..34080b22095 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -1181,6 +1181,7 @@ inline void dict_sys_t::add(dict_table_t* table)
/** Test whether a table can be evicted from dict_sys.table_LRU.
@param table table to be considered for eviction
@return whether the table can be evicted */
+TRANSACTIONAL_TARGET
static bool dict_table_can_be_evicted(dict_table_t *table)
{
ut_ad(dict_sys.locked());
@@ -2064,6 +2065,7 @@ dict_index_add_to_cache(
/**********************************************************************//**
Removes an index from the dictionary cache. */
+TRANSACTIONAL_TARGET
static
void
dict_index_remove_from_cache_low(
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 2581cecddc3..ac70923b446 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -1052,10 +1052,10 @@ fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
if (UNIV_UNLIKELY(space->is_being_truncated))
{
const page_id_t page_id{space->id, offset};
- const ulint fold= page_id.fold();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
mysql_mutex_lock(&buf_pool.mutex);
block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash_get_low(page_id, fold));
+ (buf_pool.page_hash.get(page_id, chain));
if (block && block->page.oldest_modification() <= 1)
block= nullptr;
mysql_mutex_unlock(&buf_pool.mutex);
diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc
index f067d43d6a3..41f32cf7240 100644
--- a/storage/innobase/gis/gis0sea.cc
+++ b/storage/innobase/gis/gis0sea.cc
@@ -78,6 +78,7 @@ rtr_adjust_parent_path(
Find the next matching record. This function is used by search
or record locating during index delete/update.
@return true if there is suitable record found, otherwise false */
+TRANSACTIONAL_TARGET
static
bool
rtr_pcur_getnext_from_path(
@@ -387,7 +388,7 @@ rtr_pcur_getnext_from_path(
trx_t* trx = thr_get_trx(
btr_cur->rtr_info->thr);
{
- LockMutexGuard g{SRW_LOCK_CALL};
+ TMLockTrxGuard g{TMLockTrxArgs(*trx)};
lock_init_prdt_from_mbr(
&prdt, &btr_cur->rtr_info->mbr,
mode, trx->lock.lock_heap);
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 0b0c3b045ff..6729e7e1747 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -3185,6 +3185,7 @@ the query cache.
@param[in] table table object
@param[in] trx transaction object
@return whether the storing or retrieving from the query cache is permitted */
+TRANSACTIONAL_TARGET
static bool innobase_query_caching_table_check_low(
dict_table_t* table, trx_t* trx)
{
@@ -3211,6 +3212,16 @@ static bool innobase_query_caching_table_check_low(
return false;
}
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin()) {
+ if (table->lock_mutex_is_locked())
+ xabort();
+ auto len = UT_LIST_GET_LEN(table->locks);
+ xend();
+ return len == 0;
+ }
+#endif
+
table->lock_mutex_lock();
auto len= UT_LIST_GET_LEN(table->locks);
table->lock_mutex_unlock();
@@ -18562,7 +18573,9 @@ void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id)
trx_t *vtrx= thd_to_trx(vthd);
if (vtrx)
{
- lock_sys.wr_lock(SRW_LOCK_CALL);
+ /* Do not bother with lock elision using transactional memory here;
+ this is rather complex code. */
+ LockMutexGuard g{SRW_LOCK_CALL};
mysql_mutex_lock(&lock_sys.wait_mutex);
vtrx->mutex_lock();
/* victim transaction is either active or prepared, if it has already
@@ -18607,7 +18620,6 @@ void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id)
WSREP_DEBUG("kill transaction skipped due to wsrep_aborter set");
}
}
- lock_sys.wr_unlock();
mysql_mutex_unlock(&lock_sys.wait_mutex);
vtrx->mutex_unlock();
}
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
index bd471dff765..fbec3bee38c 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.cc
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -3145,7 +3145,7 @@ or clustered
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out] thr query thread
@return DB_SUCCESS, DB_STRONG_FAIL or other error */
-static MY_ATTRIBUTE((warn_unused_result))
+static TRANSACTIONAL_TARGET MY_ATTRIBUTE((warn_unused_result))
dberr_t
ibuf_insert_low(
ulint mode,
@@ -3310,7 +3310,8 @@ fail_exit:
/* We check if the index page is suitable for buffered entries */
- if (buf_pool.page_hash_contains(page_id)) {
+ if (buf_pool.page_hash_contains(
+ page_id, buf_pool.page_hash.cell_get(page_id.fold()))) {
commit_exit:
ibuf_mtr_commit(&bitmap_mtr);
goto fail_exit;
@@ -3469,6 +3470,7 @@ is clustered or unique.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out] thr query thread
@return true if success */
+TRANSACTIONAL_TARGET
bool
ibuf_insert(
ibuf_op_t op,
@@ -3556,7 +3558,8 @@ check_watch:
that the issuer of IBUF_OP_DELETE has called
buf_pool_t::watch_set(). */
- if (buf_pool.page_hash_contains<true>(page_id)) {
+ if (buf_pool.page_hash_contains<true>(
+ page_id, buf_pool.page_hash.cell_get(page_id.fold()))) {
/* A buffer pool watch has been set or the
page has been read into the buffer pool.
Do not buffer the request. If a purge operation
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
index 4339c895400..b45183a6428 100644
--- a/storage/innobase/include/btr0sea.h
+++ b/storage/innobase/include/btr0sea.h
@@ -347,11 +347,21 @@ struct btr_search_sys_t
extern btr_search_sys_t btr_search_sys;
/** @return number of leaf pages pointed to by the adaptive hash index */
-inline ulint dict_index_t::n_ahi_pages() const
+TRANSACTIONAL_INLINE inline ulint dict_index_t::n_ahi_pages() const
{
if (!btr_search_enabled)
return 0;
srw_spin_lock *latch= &btr_search_sys.get_part(*this)->latch;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (latch->is_locked())
+ xabort();
+ ulint ref_count= search_info->ref_count;
+ xend();
+ return ref_count;
+ }
+#endif
latch->rd_lock(SRW_LOCK_CALL);
ulint ref_count= search_info->ref_count;
latch->rd_unlock();
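Aside: the xbegin()/xabort()/xend() wrappers used above implement lock elision: run the read-only section as a hardware transaction that subscribes to the lock word (so a concurrent writer aborts it), and fall back to taking the lock for real. A minimal sketch of the pattern using the Intel RTM intrinsics directly; the lock word and counter here are simplified stand-ins, and the fallback is a toy spin lock, not the InnoDB rw-lock. Building it requires RTM support (e.g. -mrtm with GCC/Clang).

  #include <immintrin.h>
  #include <atomic>

  static std::atomic<bool> locked{false};   // stand-in for the rw-lock word
  static unsigned long ref_count = 0;       // the value we want to read

  static unsigned long read_ref_count()
  {
    if (_xbegin() == _XBEGIN_STARTED)       // try a hardware transaction
    {
      if (locked.load(std::memory_order_relaxed))
        _xabort(0);                         // writer holds the lock: abort, use fallback
      unsigned long n = ref_count;          // read inside the transaction
      _xend();
      return n;
    }
    // Fallback path: take the lock for real (sketched as a spin lock).
    while (locked.exchange(true, std::memory_order_acquire))
      ;
    unsigned long n = ref_count;
    locked.store(false, std::memory_order_release);
    return n;
  }

  int main()
  {
    return static_cast<int>(read_ref_count());
  }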
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index 480c41669e0..00b19ff3d79 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -36,11 +36,11 @@ Created 11/5/1995 Heikki Tuuri
#include "assume_aligned.h"
#include "buf0types.h"
#ifndef UNIV_INNOCHECKSUM
-#include "hash0hash.h"
#include "ut0byte.h"
#include "page0types.h"
#include "log0log.h"
#include "srv0srv.h"
+#include "transactional_lock_guard.h"
#include <ostream>
// Forward declaration
@@ -169,30 +169,10 @@ operator<<(
const page_id_t page_id);
#ifndef UNIV_INNOCHECKSUM
-/*********************************************************************//**
-Gets the current size of buffer buf_pool in bytes.
-@return size in bytes */
-UNIV_INLINE
-ulint
-buf_pool_get_curr_size(void);
-/*========================*/
-
-/********************************************************************//**
-Allocates a buf_page_t descriptor. This function must succeed. In case
-of failure we assert in this function. */
-UNIV_INLINE
-buf_page_t*
-buf_page_alloc_descriptor(void)
-/*===========================*/
- MY_ATTRIBUTE((malloc));
-/********************************************************************//**
-Free a buf_page_t descriptor. */
-UNIV_INLINE
-void
-buf_page_free_descriptor(
-/*=====================*/
- buf_page_t* bpage) /*!< in: bpage descriptor to free. */
- MY_ATTRIBUTE((nonnull));
+# define buf_pool_get_curr_size() srv_buf_pool_curr_size
+# define buf_page_alloc_descriptor() \
+ static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof(buf_page_t)))
+# define buf_page_free_descriptor(bpage) ut_free(bpage)
/** Allocate a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
@@ -349,25 +329,6 @@ void buf_page_make_young(buf_page_t *bpage);
@param[in,out] mtr mini-transaction */
void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr);
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_page_get_freed_page_clock(
-/*==========================*/
- const buf_page_t* bpage) /*!< in: block */
- MY_ATTRIBUTE((warn_unused_result));
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_block_get_freed_page_clock(
-/*===========================*/
- const buf_block_t* block) /*!< in: block */
- MY_ATTRIBUTE((warn_unused_result));
-
/** Determine if a block is still close enough to the MRU end of the LRU list
meaning that it is not in danger of getting evicted and also implying
that it has been accessed recently.
@@ -665,7 +626,7 @@ class buf_page_t
/* @{ */
public: // FIXME: fix fil_iterate()
- /** Page id. Protected by buf_pool.hash_lock_get(id) when
+ /** Page id. Protected by buf_pool.page_hash.lock_get() when
the page is in buf_pool.page_hash. */
page_id_t id_;
private:
@@ -687,13 +648,13 @@ private:
Atomic_relaxed<buf_io_fix> io_fix_;
/** Block state. @see in_file().
State transitions between in_file() states and to
- BUF_BLOCK_REMOVE_HASH are protected by buf_pool.hash_lock_get(id)
+ BUF_BLOCK_REMOVE_HASH are protected by buf_pool.page_hash.lock_get()
when the block is in buf_pool.page_hash.
Other transitions when in_LRU_list are protected by buf_pool.mutex. */
buf_page_state state_;
public:
- /** buf_pool.page_hash link; protected by buf_pool.hash_lock_get(id) */
+ /** buf_pool.page_hash link; protected by buf_pool.page_hash.lock_get() */
buf_page_t *hash;
/* @} */
page_zip_des_t zip; /*!< compressed page; zip.data
@@ -801,7 +762,6 @@ public:
ut_d(in_free_list= false);
ut_d(in_LRU_list= false);
ut_d(in_page_hash= false);
- HASH_INVALIDATE(this, hash);
}
/** Initialize some more fields */
@@ -819,6 +779,7 @@ public:
init();
id_= id;
buf_fix_count_= buf_fix_count;
+ hash= nullptr;
}
public:
@@ -862,9 +823,16 @@ public:
inline void set_oldest_modification(lsn_t lsn);
/** Clear oldest_modification after removing from buf_pool.flush_list */
inline void clear_oldest_modification();
+ /** Reset the oldest_modification when marking a persistent page freed */
+ void reset_oldest_modification()
+ {
+ ut_ad(oldest_modification() > 2);
+ oldest_modification_.store(1, std::memory_order_release);
+ }
+
/** Note that a block is no longer dirty, while not removing
it from buf_pool.flush_list */
- inline void clear_oldest_modification(bool temporary);
+ inline void write_complete(bool temporary);
/** Notify that a page in a temporary tablespace has been modified. */
void set_temp_modified()
@@ -934,9 +902,6 @@ public:
/** @return whether the block is modified and ready for flushing */
inline bool ready_for_flush() const;
- /** @return whether the state can be changed to BUF_BLOCK_NOT_USED */
- bool ready_for_replace() const
- { return !oldest_modification() && can_relocate(); }
/** @return whether the block can be relocated in memory.
The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
inline bool can_relocate() const;
@@ -1347,7 +1312,14 @@ class buf_pool_t
inline const buf_block_t *not_freed() const;
#endif /* UNIV_DEBUG */
};
-
+public:
+ /** Hash cell chain in page_hash_table */
+ struct hash_chain
+ {
+ /** pointer to the first block */
+ buf_page_t *first;
+ };
+private:
/** Withdraw blocks from the buffer pool until meeting withdraw_target.
@return whether retry is needed */
inline bool withdraw_blocks();
@@ -1509,84 +1481,33 @@ public:
return is_block_field(reinterpret_cast<const void*>(block));
}
- /** Get the page_hash latch for a page */
- page_hash_latch *hash_lock_get(const page_id_t id) const
- {
- return page_hash.lock_get(id.fold());
- }
-
- /** Look up a block descriptor.
- @param id page identifier
- @param fold id.fold()
- @return block descriptor, possibly in watch[]
- @retval nullptr if not found*/
- buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold)
- {
- ut_ad(id.fold() == fold);
-#ifdef SAFE_MUTEX
- DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
- page_hash.lock_get(fold)->is_locked());
-#endif /* SAFE_MUTEX */
- buf_page_t *bpage;
- /* Look for the page in the hash table */
- HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage,
- ut_ad(bpage->in_page_hash), id == bpage->id());
- return bpage;
- }
-private:
- /** Look up a block descriptor.
- @tparam exclusive whether the latch is to be acquired exclusively
- @tparam watch whether to allow watch_is_sentinel()
- @param page_id page identifier
- @param fold page_id.fold()
- @param hash_lock pointer to the acquired latch (to be released by caller)
- @return pointer to the block
- @retval nullptr if no block was found; !lock || !*lock will also hold */
- template<bool exclusive,bool watch>
- buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
- page_hash_latch **hash_lock)
+public:
+ /** @return whether the buffer pool contains a page
+ @tparam allow_watch whether to allow watch_is_sentinel()
+ @param page_id page identifier
+ @param chain hash table chain for page_id.fold() */
+ template<bool allow_watch= false>
+ TRANSACTIONAL_INLINE
+ bool page_hash_contains(const page_id_t page_id, hash_chain &chain)
{
- ut_ad(hash_lock || !exclusive);
- page_hash_latch *latch= page_hash.lock<exclusive>(fold);
- buf_page_t *bpage= page_hash_get_low(page_id, fold);
- if (!bpage || watch_is_sentinel(*bpage))
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
+ buf_page_t *bpage= page_hash.get(page_id, chain);
+ if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)])
{
- latch->release<exclusive>();
- if (hash_lock)
- *hash_lock= nullptr;
- return watch ? bpage : nullptr;
+ ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(!bpage->zip.data);
+ if (!allow_watch)
+ bpage= nullptr;
+ }
+ else if (bpage)
+ {
+ ut_ad(page_id == bpage->id());
+ ut_ad(bpage->in_file());
}
-
- ut_ad(bpage->in_file());
- ut_ad(page_id == bpage->id());
-
- if (hash_lock)
- *hash_lock= latch; /* to be released by the caller */
- else
- latch->release<exclusive>();
return bpage;
}
-public:
- /** Look up a block descriptor.
- @tparam exclusive whether the latch is to be acquired exclusively
- @param page_id page identifier
- @param fold page_id.fold()
- @param hash_lock pointer to the acquired latch (to be released by caller)
- @return pointer to the block
- @retval nullptr if no block was found; !lock || !*lock will also hold */
- template<bool exclusive>
- buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
- page_hash_latch **hash_lock)
- { return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); }
-
- /** @return whether the buffer pool contains a page
- @tparam watch whether to allow watch_is_sentinel()
- @param page_id page identifier */
- template<bool watch= false>
- bool page_hash_contains(const page_id_t page_id)
- {
- return page_hash_get_locked<false,watch>(page_id, page_id.fold(), nullptr);
- }
/** Determine if a block is a sentinel for a buffer pool watch.
@param bpage page descriptor
@@ -1595,13 +1516,14 @@ public:
{
#ifdef SAFE_MUTEX
DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
- hash_lock_get(bpage.id())->is_locked());
+ page_hash.lock_get(page_hash.cell_get(bpage.id().fold())).
+ is_locked());
#endif /* SAFE_MUTEX */
- ut_ad(bpage.in_file());
-
if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)])
{
- ut_ad(bpage.state() != BUF_BLOCK_ZIP_PAGE || bpage.zip.data);
+ ut_ad(bpage.state() == BUF_BLOCK_ZIP_PAGE
+ ? !!bpage.zip.data
+ : bpage.state() == BUF_BLOCK_FILE_PAGE);
return false;
}
@@ -1615,37 +1537,37 @@ public:
This may only be called after !watch_set() and before invoking watch_unset().
@param id page identifier
@return whether the page was read to the buffer pool */
+ TRANSACTIONAL_INLINE
bool watch_occurred(const page_id_t id)
{
- const ulint fold= id.fold();
- page_hash_latch *hash_lock= page_hash.lock<false>(fold);
+ hash_chain &chain= page_hash.cell_get(id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
/* The page must exist because watch_set() increments buf_fix_count. */
- buf_page_t *bpage= page_hash_get_low(id, fold);
- const bool is_sentinel= watch_is_sentinel(*bpage);
- hash_lock->read_unlock();
- return !is_sentinel;
+ return !watch_is_sentinel(*page_hash.get(id, chain));
}
/** Register a watch for a page identifier. The caller must hold an
exclusive page hash latch. The *hash_lock may be released,
relocated, and reacquired.
@param id page identifier
- @param hash_lock exclusively held page_hash latch
+ @param chain hash table chain with exclusively held page_hash
@return a buffer pool block corresponding to id
@retval nullptr if the block was not present, and a watch was installed */
- inline buf_page_t *watch_set(const page_id_t id,
- page_hash_latch **hash_lock);
+ inline buf_page_t *watch_set(const page_id_t id, hash_chain &chain);
/** Stop watching whether a page has been read in.
watch_set(id) must have returned nullptr before.
- @param id page identifier */
- void watch_unset(const page_id_t id);
+ @param id page identifier
+ @param chain unlocked hash table chain */
+ void watch_unset(const page_id_t id, hash_chain &chain);
/** Remove the sentinel block for the watch before replacing it with a
real block. watch_unset() or watch_occurred() will notice
that the block has been replaced with the real block.
- @param watch sentinel */
- inline void watch_remove(buf_page_t *watch);
+ @param watch sentinel
+ @param chain locked hash table chain */
+ inline void watch_remove(buf_page_t *watch, hash_chain &chain);
/** @return whether less than 1/4 of the buffer pool is available */
bool running_out() const
@@ -1690,13 +1612,12 @@ public:
ulint n_flush_LRU_;
/** broadcast when n_flush_LRU reaches 0; protected by mutex */
pthread_cond_t done_flush_LRU;
- /** Number of pending flush_list flush; protected by mutex */
- ulint n_flush_list_;
- /** broadcast when n_flush_list reaches 0; protected by mutex */
+ /** whether a flush_list batch is active; protected by flush_list_mutex */
+ bool flush_list_active;
+ /** broadcast when a batch completes; protected by flush_list_mutex */
pthread_cond_t done_flush_list;
TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; }
- TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; }
/** @name General fields */
/* @{ */
@@ -1723,7 +1644,7 @@ public:
/** read-ahead request size in pages */
Atomic_counter<uint32_t> read_ahead_area;
- /** Hash table with singly-linked overflow lists. @see hash_table_t */
+ /** Hash table with singly-linked overflow lists */
struct page_hash_table
{
static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "less than 64 bytes");
@@ -1739,7 +1660,7 @@ public:
/** number of payload elements in array[] */
Atomic_relaxed<ulint> n_cells;
/** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
- hash_cell_t *array;
+ hash_chain *array;
/** Create the hash table.
@param n the lower bound of n_cells */
@@ -1766,29 +1687,72 @@ public:
{
return pad(hash(fold, n_cells));
}
- /** Get a page_hash latch. */
- page_hash_latch *lock_get(ulint fold, ulint n) const
+ public:
+ /** @return the latch covering a hash table chain */
+ static page_hash_latch &lock_get(hash_chain &chain)
{
static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
"must be one less than a power of 2");
- return reinterpret_cast<page_hash_latch*>
- (&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]);
+ const size_t addr= reinterpret_cast<size_t>(&chain);
+ ut_ad(addr & (ELEMENTS_PER_LATCH * sizeof chain));
+ return *reinterpret_cast<page_hash_latch*>
+ (addr & ~(ELEMENTS_PER_LATCH * sizeof chain));
}
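Aside: lock_get() above relies on the array layout: each aligned group of ELEMENTS_PER_LATCH+1 cells starts with a cell that is reinterpreted as the latch, so the latch address can be recovered by masking the low bits of any chain address in the group. A tiny standalone sketch of that masking, with illustrative sizes rather than the real InnoDB constants.

  #include <cassert>
  #include <cstddef>
  #include <cstdint>

  // Illustrative layout: 8 pointer-sized slots per group, slot 0 acting
  // as the latch, slots 1..7 holding hash chains.
  static const size_t ELEMENTS_PER_LATCH = 7;

  struct slot { void *p; };

  static slot *latch_for(slot *chain)
  {
    const uintptr_t addr = reinterpret_cast<uintptr_t>(chain);
    assert(addr & (ELEMENTS_PER_LATCH * sizeof(slot)));   // not the latch slot itself
    return reinterpret_cast<slot*>(addr & ~(ELEMENTS_PER_LATCH * sizeof(slot)));
  }

  int main()
  {
    alignas((ELEMENTS_PER_LATCH + 1) * sizeof(slot))
      slot group[ELEMENTS_PER_LATCH + 1] = {};
    for (size_t i = 1; i <= ELEMENTS_PER_LATCH; i++)
      assert(latch_for(&group[i]) == &group[0]);
    return 0;
  }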
- public:
- /** Get a page_hash latch. */
- page_hash_latch *lock_get(ulint fold) const
- { return lock_get(fold, n_cells); }
-
- /** Acquire an array latch.
- @tparam exclusive whether the latch is to be acquired exclusively
- @param fold hash bucket key */
- template<bool exclusive> page_hash_latch *lock(ulint fold)
+
+ /** Get a hash table slot. */
+ hash_chain &cell_get(ulint fold) const
+ { return array[calc_hash(fold, n_cells)]; }
+
+ /** Append a block descriptor to a hash bucket chain. */
+ void append(hash_chain &chain, buf_page_t *bpage)
+ {
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(!bpage->hash);
+ ut_d(bpage->in_page_hash= true);
+ buf_page_t **prev= &chain.first;
+ while (*prev)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage;
+ }
+
+ /** Remove a block descriptor from a hash bucket chain. */
+ void remove(hash_chain &chain, buf_page_t *bpage)
+ {
+ ut_ad(bpage->in_page_hash);
+ buf_page_t **prev= &chain.first;
+ while (*prev != bpage)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage->hash;
+ ut_d(bpage->in_page_hash= false);
+ bpage->hash= nullptr;
+ }
+
+ /** Replace a block descriptor with another. */
+ void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage)
{
- page_hash_latch *latch= lock_get(fold, n_cells);
- latch->acquire<exclusive>();
- return latch;
+ ut_ad(old->in_page_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_d(old->in_page_hash= false);
+ ut_ad(bpage->hash == old->hash);
+ old->hash= nullptr;
+ buf_page_t **prev= &chain.first;
+ while (*prev != old)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage;
}
+ /** Look up a page in a hash bucket chain. */
+ inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const;
+
    /** Exclusively acquire all latches */
inline void write_lock_all();
@@ -1861,7 +1825,7 @@ public:
last_activity_count= activity_count;
}
- // n_flush_LRU() + n_flush_list()
+ // os_aio_pending_writes()
// is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list
unsigned freed_page_clock;/*!< a sequence number used
@@ -1946,15 +1910,10 @@ public:
/** Reserve a buffer. */
buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
- /** @return whether any I/O is pending */
- bool any_io_pending() const
+ /** @return whether some I/O is pending, excluding os_aio_pending_writes() */
+ bool some_io_pending() const
{
- return n_pend_reads || n_flush_LRU() || n_flush_list();
- }
- /** @return total amount of pending I/O */
- ulint io_pending() const
- {
- return n_pend_reads + n_flush_LRU() + n_flush_list();
+ return n_pend_reads || n_flush_LRU() || flush_list_active;
}
private:
@@ -2024,18 +1983,37 @@ private:
/** The InnoDB buffer pool */
extern buf_pool_t buf_pool;
-inline void page_hash_latch::read_lock()
+inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id,
+ const hash_chain &chain)
+ const
+{
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(mysql_mutex_is_owner(&buf_pool.mutex) ||
+ lock_get(const_cast<hash_chain&>(chain)).is_locked());
+#endif /* SAFE_MUTEX */
+ for (buf_page_t *bpage= chain.first; bpage; bpage= bpage->hash)
+ {
+ ut_ad(bpage->in_page_hash);
+ if (bpage->id() == id)
+ return bpage;
+ }
+ return nullptr;
+}
+
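Aside: page_hash_table stores buf_page_t objects in intrusive singly-linked bucket chains, walked under the per-group latch or buf_pool.mutex. A self-contained sketch of just the chain manipulation (append/remove/lookup) on a simplified node type; the names and the 64-bit key are illustrative, not the InnoDB ones.

  #include <cassert>
  #include <cstdint>

  struct node_t
  {
    uint64_t id;      // stands in for buf_page_t::id()
    node_t  *hash;    // intrusive "next" pointer, as in buf_page_t::hash
  };

  struct hash_chain { node_t *first; };   // one bucket

  // Append at the end of the chain, as page_hash_table::append() does.
  static void chain_append(hash_chain &c, node_t *n)
  {
    n->hash = nullptr;
    node_t **prev = &c.first;
    while (*prev)
      prev = &(*prev)->hash;
    *prev = n;
  }

  // Unlink a node that is known to be in the chain, as remove() does.
  static void chain_remove(hash_chain &c, node_t *n)
  {
    node_t **prev = &c.first;
    while (*prev != n)
      prev = &(*prev)->hash;
    *prev = n->hash;
    n->hash = nullptr;
  }

  // Linear lookup by id, as page_hash_table::get() does.
  static node_t *chain_get(const hash_chain &c, uint64_t id)
  {
    for (node_t *n = c.first; n; n = n->hash)
      if (n->id == id)
        return n;
    return nullptr;
  }

  int main()
  {
    node_t a{1, nullptr}, b{2, nullptr};
    hash_chain c{nullptr};
    chain_append(c, &a);
    chain_append(c, &b);
    assert(chain_get(c, 2) == &b);
    chain_remove(c, &a);
    assert(chain_get(c, 1) == nullptr);
    return 0;
  }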
+#ifdef SUX_LOCK_GENERIC
+inline void page_hash_latch::lock_shared()
{
mysql_mutex_assert_not_owner(&buf_pool.mutex);
if (!read_trylock())
read_lock_wait();
}
-inline void page_hash_latch::write_lock()
+inline void page_hash_latch::lock()
{
if (!write_trylock())
write_lock_wait();
}
+#endif /* SUX_LOCK_GENERIC */
inline void buf_page_t::add_buf_fix_count(uint32_t count)
{
@@ -2060,18 +2038,17 @@ inline void buf_page_t::set_state(buf_page_state state)
we are holding the hash_lock. */
break;
case BUF_BLOCK_MEMORY:
- if (!in_file()) break;
- /* fall through */
- case BUF_BLOCK_FILE_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
break;
case BUF_BLOCK_NOT_USED:
- if (!in_file()) break;
- /* fall through */
+ break;
case BUF_BLOCK_ZIP_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() ||
- (this >= &buf_pool.watch[0] &&
- this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]));
+ if (this >= &buf_pool.watch[0] &&
+ this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)])
+ break;
+ /* fall through */
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(buf_pool.page_hash.lock_get(buf_pool.page_hash.cell_get(id_.fold())).
+ is_write_locked());
break;
}
#endif
@@ -2103,7 +2080,8 @@ inline void buf_page_t::set_corrupt_id()
break;
case BUF_BLOCK_ZIP_PAGE:
case BUF_BLOCK_FILE_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
+ ut_ad(buf_pool.page_hash.lock_get(buf_pool.page_hash.cell_get(id_.fold())).
+ is_write_locked());
break;
case BUF_BLOCK_NOT_USED:
case BUF_BLOCK_MEMORY:
@@ -2138,35 +2116,13 @@ inline void buf_page_t::clear_oldest_modification()
oldest_modification_.store(0, std::memory_order_release);
}
-/** Note that a block is no longer dirty, while not removing
-it from buf_pool.flush_list */
-inline void buf_page_t::clear_oldest_modification(bool temporary)
-{
- ut_ad(temporary == fsp_is_system_temporary(id().space()));
- if (temporary)
- {
- ut_ad(oldest_modification() == 2);
- oldest_modification_= 0;
- }
- else
- {
- /* We use release memory order to guarantee that callers of
- oldest_modification_acquire() will observe the block as
- being detached from buf_pool.flush_list, after reading the value 0. */
- ut_ad(oldest_modification() > 2);
- oldest_modification_.store(1, std::memory_order_release);
- }
-}
-
/** @return whether the block is modified and ready for flushing */
inline bool buf_page_t::ready_for_flush() const
{
mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(in_LRU_list);
ut_a(in_file());
- ut_ad(fsp_is_system_temporary(id().space())
- ? oldest_modification() == 2
- : oldest_modification() > 2);
+ ut_ad(!fsp_is_system_temporary(id().space()) || oldest_modification() == 2);
return io_fix_ == BUF_IO_NONE;
}
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
index 13eda113a21..30fd0b2b1f9 100644
--- a/storage/innobase/include/buf0buf.ic
+++ b/storage/innobase/include/buf0buf.ic
@@ -2,7 +2,7 @@
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
-Copyright (c) 2014, 2020, MariaDB Corporation.
+Copyright (c) 2014, 2021, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -37,42 +37,6 @@ Created 11/5/1995 Heikki Tuuri
#include "buf0rea.h"
#include "fsp0types.h"
-/*********************************************************************//**
-Gets the current size of buffer buf_pool in bytes.
-@return size in bytes */
-UNIV_INLINE
-ulint
-buf_pool_get_curr_size(void)
-/*========================*/
-{
- return(srv_buf_pool_curr_size);
-}
-
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_page_get_freed_page_clock(
-/*==========================*/
- const buf_page_t* bpage) /*!< in: block */
-{
- /* This is sometimes read without holding buf_pool.mutex. */
- return(bpage->freed_page_clock);
-}
-
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_block_get_freed_page_clock(
-/*===========================*/
- const buf_block_t* block) /*!< in: block */
-{
- return(buf_page_get_freed_page_clock(&block->page));
-}
-
/** Determine if a block is still close enough to the MRU end of the LRU list
meaning that it is not in danger of getting evicted and also implying
that it has been accessed recently.
@@ -154,35 +118,6 @@ ok:
}
#endif /* UNIV_DEBUG */
-/********************************************************************//**
-Allocates a buf_page_t descriptor. This function must succeed. In case
-of failure we assert in this function.
-@return: the allocated descriptor. */
-UNIV_INLINE
-buf_page_t*
-buf_page_alloc_descriptor(void)
-/*===========================*/
-{
- buf_page_t* bpage;
-
- bpage = (buf_page_t*) ut_zalloc_nokey(sizeof *bpage);
- ut_ad(bpage);
- MEM_UNDEFINED(bpage, sizeof *bpage);
-
- return(bpage);
-}
-
-/********************************************************************//**
-Free a buf_page_t descriptor. */
-UNIV_INLINE
-void
-buf_page_free_descriptor(
-/*=====================*/
- buf_page_t* bpage) /*!< in: bpage descriptor to free. */
-{
- ut_free(bpage);
-}
-
/** Allocate a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
inline buf_block_t *buf_block_alloc()
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index cd0d068abb0..90c9c94cd1e 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -87,13 +87,6 @@ buf_flush_init_for_writing(
void* page_zip_,
bool use_full_checksum);
-/** Write out dirty blocks from buf_pool.flush_list.
-@param max_n wished maximum mumber of blocks flushed
-@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
-@return the number of processed pages
-@retval 0 if a buf_pool.flush_list batch is already running */
-ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX);
-
/** Try to flush dirty pages that belong to a given tablespace.
@param space tablespace
@param n_flushed number of pages written
@@ -107,9 +100,8 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
@retval 0 if a buf_pool.LRU batch is already running */
ulint buf_flush_LRU(ulint max_n);
-/** Wait until a flush batch ends.
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
-void buf_flush_wait_batch_end(bool lru);
+/** Wait until an LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end();
/** Wait until all persistent pages are flushed up to a limit.
@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
@@ -135,8 +127,8 @@ buf_flush_note_modification(
/** Initialize page_cleaner. */
ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
-/** Wait for pending flushes to complete. */
-void buf_flush_wait_batch_end_acquiring_mutex(bool lru);
+/** Wait for pending LRU flush to complete. */
+void buf_flush_wait_LRU_batch_end_acquiring_mutex();
/** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool();
@@ -146,6 +138,10 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool();
void buf_flush_validate();
#endif /* UNIV_DEBUG */
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn);
+
/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync();
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index 540c14a49c9..aec08e77f54 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,11 +24,10 @@ The database buffer pool LRU replacement algorithm
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0lru_h
-#define buf0lru_h
+#pragma once
-#include "ut0byte.h"
#include "buf0types.h"
+#include "hash0hash.h"
// Forward declaration
struct trx_t;
@@ -132,14 +131,6 @@ policy at the end of each interval. */
void
buf_LRU_stat_update();
-/** Remove one page from LRU list and put it to free list.
-@param bpage file page to be freed
-@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here) */
-void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock)
- MY_ATTRIBUTE((nonnull));
-
#ifdef UNIV_DEBUG
/** Validate the LRU list. */
void buf_LRU_validate();
@@ -200,5 +191,3 @@ Increments the I/O counter in buf_LRU_stat_cur. */
/********************************************************************//**
Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
-
-#endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
index 04b47aaddab..2cb92a5f1df 100644
--- a/storage/innobase/include/buf0types.h
+++ b/storage/innobase/include/buf0types.h
@@ -178,35 +178,60 @@ enum rw_lock_type_t
#include "sux_lock.h"
-class page_hash_latch : public rw_lock
+#ifdef SUX_LOCK_GENERIC
+class page_hash_latch : private rw_lock
{
-public:
/** Wait for a shared lock */
void read_lock_wait();
/** Wait for an exclusive lock */
void write_lock_wait();
-
+public:
/** Acquire a shared lock */
- inline void read_lock();
+ inline void lock_shared();
/** Acquire an exclusive lock */
- inline void write_lock();
-
- /** Acquire a lock */
- template<bool exclusive> void acquire()
- {
- if (exclusive)
- write_lock();
- else
- read_lock();
- }
- /** Release a lock */
- template<bool exclusive> void release()
- {
- if (exclusive)
- write_unlock();
- else
- read_unlock();
- }
+ inline void lock();
+
+#ifdef UNIV_DEBUG
+ /** @return whether an exclusive lock is being held by any thread */
+ bool is_write_locked() const { return rw_lock::is_write_locked(); }
+#endif
+
+ /** @return whether any lock is being held by any thread */
+ bool is_locked() const { return rw_lock::is_locked(); }
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const { return rw_lock::is_locked_or_waiting(); }
+
+ /** Release a shared lock */
+ void unlock_shared() { read_unlock(); }
+ /** Release an exclusive lock */
+ void unlock() { write_unlock(); }
+};
+#elif defined _WIN32 || SIZEOF_SIZE_T >= 8
+class page_hash_latch
+{
+ srw_spin_lock_low lk;
+public:
+ void lock_shared() { lk.rd_lock(); }
+ void unlock_shared() { lk.rd_unlock(); }
+ void lock() { lk.wr_lock(); }
+ void unlock() { lk.wr_unlock(); }
+ bool is_write_locked() const { return lk.is_write_locked(); }
+ bool is_locked() const { return lk.is_locked(); }
+ bool is_locked_or_waiting() const { return lk.is_locked_or_waiting(); }
+};
+#else
+class page_hash_latch
+{
+ srw_spin_mutex lk;
+public:
+ void lock_shared() { lock(); }
+ void unlock_shared() { unlock(); }
+ void lock() { lk.wr_lock(); }
+ void unlock() { lk.wr_unlock(); }
+ bool is_locked() const { return lk.is_locked(); }
+ bool is_write_locked() const { return is_locked(); }
+ bool is_locked_or_waiting() const { return is_locked(); }
};
+#endif
#endif /* !UNIV_INNOCHECKSUM */
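
All three page_hash_latch variants expose the same minimal interface. A short usage sketch, where the latch reference stands in for whatever buf_pool.page_hash.lock_get(...) would return in real code:

// Sketch only: the shared page_hash_latch interface.
void lookup_example(page_hash_latch &latch)
{
  latch.lock_shared();              // shared latch for a hash lookup
  ut_ad(latch.is_locked());
  /* ... read the hash cell ... */
  latch.unlock_shared();

  latch.lock();                     // exclusive latch for modifications
  ut_ad(latch.is_write_locked());
  /* ... insert or remove a buf_page_t ... */
  latch.unlock();
}
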
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 2a7b38f345c..986c767ff49 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -1979,11 +1979,15 @@ struct dict_table_t {
ut_ad(lock_mutex_owner.exchange(0) == os_thread_get_curr_id());
lock_mutex.wr_unlock();
}
+#ifndef SUX_LOCK_GENERIC
+ /** @return whether the lock mutex is held by some thread */
+ bool lock_mutex_is_locked() const noexcept { return lock_mutex.is_locked(); }
+#endif
/* stats mutex lock currently defaults to lock_mutex but in the future,
there could be a use-case to have separate mutex for stats.
-  extra indirection (through inline so no performance hit) should
-  help simplify code and increase long-term maintainability */
+ extra indirection (through inline so no performance hit) should
+ help simplify code and increase long-term maintainability */
void stats_mutex_init() { lock_mutex_init(); }
void stats_mutex_destroy() { lock_mutex_destroy(); }
void stats_mutex_lock() { lock_mutex_lock(); }
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
index 46a43b13a0a..8e7b8dfd1e6 100644
--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
@@ -117,18 +117,6 @@ do {\
HASH_INVALIDATE(DATA, NAME);\
} while (0)
-#define HASH_REPLACE(TYPE, NAME, TABLE, FOLD, DATA_OLD, DATA_NEW) \
- do { \
- (DATA_NEW)->NAME = (DATA_OLD)->NAME; \
- \
- hash_cell_t& cell3333 \
- = (TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
- TYPE** struct3333 = (TYPE**)&cell3333.node; \
- while (*struct3333 != DATA_OLD) { \
- struct3333 = &((*struct3333)->NAME); \
- } \
- *struct3333 = DATA_NEW; \
- } while (0)
/*******************************************************************//**
Gets the first struct in a hash chain, NULL if none. */
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 5f051b8ffbe..50b9792cf2b 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -28,16 +28,15 @@ Created 5/7/1996 Heikki Tuuri
#define lock0lock_h
#include "buf0types.h"
-#include "trx0types.h"
+#include "trx0trx.h"
#include "mtr0types.h"
#include "rem0types.h"
-#include "que0types.h"
-#include "lock0types.h"
#include "hash0hash.h"
#include "srv0srv.h"
#include "ut0vec.h"
#include "gis0rtree.h"
#include "lock0prdt.h"
+#include "transactional_lock_guard.h"
// Forward declaration
class ReadView;
@@ -62,8 +61,10 @@ lock_get_min_heap_no(
/*=================*/
const buf_block_t* block); /*!< in: buffer block */
-/** Discard locks for an index */
-void lock_discard_for_index(const dict_index_t &index);
+/** Discard locks for an index when purging DELETE FROM SYS_INDEXES
+after an aborted CREATE INDEX operation.
+@param index a stale index on which the ADD INDEX operation was aborted */
+ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index);
/*************************************************************//**
Updates the lock table when we have reorganized a page. NOTE: we copy
@@ -571,6 +572,9 @@ class lock_sys_t
{
friend struct LockGuard;
friend struct LockMultiGuard;
+ friend struct TMLockGuard;
+ friend struct TMLockMutexGuard;
+ friend struct TMLockTrxGuard;
/** Hash table latch */
struct hash_latch
@@ -585,6 +589,11 @@ class lock_sys_t
void acquire() { if (!try_acquire()) wait(); }
/** Release a lock */
void release();
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const
+ { return rw_lock::is_locked_or_waiting(); }
+ /** @return whether this latch is possibly held by any thread */
+ bool is_locked() const { return rw_lock::is_locked(); }
#else
{
private:
@@ -596,11 +605,11 @@ class lock_sys_t
void acquire() { lock.wr_lock(); }
/** Release a lock */
void release() { lock.wr_unlock(); }
-#endif
-#ifdef UNIV_DEBUG
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return lock.is_locked_or_waiting(); }
/** @return whether this latch is possibly held by any thread */
- bool is_locked() const
- { return memcmp(this, field_ref_zero, sizeof *this); }
+ bool is_locked() const noexcept { return lock.is_locked(); }
#endif
};
@@ -799,7 +808,14 @@ public:
#ifdef UNIV_DEBUG
/** @return whether the current thread is the lock_sys.latch writer */
bool is_writer() const
- { return writer.load(std::memory_order_relaxed) == os_thread_get_curr_id(); }
+ {
+# ifdef SUX_LOCK_GENERIC
+ return writer.load(std::memory_order_relaxed) == os_thread_get_curr_id();
+# else
+ return writer.load(std::memory_order_relaxed) == os_thread_get_curr_id() ||
+ (xtest() && !latch.is_locked_or_waiting());
+# endif
+ }
/** Assert that a lock shard is exclusively latched (by some thread) */
void assert_locked(const lock_t &lock) const;
/** Assert that a table lock shard is exclusively latched by this thread */
@@ -836,13 +852,14 @@ public:
void deadlock_check();
/** Cancel a waiting lock request.
- @param lock waiting lock request
- @param trx active transaction
- @param check_victim whether to check trx->lock.was_chosen_as_deadlock_victim
+ @tparam check_victim whether to check for DB_DEADLOCK
+ @param lock waiting lock request
+ @param trx active transaction
@retval DB_SUCCESS if no lock existed
@retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
@retval DB_LOCK_WAIT if the lock was canceled */
- static dberr_t cancel(trx_t *trx, lock_t *lock, bool check_victim);
+ template<bool check_victim>
+ static dberr_t cancel(trx_t *trx, lock_t *lock);
/** Cancel a waiting lock request (if any) when killing a transaction */
static void cancel(trx_t *trx);
@@ -981,6 +998,149 @@ private:
hash_cell_t *cell2_;
};
+/** lock_sys.latch exclusive guard using transactional memory */
+struct TMLockMutexGuard
+{
+ TRANSACTIONAL_INLINE
+ TMLockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.wr_lock(SRW_LOCK_ARGS(file, line));
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockMutexGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided()) xend(); else
+#endif
+ lock_sys.wr_unlock();
+ }
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept
+ { return !lock_sys.latch.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** lock_sys latch guard for 1 page_id_t, using transactional memory */
+struct TMLockGuard
+{
+ TRANSACTIONAL_TARGET
+ TMLockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ TRANSACTIONAL_INLINE ~TMLockGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ /** whether the latches were elided */
+ bool elided;
+#endif
+};
+
+/** guard for shared lock_sys.latch and trx_t::mutex using
+transactional memory */
+struct TMLockTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE
+#ifndef UNIV_PFS_RWLOCK
+ TMLockTrxGuard(trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) trx
+#else
+ TMLockTrxGuard(const char *file, unsigned line, trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) SRW_LOCK_CALL, trx
+#endif
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (!lock_sys.latch.is_write_locked() && was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.rd_lock(SRW_LOCK_ARGS(file, line));
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys.rd_unlock();
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** guard for trx_t::mutex using transactional memory */
+struct TMTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE TMTrxGuard(trx_t &trx) : trx(trx)
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE ~TMTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
/*********************************************************************//**
Creates a new record lock and inserts it to the lock queue. Does NOT check
for deadlocks or lock compatibility!
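
A minimal sketch of how these guards are intended to replace explicit lock/unlock pairs; the function and its body are illustrative and not taken from the patch:

// Sketch only: TMTrxGuard either runs the critical section as a memory
// transaction (eliding trx.mutex) or falls back to mutex_lock()/unlock().
TRANSACTIONAL_TARGET static void adjust_trx_state(trx_t &trx) // hypothetical
{
  TMTrxGuard tg{trx};               // xbegin() or trx.mutex_lock()
  /* ... a short critical section with no system calls ... */
}                                   // xend() or trx.mutex_unlock()

// TMLockMutexGuard plays the same role for the exclusive lock_sys.latch:
//   TMLockMutexGuard g{SRW_LOCK_CALL};
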
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
index f0595598838..b5ec7a0d29e 100644
--- a/storage/innobase/include/lock0priv.h
+++ b/storage/innobase/include/lock0priv.h
@@ -459,7 +459,7 @@ lock_rec_get_n_bits(
/**********************************************************************//**
Sets the nth bit of a record lock to TRUE. */
-UNIV_INLINE
+inline
void
lock_rec_set_nth_bit(
/*=================*/
@@ -473,7 +473,12 @@ lock_rec_set_nth_bit(
inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
{
ut_ad(!lock->is_table());
+#ifdef SUX_LOCK_GENERIC
ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
ut_ad(i < lock->un_member.rec_lock.n_bits);
byte* b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic
index c51304cd7ed..21e7c7c95dc 100644
--- a/storage/innobase/include/lock0priv.ic
+++ b/storage/innobase/include/lock0priv.ic
@@ -67,7 +67,7 @@ lock_rec_get_n_bits(
/**********************************************************************//**
Sets the nth bit of a record lock to TRUE. */
-UNIV_INLINE
+inline
void
lock_rec_set_nth_bit(
/*=================*/
@@ -91,7 +91,12 @@ lock_rec_set_nth_bit(
#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
# pragma GCC diagnostic pop
#endif
+#ifdef SUX_LOCK_GENERIC
ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
lock->trx->lock.n_rec_locks++;
}
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index 7e190b340de..687bd725f13 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -1111,8 +1111,10 @@ void os_aio_free();
@retval DB_IO_ERROR on I/O error */
dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
-/** Wait until there are no pending asynchronous writes.
-Only used on FLUSH TABLES...FOR EXPORT. */
+/** @return the number of pending writes */
+size_t os_aio_pending_writes();
+
+/** Wait until there are no pending asynchronous writes. */
void os_aio_wait_until_no_pending_writes();
/** Wait until all pending asynchronous reads have completed. */
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
index ba380b77261..0ae052fabe2 100644
--- a/storage/innobase/include/rw_lock.h
+++ b/storage/innobase/include/rw_lock.h
@@ -222,23 +222,13 @@ public:
}
/** @return whether an exclusive lock is being held by any thread */
- bool is_write_locked() const
- { return !!(lock.load(std::memory_order_relaxed) & WRITER); }
+ bool is_write_locked() const { return !!(value() & WRITER); }
#ifdef SUX_LOCK_GENERIC
/** @return whether an update lock is being held by any thread */
- bool is_update_locked() const
- { return !!(lock.load(std::memory_order_relaxed) & UPDATER); }
+ bool is_update_locked() const { return !!(value() & UPDATER); }
#endif /* SUX_LOCK_GENERIC */
- /** @return whether a shared lock is being held by any thread */
- bool is_read_locked() const
- {
- auto l= lock.load(std::memory_order_relaxed);
- return (l & ~WRITER_PENDING) && !(l & WRITER);
- }
/** @return whether any lock is being held or waited for by any thread */
- bool is_locked_or_waiting() const
- { return lock.load(std::memory_order_relaxed) != 0; }
+ bool is_locked_or_waiting() const { return value() != 0; }
/** @return whether any lock is being held by any thread */
- bool is_locked() const
- { return (lock.load(std::memory_order_relaxed) & ~WRITER_WAITING) != 0; }
+ bool is_locked() const { return (value() & ~WRITER_WAITING) != 0; }
};
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 0c32d5d686a..108e7206c46 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -524,7 +524,7 @@ do { \
#ifdef HAVE_PSI_STAGE_INTERFACE
/** Performance schema stage event for monitoring ALTER TABLE progress
-everything after flush log_make_checkpoint(). */
+in ha_innobase::commit_inplace_alter_table(). */
extern PSI_stage_info srv_stage_alter_table_end;
/** Performance schema stage event for monitoring ALTER TABLE progress
diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h
index 9e2eac15df0..54d042419ca 100644
--- a/storage/innobase/include/srw_lock.h
+++ b/storage/innobase/include/srw_lock.h
@@ -60,10 +60,10 @@ class srw_mutex_impl final
public:
/** @return whether the mutex is being held or waited for */
bool is_locked_or_waiting() const
- { return lock.load(std::memory_order_relaxed) != 0; }
+ { return lock.load(std::memory_order_acquire) != 0; }
/** @return whether the mutex is being held by any thread */
bool is_locked() const
- { return (lock.load(std::memory_order_relaxed) & HOLDER) != 0; }
+ { return (lock.load(std::memory_order_acquire) & HOLDER) != 0; }
void init() { DBUG_ASSERT(!is_locked_or_waiting()); }
void destroy() { DBUG_ASSERT(!is_locked_or_waiting()); }
@@ -174,11 +174,7 @@ public:
{ return (readers.load(std::memory_order_relaxed) & WRITER) != 0; }
# ifndef DBUG_OFF
/** @return whether the lock is being held or waited for */
- bool is_vacant() const
- {
- return !readers.load(std::memory_order_relaxed) &&
- !writer.is_locked_or_waiting();
- }
+ bool is_vacant() const { return !is_locked_or_waiting(); }
# endif /* !DBUG_OFF */
bool rd_lock_try()
@@ -250,7 +246,7 @@ public:
void wr_u_downgrade()
{
DBUG_ASSERT(writer.is_locked());
- DBUG_ASSERT(readers.load(std::memory_order_relaxed) == WRITER);
+ DBUG_ASSERT(is_write_locked());
readers.store(1, std::memory_order_release);
/* Note: Any pending rd_lock() will not be woken up until u_unlock() */
}
@@ -272,10 +268,24 @@ public:
}
void wr_unlock()
{
- DBUG_ASSERT(readers.load(std::memory_order_relaxed) == WRITER);
+ DBUG_ASSERT(is_write_locked());
readers.store(0, std::memory_order_release);
writer.wr_unlock();
}
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept
+ { return readers.load(std::memory_order_acquire) == WRITER; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept
+ { return readers.load(std::memory_order_acquire) != 0; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return is_locked() || writer.is_locked_or_waiting(); }
+
+ void lock_shared() { rd_lock(); }
+ void unlock_shared() { rd_unlock(); }
+ void lock() { wr_lock(); }
+ void unlock() { wr_unlock(); }
#endif
};
@@ -288,26 +298,43 @@ class srw_lock_
friend srw_lock_impl<spinloop>;
# endif
# ifdef _WIN32
- SRWLOCK lock;
+ SRWLOCK lk;
# else
- rw_lock_t lock;
+ rw_lock_t lk;
# endif
void rd_wait();
void wr_wait();
public:
- void init() { IF_WIN(,my_rwlock_init(&lock, nullptr)); }
- void destroy() { IF_WIN(,rwlock_destroy(&lock)); }
+ void init() { IF_WIN(,my_rwlock_init(&lk, nullptr)); }
+ void destroy() { IF_WIN(,rwlock_destroy(&lk)); }
inline void rd_lock();
inline void wr_lock();
bool rd_lock_try()
- { return IF_WIN(TryAcquireSRWLockShared(&lock), !rw_tryrdlock(&lock)); }
+ { return IF_WIN(TryAcquireSRWLockShared(&lk), !rw_tryrdlock(&lk)); }
void rd_unlock()
- { IF_WIN(ReleaseSRWLockShared(&lock), rw_unlock(&lock)); }
+ { IF_WIN(ReleaseSRWLockShared(&lk), rw_unlock(&lk)); }
bool wr_lock_try()
- { return IF_WIN(TryAcquireSRWLockExclusive(&lock), !rw_trywrlock(&lock)); }
+ { return IF_WIN(TryAcquireSRWLockExclusive(&lk), !rw_trywrlock(&lk)); }
void wr_unlock()
- { IF_WIN(ReleaseSRWLockExclusive(&lock), rw_unlock(&lock)); }
+ { IF_WIN(ReleaseSRWLockExclusive(&lk), rw_unlock(&lk)); }
+#ifdef _WIN32
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept { return (size_t&)(lk) != 0; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept { return is_locked_or_waiting(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept
+ {
+ // FIXME: this returns false positives for shared locks
+ return is_locked();
+ }
+
+ void lock_shared() { rd_lock(); }
+ void unlock_shared() { rd_unlock(); }
+ void lock() { wr_lock(); }
+ void unlock() { wr_unlock(); }
+#endif
};
template<> void srw_lock_<true>::rd_wait();
@@ -315,10 +342,10 @@ template<> void srw_lock_<true>::wr_wait();
template<>
inline void srw_lock_<false>::rd_lock()
-{ IF_WIN(AcquireSRWLockShared(&lock), rw_rdlock(&lock)); }
+{ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk)); }
template<>
inline void srw_lock_<false>::wr_lock()
-{ IF_WIN(AcquireSRWLockExclusive(&lock), rw_wrlock(&lock)); }
+{ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk)); }
template<>
inline void srw_lock_<true>::rd_lock() { if (!rd_lock_try()) rd_wait(); }
@@ -476,6 +503,15 @@ public:
}
bool rd_lock_try() { return lock.rd_lock_try(); }
bool wr_lock_try() { return lock.wr_lock_try(); }
+#ifndef SUX_LOCK_GENERIC
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return lock.is_locked_or_waiting(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_locked() const noexcept { return lock.is_locked(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept { return lock.is_write_locked(); }
+#endif
};
typedef srw_lock_impl<false> srw_lock;
diff --git a/storage/innobase/include/transactional_lock_guard.h b/storage/innobase/include/transactional_lock_guard.h
new file mode 100644
index 00000000000..7ece27638fc
--- /dev/null
+++ b/storage/innobase/include/transactional_lock_guard.h
@@ -0,0 +1,167 @@
+/*****************************************************************************
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+
+#if defined __powerpc64__ && defined __clang__ && defined __linux__
+#elif defined __powerpc64__&&defined __GNUC__&&defined __linux__&&__GNUC__ > 4
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# if __GNUC__ >= 8
+# elif defined __clang_major__ && __clang_major__ > 6
+# else
+# define NO_ELISION
+# endif
+#else /* Transactional memory has not been implemented for this ISA */
+# define NO_ELISION
+#endif
+
+#ifdef NO_ELISION
+constexpr bool have_transactional_memory= false;
+# ifdef UNIV_DEBUG
+static inline bool xtest() { return false; }
+# endif
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+#else
+# if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+
+# include <immintrin.h>
+# if defined __GNUC__ && !defined __INTEL_COMPILER
+# define TRANSACTIONAL_TARGET __attribute__((target("rtm")))
+# define TRANSACTIONAL_INLINE __attribute__((target("rtm"),always_inline))
+# else
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+# endif
+
+TRANSACTIONAL_INLINE static inline bool xbegin()
+{
+ return have_transactional_memory && _xbegin() == _XBEGIN_STARTED;
+}
+
+# ifdef UNIV_DEBUG
+# ifdef __GNUC__
+/** @return whether a memory transaction is active */
+bool xtest();
+# else
+static inline bool xtest() { return have_transactional_memory && _xtest(); }
+# endif
+# endif
+
+TRANSACTIONAL_INLINE static inline void xabort() { _xabort(0); }
+
+TRANSACTIONAL_INLINE static inline void xend() { _xend(); }
+# elif defined __powerpc64__
+# include <htmxlintrin.h>
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+# define TRANSACTIONAL_TARGET __attribute__((target("htm")))
+# define TRANSACTIONAL_INLINE __attribute__((target("htm"),always_inline))
+
+TRANSACTIONAL_INLINE static inline bool xbegin()
+{
+ return have_transactional_memory &&
+ __TM_simple_begin() == _HTM_TBEGIN_STARTED;
+}
+
+# ifdef UNIV_DEBUG
+bool xtest();
+# endif
+
+TRANSACTIONAL_INLINE static inline void xabort() { __TM_abort(); }
+
+TRANSACTIONAL_INLINE static inline void xend() { __TM_end(); }
+# endif
+#endif
+
+template<class mutex>
+class transactional_lock_guard
+{
+ mutex &m;
+
+public:
+ TRANSACTIONAL_INLINE transactional_lock_guard(mutex &m) : m(m)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ m.lock();
+ }
+ transactional_lock_guard(const transactional_lock_guard &)= delete;
+ TRANSACTIONAL_INLINE ~transactional_lock_guard()
+ {
+#ifndef NO_ELISION
+ if (was_elided()) xend(); else
+#endif
+ m.unlock();
+ }
+
+#ifndef NO_ELISION
+ bool was_elided() const noexcept { return !m.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+template<class mutex>
+class transactional_shared_lock_guard
+{
+ mutex &m;
+#ifndef NO_ELISION
+ bool elided;
+#else
+ static constexpr bool elided= false;
+#endif
+
+public:
+ TRANSACTIONAL_INLINE transactional_shared_lock_guard(mutex &m) : m(m)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (!m.is_write_locked())
+ {
+ elided= true;
+ return;
+ }
+ xabort();
+ }
+ elided= false;
+#endif
+ m.lock_shared();
+ }
+ transactional_shared_lock_guard(const transactional_shared_lock_guard &)=
+ delete;
+ TRANSACTIONAL_INLINE ~transactional_shared_lock_guard()
+ {
+#ifndef NO_ELISION
+ if (was_elided()) xend(); else
+#endif
+ m.unlock_shared();
+ }
+
+ bool was_elided() const noexcept { return elided; }
+};
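
A self-contained usage sketch of the templates above; the toy_mutex type is purely illustrative, standing in for any mutex that provides lock(), unlock() and is_locked_or_waiting():

#include <atomic>

// Sketch only: a minimal mutex-like type satisfying the interface that
// transactional_lock_guard expects; not part of the patch.
struct toy_mutex
{
  std::atomic<bool> held{false};
  void lock() { while (held.exchange(true, std::memory_order_acquire)) {} }
  void unlock() { held.store(false, std::memory_order_release); }
  bool is_locked_or_waiting() const
  { return held.load(std::memory_order_acquire); }
};

static toy_mutex mtx;
static int shared_counter;

TRANSACTIONAL_TARGET static void bump()
{
  // Elided to a hardware memory transaction when RTM/HTM is available and
  // the mutex is free; otherwise falls back to mtx.lock()/mtx.unlock().
  transactional_lock_guard<toy_mutex> g{mtx};
  ++shared_counter;
}

transactional_shared_lock_guard additionally requires is_write_locked(), lock_shared() and unlock_shared() from the mutex type, which is what the page_hash_latch and srw_lock additions elsewhere in this patch provide.
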
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index 86e8b534f54..d2bf7075594 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -621,6 +621,9 @@ public:
== os_thread_get_curr_id());
mutex.wr_unlock();
}
+#ifndef SUX_LOCK_GENERIC
+ bool mutex_is_locked() const noexcept { return mutex.is_locked(); }
+#endif
#ifdef UNIV_DEBUG
/** @return whether the current thread holds the mutex */
bool mutex_is_owner() const
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
index 82c80994e72..cd116cc3a20 100644
--- a/storage/innobase/include/ut0new.h
+++ b/storage/innobase/include/ut0new.h
@@ -841,6 +841,8 @@ constexpr const char* const auto_event_names[] =
"buf0buf",
"buf0dblwr",
"buf0dump",
+ "buf0lru",
+ "buf0rea",
"dict0dict",
"dict0mem",
"dict0stats",
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 86c44d2e52f..234d215d1df 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -166,9 +166,7 @@ void lock_sys_t::assert_locked(const lock_t &lock) const
void lock_sys_t::assert_locked(const dict_table_t &table) const
{
ut_ad(!table.is_temporary());
-
- const os_thread_id_t current_thread= os_thread_get_curr_id();
- if (writer.load(std::memory_order_relaxed) == current_thread)
+ if (is_writer())
return;
ut_ad(readers);
ut_ad(table.lock_mutex_is_owner());
@@ -186,7 +184,7 @@ void lock_sys_t::hash_table::assert_locked(const page_id_t id) const
/** Assert that a hash table cell is exclusively latched (by some thread) */
void lock_sys_t::assert_locked(const hash_cell_t &cell) const
{
- if (lock_sys.is_writer())
+ if (is_writer())
return;
ut_ad(lock_sys.readers);
ut_ad(hash_table::latch(const_cast<hash_cell_t*>(&cell))->is_locked());
@@ -229,6 +227,28 @@ LockMultiGuard::~LockMultiGuard()
lock_sys.rd_unlock();
}
+TRANSACTIONAL_TARGET
+TMLockGuard::TMLockGuard(lock_sys_t::hash_table &hash, page_id_t id)
+{
+ const auto id_fold= id.fold();
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (lock_sys.latch.is_write_locked())
+ xabort();
+ cell_= hash.cell_get(id_fold);
+ if (hash.latch(cell_)->is_locked())
+ xabort();
+ elided= true;
+ return;
+ }
+ elided= false;
+#endif
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ cell_= hash.cell_get(id_fold);
+ hash.latch(cell_)->acquire();
+}
+
/** Pretty-print a table lock.
@param[in,out] file output stream
@param[in] lock table lock */
@@ -430,6 +450,8 @@ void lock_sys_t::rd_unlock()
void lock_sys_t::resize(ulint n_cells)
{
ut_ad(this == &lock_sys);
+ /* Buffer pool resizing is rarely initiated by the user, and this
+ would exceed the maximum size of a memory transaction. */
LockMutexGuard g{SRW_LOCK_CALL};
rec_hash.resize(n_cells);
prdt_hash.resize(n_cells);
@@ -893,7 +915,8 @@ void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id);
/** Kill the holders of conflicting locks.
@param trx brute-force applier transaction running in the current thread */
-ATTRIBUTE_COLD ATTRIBUTE_NOINLINE static void lock_wait_wsrep(trx_t *trx)
+ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
+static void lock_wait_wsrep(trx_t *trx)
{
DBUG_ASSERT(wsrep_on(trx->mysql_thd));
if (!wsrep_thd_is_BF(trx->mysql_thd, false))
@@ -1136,7 +1159,7 @@ lock_rec_create_low(
ulint n_bytes;
ut_d(lock_sys.hash_get(type_mode).assert_locked(page_id));
- ut_ad(holds_trx_mutex == trx->mutex_is_owner());
+ ut_ad(xtest() || holds_trx_mutex == trx->mutex_is_owner());
ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
ut_ad(!(type_mode & LOCK_TABLE));
ut_ad(trx->state != TRX_STATE_NOT_STARTED);
@@ -1263,7 +1286,7 @@ lock_rec_enqueue_waiting(
ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
trx_t* trx = thr_get_trx(thr);
- ut_ad(trx->mutex_is_owner());
+ ut_ad(xtest() || trx->mutex_is_owner());
ut_ad(!trx->dict_operation_lock_mode);
if (trx->mysql_thd && thd_lock_wait_timeout(trx->mysql_thd) == 0) {
@@ -1331,6 +1354,7 @@ can reuse a suitable record lock object already existing on the same page,
just setting the appropriate bit in its bitmap. This is a low-level function
which does NOT check for deadlocks or lock compatibility!
@return lock where the bit was set */
+TRANSACTIONAL_TARGET
static
void
lock_rec_add_to_queue(
@@ -1349,7 +1373,7 @@ lock_rec_add_to_queue(
transaction mutex */
{
ut_d(lock_sys.hash_get(type_mode).assert_locked(id));
- ut_ad(caller_owns_trx_mutex == trx->mutex_is_owner());
+ ut_ad(xtest() || caller_owns_trx_mutex == trx->mutex_is_owner());
ut_ad(index->is_primary()
|| dict_index_get_online_status(index) != ONLINE_INDEX_CREATION);
ut_ad(!(type_mode & LOCK_TABLE));
@@ -1422,9 +1446,11 @@ lock_rec_add_to_queue(
if (caller_owns_trx_mutex) {
trx->mutex_unlock();
}
- lock_trx->mutex_lock();
- lock_rec_set_nth_bit(lock, heap_no);
- lock_trx->mutex_unlock();
+ {
+ TMTrxGuard tg{*lock_trx};
+ lock_rec_set_nth_bit(lock, heap_no);
+ }
+
if (caller_owns_trx_mutex) {
trx->mutex_lock();
}
@@ -1637,6 +1663,8 @@ static void lock_wait_rpl_report(trx_t *trx)
if (!wait_lock)
return;
ut_ad(!(wait_lock->type_mode & LOCK_AUTO_INC));
+  /* This would likely be too large for a memory transaction,
+ even for wait_lock->is_table(). */
if (!lock_sys.wr_lock_try())
{
mysql_mutex_unlock(&lock_sys.wait_mutex);
@@ -1833,13 +1861,13 @@ dberr_t lock_wait(que_thr_t *thr)
if (row_lock_wait)
lock_sys.wait_resume(trx->mysql_thd, suspend_time, my_hrtime_coarse());
-end_wait:
if (lock_t *lock= trx->lock.wait_lock)
{
- lock_sys_t::cancel(trx, lock, false);
+ lock_sys_t::cancel<false>(trx, lock);
lock_sys.deadlock_check();
}
+end_wait:
mysql_mutex_unlock(&lock_sys.wait_mutex);
thd_wait_end(trx->mysql_thd);
@@ -1993,6 +2021,7 @@ static void lock_rec_dequeue_from_page(lock_t *in_lock, bool owns_wait_mutex)
/** Remove a record lock request, waiting or granted, on a discarded page
@param hash hash table
@param in_lock lock object */
+TRANSACTIONAL_TARGET
void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock)
{
ut_ad(!in_lock->is_table());
@@ -2000,13 +2029,15 @@ void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock)
HASH_DELETE(lock_t, hash, &lock_hash,
in_lock->un_member.rec_lock.page_id.fold(), in_lock);
- trx_t *trx= in_lock->trx;
- trx->mutex_lock();
- ut_d(auto old_locks=)
- in_lock->index->table->n_rec_locks--;
+ ut_d(uint32_t old_locks);
+ {
+ trx_t *trx= in_lock->trx;
+ TMTrxGuard tg{*trx};
+ ut_d(old_locks=)
+ in_lock->index->table->n_rec_locks--;
+ UT_LIST_REMOVE(trx->lock.trx_locks, in_lock);
+ }
ut_ad(old_locks);
- UT_LIST_REMOVE(trx->lock.trx_locks, in_lock);
- trx->mutex_unlock();
MONITOR_INC(MONITOR_RECLOCK_REMOVED);
MONITOR_DEC(MONITOR_NUM_RECLOCK);
}
@@ -2030,11 +2061,15 @@ lock_rec_free_all_from_discard_page(page_id_t id, const hash_cell_t &cell,
}
}
-/** Discard locks for an index */
-void lock_discard_for_index(const dict_index_t &index)
+/** Discard locks for an index when purging DELETE FROM SYS_INDEXES
+after an aborted CREATE INDEX operation.
+@param index a stale index on which the ADD INDEX operation was aborted */
+ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index)
{
ut_ad(!index.is_committed());
- lock_sys.wr_lock(SRW_LOCK_CALL);
+ /* This is very rarely executed code, and the size of the hash array
+ would exceed the maximum size of a memory transaction. */
+ LockMutexGuard g{SRW_LOCK_CALL};
const ulint n= lock_sys.rec_hash.pad(lock_sys.rec_hash.n_cells);
for (ulint i= 0; i < n; i++)
{
@@ -2052,7 +2087,6 @@ void lock_discard_for_index(const dict_index_t &index)
lock= lock->hash;
}
}
- lock_sys.wr_unlock();
}
/*============= RECORD LOCK MOVING AND INHERITING ===================*/
@@ -2060,6 +2094,7 @@ void lock_discard_for_index(const dict_index_t &index)
/*************************************************************//**
Resets the lock bits for a single record. Releases transactions waiting for
lock requests here. */
+TRANSACTIONAL_TARGET
static
void
lock_rec_reset_and_release_wait(const hash_cell_t &cell, const page_id_t id,
@@ -2072,10 +2107,8 @@ lock_rec_reset_and_release_wait(const hash_cell_t &cell, const page_id_t id,
lock_rec_cancel(lock);
else
{
- trx_t *lock_trx= lock->trx;
- lock_trx->mutex_lock();
+ TMTrxGuard tg{*lock->trx};
lock_rec_reset_nth_bit(lock, heap_no);
- lock_trx->mutex_unlock();
}
}
}
@@ -2157,6 +2190,7 @@ lock_rec_inherit_to_gap_if_gap_lock(
/*************************************************************//**
Moves the locks of a record to another record and resets the lock bits of
the donating record. */
+TRANSACTIONAL_TARGET
static
void
lock_rec_move(
@@ -2243,6 +2277,7 @@ Updates the lock table when we have reorganized a page. NOTE: we copy
also the locks set on the infimum of the page; the infimum may carry
locks if an update of a record is occurring on the page, and its locks
were temporarily stored on the infimum. */
+TRANSACTIONAL_TARGET
void
lock_move_reorganize_page(
/*======================*/
@@ -2260,12 +2295,14 @@ lock_move_reorganize_page(
const page_id_t id{block->page.id()};
const auto id_fold= id.fold();
{
- LockGuard g{lock_sys.rec_hash, id};
+ TMLockGuard g{lock_sys.rec_hash, id};
if (!lock_sys_t::get_first(g.cell(), id))
return;
}
- /* We will modify arbitrary trx->lock.trx_locks. */
+ /* We will modify arbitrary trx->lock.trx_locks.
+ Do not bother with a memory transaction; we are going
+ to allocate memory and copy a lot of data. */
LockMutexGuard g{SRW_LOCK_CALL};
hash_cell_t &cell= *lock_sys.rec_hash.cell_get(id_fold);
@@ -2348,10 +2385,10 @@ lock_move_reorganize_page(
}
trx_t *lock_trx= lock->trx;
- lock_trx->mutex_lock();
+ lock_trx->mutex_lock();
- /* Clear the bit in old_lock. */
- if (old_heap_no < lock->un_member.rec_lock.n_bits &&
+ /* Clear the bit in old_lock. */
+ if (old_heap_no < lock->un_member.rec_lock.n_bits &&
lock_rec_reset_nth_bit(lock, old_heap_no))
{
ut_ad(!page_rec_is_metadata(orec));
@@ -2362,7 +2399,7 @@ lock_move_reorganize_page(
new_heap_no, lock->index, lock_trx, true);
}
- lock_trx->mutex_unlock();
+ lock_trx->mutex_unlock();
if (new_heap_no == PAGE_HEAP_NO_SUPREMUM)
{
@@ -2389,6 +2426,7 @@ lock_move_reorganize_page(
/*************************************************************//**
Moves the explicit locks on user records to another page if a record
list end is moved to another page. */
+TRANSACTIONAL_TARGET
void
lock_move_rec_list_end(
/*===================*/
@@ -2405,6 +2443,7 @@ lock_move_rec_list_end(
const page_id_t id{block->page.id()};
const page_id_t new_id{new_block->page.id()};
{
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, id, new_id};
/* Note: when we move locks from record to record, waiting locks
@@ -2504,6 +2543,7 @@ lock_move_rec_list_end(
/*************************************************************//**
Moves the explicit locks on user records to another page if a record
list start is moved to another page. */
+TRANSACTIONAL_TARGET
void
lock_move_rec_list_start(
/*=====================*/
@@ -2529,6 +2569,7 @@ lock_move_rec_list_start(
const page_id_t new_id{new_block->page.id()};
{
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, id, new_id};
for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock;
@@ -2616,6 +2657,7 @@ lock_move_rec_list_start(
/*************************************************************//**
Moves the explicit locks on user records to another page if a record
list start is moved to another page. */
+TRANSACTIONAL_TARGET
void
lock_rtr_move_rec_list(
/*===================*/
@@ -2638,6 +2680,7 @@ lock_rtr_move_rec_list(
const page_id_t new_id{new_block->page.id()};
{
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, id, new_id};
for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock;
@@ -2712,6 +2755,7 @@ lock_update_split_right(
const page_id_t l{left_block->page.id()};
const page_id_t r{right_block->page.id()};
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, l, r};
/* Move the locks on the supremum of the left page to the supremum
@@ -2764,6 +2808,7 @@ lock_update_merge_right(
const page_id_t l{left_block->page.id()};
const page_id_t r{right_block->page.id()};
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, l, r};
/* Inherit the locks from the supremum of the left page to the
@@ -2790,6 +2835,7 @@ to be updated. */
void lock_update_root_raise(const buf_block_t &block, const page_id_t root)
{
const page_id_t id{block.page.id()};
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, id, root};
/* Move the locks on the supremum of the root to the supremum of block */
lock_rec_move(g.cell1(), block, id, g.cell2(), root,
@@ -2802,6 +2848,7 @@ void lock_update_root_raise(const buf_block_t &block, const page_id_t root)
void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old)
{
const page_id_t id{new_block.page.id()};
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, id, old};
/* Move the locks on the supremum of the old page to the supremum of new */
lock_rec_move(g.cell1(), new_block, id, g.cell2(), old,
@@ -2838,6 +2885,7 @@ void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
const page_id_t l{left.page.id()};
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, l, right};
const rec_t *left_next_rec= page_rec_get_next_const(orig_pred);
@@ -2883,6 +2931,7 @@ lock_rec_reset_and_inherit_gap_locks(
donating record */
{
const page_id_t heir{heir_block.page.id()};
+ /* This is a rare operation and likely too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, heir, donor};
lock_rec_reset_and_release_wait(g.cell1(), heir, heir_heap_no);
lock_rec_inherit_to_gap(g.cell1(), heir, g.cell2(), donor, heir_block.frame,
@@ -2906,6 +2955,7 @@ lock_update_discard(
ulint heap_no;
const page_id_t heir(heir_block->page.id());
const page_id_t page_id(block->page.id());
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, heir, page_id};
if (lock_sys_t::get_first(g.cell2(), page_id)) {
@@ -3379,6 +3429,37 @@ lock_table_other_has_incompatible(
return(NULL);
}
+/** Acquire or enqueue a table lock */
+static dberr_t lock_table_low(dict_table_t *table, lock_mode mode,
+ que_thr_t *thr, trx_t *trx)
+{
+ lock_t *wait_for=
+ lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode);
+ dberr_t err= DB_SUCCESS;
+
+ trx->mutex_lock();
+
+ if (wait_for)
+ err= lock_table_enqueue_waiting(mode, table, thr, wait_for);
+ else
+ lock_table_create(table, mode, trx, nullptr);
+
+ trx->mutex_unlock();
+
+ return err;
+}
+
+#ifdef WITH_WSREP
+/** Acquire or enqueue a table lock in Galera replication mode. */
+ATTRIBUTE_NOINLINE
+static dberr_t lock_table_wsrep(dict_table_t *table, lock_mode mode,
+ que_thr_t *thr, trx_t *trx)
+{
+ LockMutexGuard g{SRW_LOCK_CALL};
+ return lock_table_low(table, mode, thr, trx);
+}
+#endif
+
/*********************************************************************//**
Locks the specified database table in the mode given. If the lock cannot
be granted immediately, the query thread is put to wait.
@@ -3392,8 +3473,6 @@ lock_table(
que_thr_t* thr) /*!< in: query thread */
{
trx_t* trx;
- dberr_t err;
- lock_t* wait_for;
if (table->is_temporary()) {
return DB_SUCCESS;
@@ -3403,7 +3482,7 @@ lock_table(
/* Look for equal or stronger locks the same trx already
has on the table. No need to acquire LockMutexGuard here
- because only this transacton can add/access table locks
+ because only this transaction can add/access table locks
to/from trx_t::table_locks. */
if (lock_table_has(trx, table, mode) || srv_read_only_mode) {
@@ -3422,46 +3501,18 @@ lock_table(
trx_set_rw_mode(trx);
}
- err = DB_SUCCESS;
-
#ifdef WITH_WSREP
if (trx->is_wsrep()) {
- lock_sys.wr_lock(SRW_LOCK_CALL);
- } else {
- lock_sys.rd_lock(SRW_LOCK_CALL);
- table->lock_mutex_lock();
+ return lock_table_wsrep(table, mode, thr, trx);
}
-#else
+#endif
lock_sys.rd_lock(SRW_LOCK_CALL);
table->lock_mutex_lock();
-#endif
-
- /* We have to check if the new lock is compatible with any locks
- other transactions have in the table lock queue. */
-
- wait_for = lock_table_other_has_incompatible(
- trx, LOCK_WAIT, table, mode);
-
- trx->mutex_lock();
-
- if (wait_for) {
- err = lock_table_enqueue_waiting(mode, table, thr, wait_for);
- } else {
- lock_table_create(table, mode, trx, wait_for);
- }
-
-#ifdef WITH_WSREP
- if (trx->is_wsrep()) {
- lock_sys.wr_unlock();
- trx->mutex_unlock();
- return err;
- }
-#endif
+ dberr_t err = lock_table_low(table, mode, thr, trx);
table->lock_mutex_unlock();
lock_sys.rd_unlock();
- trx->mutex_unlock();
- return(err);
+ return err;
}
/** Create a table lock object for a resurrected transaction.
@@ -3477,6 +3528,8 @@ void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode)
return;
{
+ /* This is executed at server startup while no connections
+    are allowed. Do not bother with lock elision. */
LockMutexGuard g{SRW_LOCK_CALL};
ut_ad(!lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode));
@@ -3650,6 +3703,7 @@ dberr_t lock_sys_tables(trx_t *trx)
Removes a granted record lock of a transaction from the queue and grants
locks to other transactions waiting in the queue if they now are entitled
to a lock. */
+TRANSACTIONAL_TARGET
void
lock_rec_unlock(
/*============*/
@@ -3700,9 +3754,10 @@ lock_rec_unlock(
released:
ut_a(!lock->is_waiting());
- trx->mutex_lock();
- lock_rec_reset_nth_bit(lock, heap_no);
- trx->mutex_unlock();
+ {
+ TMTrxGuard tg{*trx};
+ lock_rec_reset_nth_bit(lock, heap_no);
+ }
/* Check if we can now grant waiting lock requests */
@@ -3730,7 +3785,7 @@ released:
/** Release the explicit locks of a committing transaction,
and release possible other transactions waiting because of these locks.
@return whether the operation succeeded */
-static bool lock_release_try(trx_t *trx)
+TRANSACTIONAL_TARGET static bool lock_release_try(trx_t *trx)
{
/* At this point, trx->lock.trx_locks cannot be modified by other
threads, because our transaction has been committed.
@@ -3746,6 +3801,10 @@ static bool lock_release_try(trx_t *trx)
bool all_released= true;
restart:
ulint count= 1000;
+ /* We will not attempt hardware lock elision (memory transaction)
+ here. Both lock_rec_dequeue_from_page() and lock_table_dequeue()
+  would likely lead to a memory transaction abort due to a system call, to
+ wake up a waiting transaction. */
lock_sys.rd_lock(SRW_LOCK_CALL);
trx->mutex_lock();
@@ -3824,6 +3883,8 @@ void lock_release(trx_t *trx)
/* Fall back to acquiring lock_sys.latch in exclusive mode */
restart:
count= 1000;
+  /* There is probably no point in trying lock elision here;
+ in lock_release_try() it is different. */
lock_sys.wr_lock(SRW_LOCK_CALL);
trx->mutex_lock();
@@ -4002,10 +4063,14 @@ void lock_release_on_prepare(trx_t *trx)
}
/** Release locks on a table whose creation is being rolled back */
-ATTRIBUTE_COLD void lock_release_on_rollback(trx_t *trx, dict_table_t *table)
+ATTRIBUTE_COLD
+void lock_release_on_rollback(trx_t *trx, dict_table_t *table)
{
trx->mod_tables.erase(table);
+  /* This is very rarely executed code, in the rare case that a
+ CREATE TABLE operation is being rolled back. Theoretically,
+ we might try to remove the locks in multiple memory transactions. */
lock_sys.wr_lock(SRW_LOCK_CALL);
trx->mutex_lock();
@@ -4211,6 +4276,7 @@ http://bugs.mysql.com/36942 */
/*********************************************************************//**
Calculates the number of record lock structs in the record lock hash table.
@return number of record locks */
+TRANSACTIONAL_TARGET
static ulint lock_get_n_rec_locks()
{
ulint n_locks = 0;
@@ -4244,6 +4310,9 @@ lock_print_info_summary(
FILE* file, /*!< in: file where to print */
ibool nowait) /*!< in: whether to wait for lock_sys.latch */
{
+ /* Here, lock elision does not make sense, because
+ for the output we are going to invoke system calls,
+ which would interrupt a memory transaction. */
if (!nowait) {
lock_sys.wr_lock(SRW_LOCK_CALL);
} else if (!lock_sys.wr_lock_try()) {
@@ -4799,7 +4868,6 @@ static void lock_rec_block_validate(const page_id_t page_id)
}
}
-
static my_bool lock_validate_table_locks(rw_trx_hash_element_t *element, void*)
{
lock_sys.assert_locked();
@@ -4853,6 +4921,7 @@ be suspended for some reason; if not, then puts the transaction and
the query thread to the lock wait state and inserts a waiting request
for a gap x-lock to the lock queue.
@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+TRANSACTIONAL_TARGET
dberr_t
lock_rec_insert_check_and_lock(
/*===========================*/
@@ -5043,7 +5112,6 @@ static my_bool lock_rec_other_trx_holds_expl_callback(
@param[in] rec user record
@param[in] id page identifier
*/
-
static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx,
const rec_t *rec,
const page_id_t id)
@@ -5572,6 +5640,7 @@ static void lock_cancel_waiting_and_release(lock_t *lock)
trx->mutex_unlock();
}
#ifdef WITH_WSREP
+TRANSACTIONAL_TARGET
void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx)
{
lock_sys.wr_lock(SRW_LOCK_CALL);
@@ -5588,19 +5657,21 @@ void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx)
#endif /* WITH_WSREP */
/** Cancel a waiting lock request.
-@param lock waiting lock request
-@param trx active transaction
-@param check_victim whether to check trx->lock.was_chosen_as_deadlock_victim
+@tparam check_victim whether to check for DB_DEADLOCK
+@param lock waiting lock request
+@param trx active transaction
@retval DB_SUCCESS if no lock existed
@retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
@retval DB_LOCK_WAIT if the lock was canceled */
-dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock, bool check_victim)
+template<bool check_victim>
+dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
{
mysql_mutex_assert_owner(&lock_sys.wait_mutex);
ut_ad(trx->lock.wait_lock == lock);
ut_ad(trx->state == TRX_STATE_ACTIVE);
dberr_t err= DB_SUCCESS;
-
+ /* This would be too large for a memory transaction, except in the
+ DB_DEADLOCK case, which was already tested in lock_trx_handle_wait(). */
if (lock->is_table())
{
if (!lock_sys.rd_lock_try())
@@ -5619,10 +5690,25 @@ dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock, bool check_victim)
{
resolve_table_lock:
dict_table_t *table= lock->un_member.tab_lock.table;
- table->lock_mutex_lock();
+ if (!table->lock_mutex_trylock())
+ {
+ /* The correct latching order is:
+ lock_sys.latch, table->lock_mutex_lock(), lock_sys.wait_mutex.
+ Thus, we must release lock_sys.wait_mutex for a blocking wait. */
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ table->lock_mutex_lock();
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ lock= trx->lock.wait_lock;
+ if (!lock)
+ goto retreat;
+ else if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ {
+ err= DB_DEADLOCK;
+ goto retreat;
+ }
+ }
if (lock->is_waiting())
lock_cancel_waiting_and_release(lock);
- table->lock_mutex_unlock();
/* Even if lock->is_waiting() did not hold above, we must return
DB_LOCK_WAIT, or otherwise optimistic parallel replication could
occasionally hang. Potentially affected tests:
@@ -5630,6 +5716,8 @@ resolve_table_lock:
rpl.rpl_parallel_optimistic_nobinlog
rpl.rpl_parallel_optimistic_xa_lsu_off */
err= DB_LOCK_WAIT;
+retreat:
+ table->lock_mutex_unlock();
}
lock_sys.rd_unlock();
}
@@ -5680,7 +5768,7 @@ void lock_sys_t::cancel(trx_t *trx)
if (!trx->dict_operation)
{
trx->error_state= DB_INTERRUPTED;
- cancel(trx, lock, false);
+ cancel<false>(trx, lock);
}
}
lock_sys.deadlock_check();
@@ -5733,7 +5821,7 @@ dberr_t lock_trx_handle_wait(trx_t *trx)
if (trx->lock.was_chosen_as_deadlock_victim)
err= DB_DEADLOCK;
else if (lock_t *wait_lock= trx->lock.wait_lock)
- err= lock_sys_t::cancel(trx, wait_lock, true);
+ err= lock_sys_t::cancel<true>(trx, wait_lock);
lock_sys.deadlock_check();
mysql_mutex_unlock(&lock_sys.wait_mutex);
return err;
@@ -5782,13 +5870,27 @@ static my_bool lock_table_locks_lookup(rw_trx_hash_element_t *element,
/** Check if there are any locks on a table.
@return true if table has either table or record locks. */
+TRANSACTIONAL_TARGET
bool lock_table_has_locks(dict_table_t *table)
{
if (table->n_rec_locks)
return true;
- table->lock_mutex_lock();
- auto len= UT_LIST_GET_LEN(table->locks);
- table->lock_mutex_unlock();
+ ulint len;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (table->lock_mutex_is_locked())
+ xabort();
+ len= UT_LIST_GET_LEN(table->locks);
+ xend();
+ }
+ else
+#endif
+ {
+ table->lock_mutex_lock();
+ len= UT_LIST_GET_LEN(table->locks);
+ table->lock_mutex_unlock();
+ }
if (len)
return true;
#ifdef UNIV_DEBUG
@@ -5968,7 +6070,7 @@ namespace Deadlock
static trx_t *report(trx_t *const trx, bool current_trx)
{
mysql_mutex_assert_owner(&lock_sys.wait_mutex);
- ut_ad(lock_sys.is_writer() == !current_trx);
+ ut_ad(xtest() || lock_sys.is_writer() == !current_trx);
/* Normally, trx should be a direct part of the deadlock
cycle. However, if innodb_deadlock_detect had been OFF in the
@@ -6001,6 +6103,9 @@ namespace Deadlock
undo_no_t victim_weight= ~0ULL;
unsigned victim_pos= 0, trx_pos= 0;
+ /* Here, lock elision does not make sense, because
+ for the output we are going to invoke system calls,
+ which would interrupt a memory transaction. */
if (current_trx && !lock_sys.wr_lock_try())
{
mysql_mutex_unlock(&lock_sys.wait_mutex);
@@ -6149,18 +6254,22 @@ static bool Deadlock::check_and_resolve(trx_t *trx)
return false;
if (lock_t *wait_lock= trx->lock.wait_lock)
- lock_sys_t::cancel(trx, wait_lock, false);
+ lock_sys_t::cancel<false>(trx, wait_lock);
lock_sys.deadlock_check();
return true;
}
/** Check for deadlocks while holding only lock_sys.wait_mutex. */
+TRANSACTIONAL_TARGET
void lock_sys_t::deadlock_check()
{
ut_ad(!is_writer());
mysql_mutex_assert_owner(&wait_mutex);
bool acquired= false;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool elided= false;
+#endif
if (Deadlock::to_be_checked)
{
@@ -6169,7 +6278,16 @@ void lock_sys_t::deadlock_check()
auto i= Deadlock::to_check.begin();
if (i == Deadlock::to_check.end())
break;
- if (!acquired)
+ if (acquired);
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ else if (xbegin())
+ {
+ if (latch.is_locked_or_waiting())
+ xabort();
+ acquired= elided= true;
+ }
+#endif
+ else
{
acquired= wr_lock_try();
if (!acquired)
@@ -6189,6 +6307,10 @@ void lock_sys_t::deadlock_check()
Deadlock::to_be_checked= false;
}
ut_ad(Deadlock::to_check.empty());
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ return;
+#endif
if (acquired)
wr_unlock();
}
@@ -6208,6 +6330,7 @@ void lock_update_split_and_merge(
const page_id_t l{left_block->page.id()};
const page_id_t r{right_block->page.id()};
+ /* This would likely be too large for a memory transaction. */
LockMultiGuard g{lock_sys.rec_hash, l, r};
const rec_t *left_next_rec= page_rec_get_next_const(orig_pred);
ut_ad(!page_rec_is_metadata(left_next_rec));
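
The hunk in lock_table_has_locks() above (repeated later in trx_has_lock_x() in trx0rec.cc) shows the elide-or-lock read pattern this change introduces: start a memory transaction, abort it if the mutex being elided is already held, and fall back to ordinary locking when elision is unavailable or the transaction aborts. A minimal standalone sketch of the same idea, written against the raw x86 RTM intrinsics rather than the patch's xbegin()/xabort()/xend() wrappers; spin_mutex, is_locked() and counter are illustrative stand-ins, and unlike the real code there is no retry loop. Build with g++ -O2 -mrtm on RTM-capable hardware:

#include <immintrin.h>
#include <atomic>

struct spin_mutex
{
  std::atomic<bool> locked{false};
  void lock() { while (locked.exchange(true, std::memory_order_acquire)) {} }
  void unlock() { locked.store(false, std::memory_order_release); }
  bool is_locked() const { return locked.load(std::memory_order_relaxed); }
};

static spin_mutex mtx;
static unsigned counter;

unsigned read_counter()
{
  unsigned value;
  if (_xbegin() == _XBEGIN_STARTED)
  {
    if (mtx.is_locked())
      _xabort(0);       /* the lock is held: give up the transaction */
    value= counter;     /* lock-free read; any conflict aborts us */
    _xend();
  }
  else
  {
    mtx.lock();         /* fallback path: conventional locking */
    value= counter;
    mtx.unlock();
  }
  return value;
}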
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index a33b79d284b..c2b9337a905 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -1146,14 +1146,19 @@ wait_suspend_loop:
if (!buf_pool.is_initialised()) {
ut_ad(!srv_was_started);
- } else if (ulint pending_io = buf_pool.io_pending()) {
+ } else if (const ulint writes = os_aio_pending_writes()) {
+wait_for_io:
if (srv_print_verbose_log && count > 600) {
- ib::info() << "Waiting for " << pending_io << " buffer"
- " page I/Os to complete";
+ ib::info() << "Waiting for "
+ << buf_pool.n_pend_reads + writes
+ << " buffer page I/Os to complete";
count = 0;
}
+ os_aio_wait_until_no_pending_writes();
goto loop;
+ } else if (buf_pool.some_io_pending()) {
+ goto wait_for_io;
} else {
buf_flush_buffer_pool();
}
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 05f7b1d053b..925d2919a60 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -2848,6 +2848,7 @@ func_exit:
/** Reads in pages which have hashed log records, from an area around a given
page number.
@param[in] page_id page id */
+TRANSACTIONAL_TARGET
static void recv_read_in_area(page_id_t page_id)
{
uint32_t page_nos[RECV_READ_AHEAD_AREA];
@@ -2862,7 +2863,9 @@ static void recv_read_in_area(page_id_t page_id)
&& i->first.space() == page_id.space()
&& i->first.page_no() < up_limit; i++) {
if (i->second.state == page_recv_t::RECV_NOT_PROCESSED
- && !buf_pool.page_hash_contains(i->first)) {
+ && !buf_pool.page_hash_contains(
+ i->first,
+ buf_pool.page_hash.cell_get(i->first.fold()))) {
i->second.state = page_recv_t::RECV_BEING_READ;
*p++ = i->first.page_no();
}
@@ -3181,7 +3184,7 @@ next_page:
/* Instead of flushing, last_batch could sort the buf_pool.flush_list
in ascending order of buf_page_t::oldest_modification. */
- buf_flush_sync();
+ buf_flush_sync_batch(recovered_lsn);
if (!last_batch)
{
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index ae3183703c2..b41687c9d2f 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -497,7 +497,7 @@ struct Shrink
ut_ad(id.space() == high.space());
ut_ad(bpage.state() == BUF_BLOCK_FILE_PAGE);
if (bpage.oldest_modification() > 1)
- bpage.clear_oldest_modification(false);
+ bpage.reset_oldest_modification();
slot->type= static_cast<mtr_memo_type_t>(slot->type & ~MTR_MEMO_MODIFY);
}
return true;
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index 7f99b6ef26b..545ddec7671 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -3811,8 +3811,10 @@ static void os_aio_wait_until_no_pending_writes_low()
tpool::tpool_wait_end();
}
-/** Wait until there are no pending asynchronous writes.
-Only used on FLUSH TABLES...FOR EXPORT. */
+/** @return the number of pending writes */
+size_t os_aio_pending_writes() { return write_slots->pending_io_count(); }
+
+/** Wait until there are no pending asynchronous writes. */
void os_aio_wait_until_no_pending_writes()
{
os_aio_wait_until_no_pending_writes_low();
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index b9245eb4325..5af291b30b4 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -2226,8 +2226,6 @@ row_import_cleanup(
DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE(););
- log_make_checkpoint();
-
return(err);
}
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index b67c1212271..9b2ea9db542 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -695,6 +695,7 @@ row_ins_set_detailed(
Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file
and displays information about the given transaction.
The caller must release dict_foreign_err_mutex. */
+TRANSACTIONAL_TARGET
static
void
row_ins_foreign_trx_print(
@@ -708,7 +709,7 @@ row_ins_foreign_trx_print(
ut_ad(!srv_read_only_mode);
{
- LockMutexGuard g{SRW_LOCK_CALL};
+ TMLockMutexGuard g{SRW_LOCK_CALL};
n_rec_locks = trx->lock.n_rec_locks;
n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
heap_size = mem_heap_get_size(trx->lock.lock_heap);
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index a26a862e1ab..a334e59ece0 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -69,6 +69,9 @@ Created 10/8/1995 Heikki Tuuri
#include "fil0pagecompress.h"
#include "trx0types.h"
#include <list>
+#include "log.h"
+
+#include "transactional_lock_guard.h"
#include <my_service_manager.h>
/* The following is the maximum allowed duration of a lock wait. */
@@ -566,7 +569,7 @@ char srv_buffer_pool_load_at_startup = TRUE;
#ifdef HAVE_PSI_STAGE_INTERFACE
/** Performance schema stage event for monitoring ALTER TABLE progress
-everything after flush log_make_checkpoint(). */
+in ha_innobase::commit_inplace_alter_table(). */
PSI_stage_info srv_stage_alter_table_end
= {0, "alter table (end)", PSI_FLAG_STAGE_PROGRESS};
@@ -693,13 +696,15 @@ srv_free(void)
/*********************************************************************//**
Boots the InnoDB server. */
-void
-srv_boot(void)
-/*==========*/
+void srv_boot()
{
- srv_thread_pool_init();
- trx_pool_init();
- srv_init();
+#ifndef NO_ELISION
+ if (transactional_lock_enabled())
+ sql_print_information("InnoDB: Using transactional memory");
+#endif
+ srv_thread_pool_init();
+ trx_pool_init();
+ srv_init();
}
/******************************************************************//**
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index 3f0ccf44b1e..45e20a57951 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -249,7 +249,8 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
}
DBUG_PRINT("ib_log", ("After innodb_log_abort_6"));
- DBUG_ASSERT(!buf_pool.any_io_pending());
+ DBUG_ASSERT(!buf_pool.some_io_pending());
+ DBUG_ASSERT(!os_aio_pending_writes());
DBUG_EXECUTE_IF("innodb_log_abort_7", return DB_ERROR;);
DBUG_PRINT("ib_log", ("After innodb_log_abort_7"));
@@ -325,6 +326,13 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
log_sys.log.write_header_durable(lsn);
+ ut_ad(srv_startup_is_before_trx_rollback_phase);
+ if (create_new_db) {
+ srv_startup_is_before_trx_rollback_phase = false;
+ }
+
+ /* Enable checkpoints in buf_flush_page_cleaner(). */
+ recv_sys.recovery_on = false;
mysql_mutex_unlock(&log_sys.mutex);
log_make_checkpoint();
@@ -879,92 +887,58 @@ buffer pools. Flush the redo log buffer to the redo log file.
@return lsn up to which data pages have been flushed.
static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists)
{
- DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
-
- lsn_t flushed_lsn;
- ulint count = 0;
-
- if (log_sys.log.subformat != 2) {
- srv_log_file_size = 0;
- }
+ DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
- for (;;) {
- /* Clean the buffer pool. */
- buf_flush_sync();
+ /* Disable checkpoints in the page cleaner. */
+ ut_ad(!recv_sys.recovery_on);
+ recv_sys.recovery_on= true;
- DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
- DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
+ buf_flush_sync();
- mysql_mutex_lock(&log_sys.mutex);
+ if (log_sys.log.subformat != 2)
+ srv_log_file_size= 0;
- fil_names_clear(log_sys.get_lsn(), false);
-
- flushed_lsn = log_sys.get_lsn();
+ DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
+ DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
- {
- ib::info info;
- if (srv_log_file_size == 0
- || (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED)
- != log_t::FORMAT_10_5) {
- info << "Upgrading redo log: ";
- } else if (!old_exists
- || srv_log_file_size
- != srv_log_file_size_requested) {
- if (srv_encrypt_log
- == (my_bool)log_sys.is_encrypted()) {
- info << (srv_encrypt_log
- ? "Resizing encrypted"
- : "Resizing");
- } else if (srv_encrypt_log) {
- info << "Encrypting and resizing";
- } else {
- info << "Removing encryption"
- " and resizing";
- }
-
- info << " redo log from " << srv_log_file_size
- << " to ";
- } else if (srv_encrypt_log) {
- info << "Encrypting redo log: ";
- } else {
- info << "Removing redo log encryption: ";
- }
-
- info << srv_log_file_size_requested
- << " bytes; LSN=" << flushed_lsn;
- }
-
- mysql_mutex_unlock(&log_sys.mutex);
+ mysql_mutex_lock(&log_sys.mutex);
+ fil_names_clear(log_sys.get_lsn(), false);
+ const lsn_t flushed_lsn= log_sys.get_lsn();
- if (flushed_lsn != log_sys.get_flushed_lsn()) {
- log_write_up_to(flushed_lsn, false);
- log_sys.log.flush();
- }
+ {
+ ib::info info;
+ if (srv_log_file_size == 0 ||
+ (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED) != log_t::FORMAT_10_5)
+ info << "Upgrading redo log: ";
+ else if (!old_exists || srv_log_file_size != srv_log_file_size_requested)
+ {
+ if (srv_encrypt_log == (my_bool)log_sys.is_encrypted())
+ info << (srv_encrypt_log ? "Resizing encrypted" : "Resizing");
+ else
+ info << (srv_encrypt_log
+ ? "Encrypting and resizing"
+ : "Removing encryption and resizing");
+
+ info << " redo log from " << srv_log_file_size << " to ";
+ }
+ else
+ info << (srv_encrypt_log
+ ? "Encrypting redo log: " : "Removing redo log encryption: ");
+ info << srv_log_file_size_requested << " bytes; LSN=" << flushed_lsn;
+ }
- ut_ad(flushed_lsn == log_sys.get_lsn());
-
- /* Check if the buffer pools are clean. If not
- retry till it is clean. */
- if (ulint pending_io = buf_pool.io_pending()) {
- count++;
- /* Print a message every 60 seconds if we
- are waiting to clean the buffer pools */
- if (srv_print_verbose_log && count > 600) {
- ib::info() << "Waiting for "
- << pending_io << " buffer "
- << "page I/Os to complete";
- count = 0;
- }
+ mysql_mutex_unlock(&log_sys.mutex);
- std::this_thread::sleep_for(
- std::chrono::milliseconds(100));
- continue;
- }
+ if (flushed_lsn != log_sys.get_flushed_lsn())
+ {
+ log_write_up_to(flushed_lsn, false);
+ log_sys.log.flush();
+ }
- break;
- }
+ ut_ad(flushed_lsn == log_sys.get_lsn());
+ ut_ad(!os_aio_pending_writes());
- DBUG_RETURN(flushed_lsn);
+ DBUG_RETURN(flushed_lsn);
}
/** Tries to locate LOG_FILE_NAME and check its size, etc
@@ -1241,7 +1215,7 @@ dberr_t srv_start(bool create_new_db)
ut_ad(buf_page_cleaner_is_active);
}
- srv_startup_is_before_trx_rollback_phase = !create_new_db;
+ srv_startup_is_before_trx_rollback_phase = true;
/* Check if undo tablespaces and redo log files exist before creating
a new system tablespace */
@@ -1290,7 +1264,6 @@ dberr_t srv_start(bool create_new_db)
if (create_new_db) {
flushed_lsn = log_sys.get_lsn();
log_sys.set_flushed_lsn(flushed_lsn);
- buf_flush_sync();
err = create_log_file(true, flushed_lsn, logfile0);
@@ -1347,6 +1320,9 @@ dberr_t srv_start(bool create_new_db)
if (!log_set_capacity(srv_log_file_size_requested)) {
return(srv_init_abort(DB_ERROR));
}
+
+ /* Enable checkpoints in the page cleaner. */
+ recv_sys.recovery_on = false;
}
file_checked:
@@ -1592,7 +1568,8 @@ file_checked:
ut_ad(srv_force_recovery <= SRV_FORCE_IGNORE_CORRUPT);
ut_ad(recv_no_log_write);
err = fil_write_flushed_lsn(log_sys.get_lsn());
- DBUG_ASSERT(!buf_pool.any_io_pending());
+ DBUG_ASSERT(!buf_pool.some_io_pending());
+ DBUG_ASSERT(!os_aio_pending_writes());
log_sys.log.close_file();
if (err == DB_SUCCESS) {
bool trunc = srv_operation
@@ -1636,7 +1613,8 @@ file_checked:
threads until creating a log checkpoint at the
end of create_log_file(). */
ut_d(recv_no_log_write = true);
- DBUG_ASSERT(!buf_pool.any_io_pending());
+ DBUG_ASSERT(!buf_pool.some_io_pending());
+ DBUG_ASSERT(!os_aio_pending_writes());
DBUG_EXECUTE_IF("innodb_log_abort_3",
return(srv_init_abort(DB_ERROR)););
@@ -1953,11 +1931,8 @@ void innodb_shutdown()
break;
case SRV_OPERATION_RESTORE:
case SRV_OPERATION_RESTORE_EXPORT:
- srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
- if (!buf_page_cleaner_is_active) {
- break;
- }
mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
while (buf_page_cleaner_is_active) {
pthread_cond_signal(&buf_pool.do_flush_list);
my_cond_wait(&buf_pool.done_flush_list,
diff --git a/storage/innobase/sync/srw_lock.cc b/storage/innobase/sync/srw_lock.cc
index 82f8d615477..b54191d91b0 100644
--- a/storage/innobase/sync/srw_lock.cc
+++ b/storage/innobase/sync/srw_lock.cc
@@ -19,6 +19,73 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "srw_lock.h"
#include "srv0srv.h"
#include "my_cpu.h"
+#include "transactional_lock_guard.h"
+
+#ifdef NO_ELISION
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+# include <intrin.h>
+bool have_transactional_memory;
+bool transactional_lock_enabled()
+{
+ int regs[4];
+ __cpuid(regs, 0);
+ if (regs[0] < 7)
+ return false;
+ __cpuidex(regs, 7, 0);
+ /* Restricted Transactional Memory (RTM) */
+ have_transactional_memory= regs[1] & 1U << 11;
+ return have_transactional_memory;
+}
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# include <cpuid.h>
+bool have_transactional_memory;
+bool transactional_lock_enabled()
+{
+ if (__get_cpuid_max(0, nullptr) < 7)
+ return false;
+ unsigned eax, ebx, ecx, edx;
+ __cpuid_count(7, 0, eax, ebx, ecx, edx);
+ /* Restricted Transactional Memory (RTM) */
+ have_transactional_memory= ebx & 1U << 11;
+ return have_transactional_memory;
+}
+
+# ifdef UNIV_DEBUG
+TRANSACTIONAL_TARGET
+bool xtest() { return have_transactional_memory && _xtest(); }
+# endif
+#elif defined __powerpc64__
+# ifdef __linux__
+# include <sys/auxv.h>
+
+# ifndef PPC_FEATURE2_HTM_NOSC
+# define PPC_FEATURE2_HTM_NOSC 0x01000000
+# endif
+# ifndef PPC_FEATURE2_HTM_NO_SUSPEND
+# define PPC_FEATURE2_HTM_NO_SUSPEND 0x00080000
+# endif
+
+# ifndef AT_HWCAP2
+# define AT_HWCAP2 26
+# endif
+# endif
+bool have_transactional_memory;
+bool transactional_lock_enabled()
+{
+# ifdef __linux__
+ return getauxval(AT_HWCAP2) &
+ (PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM_NO_SUSPEND);
+# endif
+}
+
+# ifdef UNIV_DEBUG
+TRANSACTIONAL_TARGET bool xtest()
+{
+ return have_transactional_memory &&
+ _HTM_STATE (__builtin_ttest ()) == _HTM_TRANSACTIONAL;
+}
+# endif
+#endif
/** @return the parameter for srw_pause() */
static inline unsigned srw_pause_delay()
@@ -477,7 +544,7 @@ template<> void srw_lock_<true>::rd_wait()
return;
}
- IF_WIN(AcquireSRWLockShared(&lock), rw_rdlock(&lock));
+ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk));
}
template<> void srw_lock_<true>::wr_wait()
@@ -491,7 +558,7 @@ template<> void srw_lock_<true>::wr_wait()
return;
}
- IF_WIN(AcquireSRWLockExclusive(&lock), rw_wrlock(&lock));
+ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk));
}
#endif
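
For reference, the detection logic added above can be exercised on its own. The following illustrative standalone program (not MariaDB code) performs the same check as the __GNUC__/x86 branch of transactional_lock_enabled(): CPUID leaf 7, subleaf 0, EBX bit 11 advertises Restricted Transactional Memory.

#include <cpuid.h>
#include <cstdio>

int main()
{
  bool rtm= false;
  if (__get_cpuid_max(0, nullptr) >= 7)
  {
    unsigned eax, ebx, ecx, edx;
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    rtm= ebx & 1U << 11;   /* CPUID.(EAX=7,ECX=0):EBX.RTM[bit 11] */
  }
  std::printf("RTM %s\n", rtm ? "available" : "not available");
  return 0;
}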
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
index 3dd1b093cf6..5b4e6fb8957 100644
--- a/storage/innobase/trx/trx0purge.cc
+++ b/storage/innobase/trx/trx0purge.cc
@@ -77,7 +77,7 @@ TrxUndoRsegsIterator::TrxUndoRsegsIterator()
/** Sets the next rseg to purge in purge_sys.
Executed in the purge coordinator thread.
@return whether anything is to be purged */
-inline bool TrxUndoRsegsIterator::set_next()
+TRANSACTIONAL_INLINE inline bool TrxUndoRsegsIterator::set_next()
{
mysql_mutex_lock(&purge_sys.pq_mutex);
@@ -110,23 +110,38 @@ inline bool TrxUndoRsegsIterator::set_next()
purge_sys.rseg = *m_iter++;
mysql_mutex_unlock(&purge_sys.pq_mutex);
- purge_sys.rseg->latch.rd_lock();
- ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
- ut_ad(purge_sys.rseg->last_trx_no() == m_rsegs.trx_no);
-
- /* We assume in purge of externally stored fields that space id is
- in the range of UNDO tablespace space ids */
+ /* We assume in purge of externally stored fields that space
+ id is in the range of UNDO tablespace space ids */
ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE
|| srv_is_undo_tablespace(purge_sys.rseg->space->id));
- ut_a(purge_sys.tail.trx_no <= purge_sys.rseg->last_trx_no());
+ trx_id_t last_trx_no, tail_trx_no;
+ {
+#ifdef SUX_LOCK_GENERIC
+ purge_sys.rseg->latch.rd_lock();
+#else
+ transactional_shared_lock_guard<srw_spin_lock_low> rg
+ {purge_sys.rseg->latch};
+#endif
+ last_trx_no = purge_sys.rseg->last_trx_no();
+ tail_trx_no = purge_sys.tail.trx_no;
- purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no();
- purge_sys.hdr_offset = purge_sys.rseg->last_offset();
- purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+ purge_sys.tail.trx_no = last_trx_no;
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+#ifdef SUX_LOCK_GENERIC
+ purge_sys.rseg->latch.rd_unlock();
+#endif
+ }
+
+ /* Only the purge coordinator task will access
+ purge_sys.rseg_iter or purge_sys.hdr_page_no. */
+ ut_ad(last_trx_no == m_rsegs.trx_no);
+ ut_a(purge_sys.hdr_page_no != FIL_NULL);
+ ut_a(tail_trx_no <= last_trx_no);
- purge_sys.rseg->latch.rd_unlock();
return(true);
}
@@ -550,7 +565,7 @@ __attribute__((optimize(0)))
Removes unnecessary history data from rollback segments. NOTE that when this
function is called, the caller must not have any latches on undo log pages!
*/
-static void trx_purge_truncate_history()
+TRANSACTIONAL_TARGET static void trx_purge_truncate_history()
{
ut_ad(purge_sys.head <= purge_sys.tail);
purge_sys_t::iterator &head= purge_sys.head.trx_no
@@ -617,12 +632,18 @@ static void trx_purge_truncate_history()
{
if (rseg.space != &space)
continue;
+#ifdef SUX_LOCK_GENERIC
rseg.latch.rd_lock();
+#else
+ transactional_shared_lock_guard<srw_spin_lock_low> g{rseg.latch};
+#endif
ut_ad(rseg.skip_allocation());
if (rseg.is_referenced())
{
not_free:
+#ifdef SUX_LOCK_GENERIC
rseg.latch.rd_unlock();
+#endif
return;
}
@@ -645,7 +666,9 @@ not_free:
goto not_free;
}
+#ifdef SUX_LOCK_GENERIC
rseg.latch.rd_unlock();
+#endif
}
ib::info() << "Truncating " << file->name;
@@ -692,7 +715,7 @@ not_free:
if (bpage->oldest_modification() > 1)
{
- bpage->clear_oldest_modification(false);
+ bpage->reset_oldest_modification();
mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
}
else
@@ -938,10 +961,7 @@ Chooses the next undo log to purge and updates the info in purge_sys. This
function is used to initialize purge_sys when the next record to purge is
not known, and also to update the purge system info on the next record when
purge has handled the whole undo log for a transaction. */
-static
-void
-trx_purge_choose_next_log(void)
-/*===========================*/
+TRANSACTIONAL_TARGET static void trx_purge_choose_next_log()
{
ut_ad(!purge_sys.next_stored);
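
The hunks in TrxUndoRsegsIterator::set_next() and trx_purge_truncate_history() above wrap the shared rseg latch in a transactional_shared_lock_guard when elision is available, keeping the plain rd_lock()/rd_unlock() pair only for SUX_LOCK_GENERIC builds. One plausible shape for such a guard, as a purely illustrative sketch rather than the contents of transactional_lock_guard.h; is_write_locked(), rd_lock() and rd_unlock() are assumed accessors on the latch, and the raw RTM intrinsics stand in for the patch's wrappers:

#include <immintrin.h>   /* raw RTM intrinsics; build with -mrtm */

template<typename SharedLatch>
class tm_shared_guard
{
  SharedLatch &latch;
  bool elided;
public:
  explicit tm_shared_guard(SharedLatch &l) : latch(l), elided(false)
  {
    if (_xbegin() == _XBEGIN_STARTED)
    {
      if (l.is_write_locked())   /* a writer holds it: elision is unsafe */
        _xabort(0);
      elided= true;              /* reads run inside the memory transaction */
      return;
    }
    latch.rd_lock();             /* fallback: really acquire the shared latch */
  }
  ~tm_shared_guard()
  {
    if (elided)
      _xend();
    else
      latch.rd_unlock();
  }
  tm_shared_guard(const tm_shared_guard&)= delete;
  tm_shared_guard &operator=(const tm_shared_guard&)= delete;
};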
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
index 08e05edb896..45bd36d9669 100644
--- a/storage/innobase/trx/trx0rec.cc
+++ b/storage/innobase/trx/trx0rec.cc
@@ -1948,16 +1948,30 @@ dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
return err;
}
-ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
+TRANSACTIONAL_TARGET ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
/** @return whether the transaction holds an exclusive lock on a table */
static bool trx_has_lock_x(const trx_t &trx, dict_table_t& table)
{
if (table.is_temporary())
return true;
- table.lock_mutex_lock();
- const auto n= table.n_lock_x_or_s;
- table.lock_mutex_unlock();
+ uint32_t n;
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (table.lock_mutex_is_locked())
+ xabort();
+ n= table.n_lock_x_or_s;
+ xend();
+ }
+ else
+#endif
+ {
+ table.lock_mutex_lock();
+ n= table.n_lock_x_or_s;
+ table.lock_mutex_unlock();
+ }
/* This thread is executing trx. No other thread can modify our table locks
(only record locks might be created, in an implicit-to-explicit conversion).
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 61e1fd50cd5..18c93d5a8cc 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -456,7 +456,7 @@ void trx_t::free()
}
/** Transition to committed state, to release implicit locks. */
-inline void trx_t::commit_state()
+TRANSACTIONAL_INLINE inline void trx_t::commit_state()
{
ut_ad(state == TRX_STATE_PREPARED
|| state == TRX_STATE_PREPARED_RECOVERED
@@ -473,9 +473,8 @@ inline void trx_t::commit_state()
makes modifications to the database, will get an lsn larger than the
committing transaction T. In the case where the log flush fails, and
T never gets committed, also T2 will never get committed. */
- mutex.wr_lock();
+ TMTrxGuard tg{*this};
state= TRX_STATE_COMMITTED_IN_MEMORY;
- mutex.wr_unlock();
ut_ad(id || !is_referenced());
}
@@ -498,8 +497,7 @@ inline void trx_t::release_locks()
}
/** At shutdown, frees a transaction object. */
-void
-trx_free_at_shutdown(trx_t *trx)
+TRANSACTIONAL_TARGET void trx_free_at_shutdown(trx_t *trx)
{
ut_ad(trx->is_recovered);
ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
@@ -1228,7 +1226,7 @@ void trx_t::evict_table(table_id_t table_id, bool reset_only)
}
/** Mark a transaction committed in the main memory data structures. */
-inline void trx_t::commit_in_memory(const mtr_t *mtr)
+TRANSACTIONAL_INLINE inline void trx_t::commit_in_memory(const mtr_t *mtr)
{
must_flush_log_later= false;
read_view.close();
@@ -1395,7 +1393,7 @@ void trx_t::commit_cleanup()
/** Commit the transaction in a mini-transaction.
@param mtr mini-transaction (if there are any persistent modifications) */
-void trx_t::commit_low(mtr_t *mtr)
+TRANSACTIONAL_TARGET void trx_t::commit_low(mtr_t *mtr)
{
ut_ad(!mtr || mtr->is_active());
ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK);
@@ -1771,6 +1769,7 @@ trx_print_latched(
/**********************************************************************//**
Prints info about a transaction.
Acquires and releases lock_sys.latch. */
+TRANSACTIONAL_TARGET
void
trx_print(
/*======*/
@@ -1781,7 +1780,7 @@ trx_print(
{
ulint n_rec_locks, n_trx_locks, heap_size;
{
- LockMutexGuard g{SRW_LOCK_CALL};
+ TMLockMutexGuard g{SRW_LOCK_CALL};
n_rec_locks= trx->lock.n_rec_locks;
n_trx_locks= UT_LIST_GET_LEN(trx->lock.trx_locks);
heap_size= mem_heap_get_size(trx->lock.lock_heap);
@@ -1833,6 +1832,7 @@ static lsn_t trx_prepare_low(trx_t *trx)
/****************************************************************//**
Prepares a transaction. */
+TRANSACTIONAL_TARGET
static
void
trx_prepare(
@@ -1848,9 +1848,10 @@ trx_prepare(
DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE(););
ut_a(trx->state == TRX_STATE_ACTIVE);
- trx->mutex_lock();
- trx->state = TRX_STATE_PREPARED;
- trx->mutex_unlock();
+ {
+ TMTrxGuard tg{*trx};
+ trx->state = TRX_STATE_PREPARED;
+ }
if (lsn) {
/* Depending on the my.cnf options, we may now write the log