diff options
39 files changed, 1303 insertions, 1900 deletions
diff --git a/extra/mariabackup/fil_cur.cc b/extra/mariabackup/fil_cur.cc index 8f06005a9e4..7aa20b8700e 100644 --- a/extra/mariabackup/fil_cur.cc +++ b/extra/mariabackup/fil_cur.cc @@ -93,7 +93,6 @@ xb_fil_node_close_file( mutex_enter(&fil_system.mutex); ut_ad(node); - ut_a(node->n_pending == 0); ut_a(node->n_pending_flushes == 0); ut_a(!node->being_extended); @@ -108,20 +107,10 @@ xb_fil_node_close_file( ut_a(ret); node->handle = OS_FILE_CLOSED; + mutex_exit(&fil_system.mutex); ut_a(fil_system.n_open > 0); fil_system.n_open--; - - if (node->space->purpose == FIL_TYPE_TABLESPACE && - fil_is_user_tablespace_id(node->space->id)) { - - ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0); - - /* The node is in the LRU list, remove it */ - UT_LIST_REMOVE(fil_system.LRU, node); - } - - mutex_exit(&fil_system.mutex); } /************************************************************************ @@ -180,18 +169,8 @@ xb_fil_cur_open( return(XB_FIL_CUR_SKIP); } - mutex_enter(&fil_system.mutex); fil_system.n_open++; - - if (node->space->purpose == FIL_TYPE_TABLESPACE && - fil_is_user_tablespace_id(node->space->id)) { - - /* Put the node to the LRU list */ - UT_LIST_ADD_FIRST(fil_system.LRU, node); - } - - mutex_exit(&fil_system.mutex); } ut_ad(node->is_open()); @@ -427,7 +406,7 @@ xb_fil_cur_read( retry_count = 10; ret = XB_FIL_CUR_SUCCESS; - fil_space_t *space = fil_space_acquire_for_io(cursor->space_id); + fil_space_t *space = fil_space_t::get_for_io(cursor->space_id); if (!space) { return XB_FIL_CUR_ERROR; diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 3189bcd14cb..39025862276 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -3011,6 +3011,7 @@ void xb_fil_io_init() { fil_system.create(srv_file_per_table ? 50000 : 5000); + fil_system.freeze_space_list = 1; fil_system.space_id_reuse_warned = true; } @@ -3087,24 +3088,16 @@ xb_load_single_table_tablespace( bool is_empty_file = file->exists() && file->is_empty_file(); if (err == DB_SUCCESS && file->space_id() != SRV_TMP_SPACE_ID) { - os_offset_t node_size = os_file_get_size(file->handle()); - os_offset_t n_pages; - - ut_a(node_size != (os_offset_t) -1); - - n_pages = node_size / fil_space_t::physical_size(file->flags()); - - space = fil_space_create( + space = fil_space_t::create( name, file->space_id(), file->flags(), FIL_TYPE_TABLESPACE, NULL/* TODO: crypt_data */); ut_a(space != NULL); - space->add(file->filepath(), OS_FILE_CLOSED, uint32_t(n_pages), - false, false); + space->add(file->filepath(), OS_FILE_CLOSED, 0, false, false); /* by opening the tablespace we forcing node and space objects in the cache to be populated with fields from space header */ - space->open(); + space->get_size(); if (srv_operation == SRV_OPERATION_RESTORE_DELTA || xb_close_files) { @@ -3406,19 +3399,6 @@ xb_load_tablespaces() return(DB_SUCCESS); } -/************************************************************************ -Initialize the tablespace memory cache and populate it by scanning for and -opening data files. -@returns DB_SUCCESS or error code.*/ -static -dberr_t -xb_data_files_init() -{ - xb_fil_io_init(); - - return(xb_load_tablespaces()); -} - /** Destroy the tablespace memory cache. */ static void xb_data_files_close() { @@ -4607,6 +4587,22 @@ xb_delta_open_matching_space( return file; } + if (!info.space_id && fil_system.sys_space) { + fil_node_t *node + = UT_LIST_GET_FIRST(fil_system.sys_space->chain); + for (; node; node = UT_LIST_GET_NEXT(chain, node)) { + if (!strcmp(node->name, real_name)) { + break; + } + } + if (node && node->handle != OS_FILE_CLOSED) { + *success = true; + return node->handle; + } + msg("mariabackup: Cannot find file %s\n", real_name); + return OS_FILE_CLOSED; + } + log_mutex_enter(); if (!fil_is_user_tablespace_id(info.space_id)) { found: @@ -4704,8 +4700,8 @@ exit: ut_ad(fil_space_t::zip_size(flags) == info.zip_size); ut_ad(fil_space_t::physical_size(flags) == info.page_size); - if (fil_space_create(dest_space_name, info.space_id, flags, - FIL_TYPE_TABLESPACE, 0)) { + if (fil_space_t::create(dest_space_name, info.space_id, flags, + FIL_TYPE_TABLESPACE, 0)) { *success = xb_space_create_file(real_name, info.space_id, flags, &file); } else { @@ -4925,7 +4921,7 @@ xtrabackup_apply_delta( os_file_close(src_file); os_file_delete(0,src_path); } - if (dst_file != OS_FILE_CLOSED) + if (dst_file != OS_FILE_CLOSED && info.space_id) os_file_close(dst_file); return TRUE; @@ -4933,7 +4929,7 @@ error: aligned_free(incremental_buffer); if (src_file != OS_FILE_CLOSED) os_file_close(src_file); - if (dst_file != OS_FILE_CLOSED) + if (dst_file != OS_FILE_CLOSED && info.space_id) os_file_close(dst_file); msg("Error: xtrabackup_apply_delta(): " "failed to apply %s to %s.\n", src_path, dst_path); @@ -5387,8 +5383,8 @@ static bool xtrabackup_prepare_func(char** argv) srv_allow_writes_event = os_event_create(0); os_event_set(srv_allow_writes_event); #endif - dberr_t err = xb_data_files_init(); - if (err != DB_SUCCESS) { + xb_fil_io_init(); + if (dberr_t err = xb_load_tablespaces()) { msg("mariabackup: error: xb_data_files_init() failed " "with error %s\n", ut_strerr(err)); goto error_cleanup; @@ -5396,7 +5392,8 @@ static bool xtrabackup_prepare_func(char** argv) inc_dir_tables_hash.create(1000); - ok = xtrabackup_apply_deltas(); + ok = fil_system.sys_space->open(false) + && xtrabackup_apply_deltas(); xb_data_files_close(); @@ -5426,6 +5423,8 @@ static bool xtrabackup_prepare_func(char** argv) goto error_cleanup; } + fil_system.freeze_space_list = 0; + /* increase IO threads */ if (srv_n_file_io_threads < 10) { srv_n_read_io_threads = 4; @@ -5447,6 +5446,8 @@ static bool xtrabackup_prepare_func(char** argv) goto error_cleanup; } + ut_ad(!fil_system.freeze_space_list); + if (ok) { msg("Last binlog file %s, position %lld", trx_sys.recovered_binlog_filename, diff --git a/mysql-test/suite/encryption/t/innodb-remove-encryption.test b/mysql-test/suite/encryption/t/innodb-remove-encryption.test index 90c6925d125..aeafd99325b 100644 --- a/mysql-test/suite/encryption/t/innodb-remove-encryption.test +++ b/mysql-test/suite/encryption/t/innodb-remove-encryption.test @@ -29,6 +29,7 @@ create table t1(a int not null primary key, b char(200)) engine=innodb; --source include/wait_condition.inc SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0; +--sorted_result SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0; --echo # Success! @@ -41,6 +42,7 @@ SET GLOBAL innodb_encrypt_tables = off; --let $wait_condition=SELECT COUNT(*) = $tables_count FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0 AND ROTATING_OR_FLUSHING = 0; --source include/wait_condition.inc +--sorted_result SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0; SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0; @@ -51,6 +53,7 @@ SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_ --let $restart_parameters=--skip-file-key-management --innodb-encrypt-tables=OFF --innodb-encryption-threads=0 --innodb-tablespaces-encryption -- source include/restart_mysqld.inc +--sorted_result SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0; SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0; diff --git a/mysql-test/suite/encryption/t/innodb_encrypt_key_rotation_age.test b/mysql-test/suite/encryption/t/innodb_encrypt_key_rotation_age.test index bc4c43e1ce8..ef38560c469 100644 --- a/mysql-test/suite/encryption/t/innodb_encrypt_key_rotation_age.test +++ b/mysql-test/suite/encryption/t/innodb_encrypt_key_rotation_age.test @@ -26,6 +26,7 @@ let $restart_parameters= --innodb_encryption_threads=5 --innodb_encryption_rotat --source include/wait_condition.inc SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0; +--sorted_result SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0; --echo # Restart the server with innodb_encryption_rotate_key_age= 0 @@ -45,6 +46,7 @@ create table t4 (f1 int not null)engine=innodb encrypted=NO; SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0; +--sorted_result SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0; --echo # Disable encryption when innodb_encryption_rotate_key_age is 0 @@ -57,6 +59,7 @@ set global innodb_encrypt_tables = OFF; --let $wait_condition=SELECT COUNT(*) >= $tables_count FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0 AND ROTATING_OR_FLUSHING = 0; --source include/wait_condition.inc +--sorted_result SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0; --echo # Display only encrypted create tables (t3) SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0; @@ -73,11 +76,13 @@ set global innodb_encrypt_tables = ON; SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0; --echo # Display only unencrypted create tables (t4) +--sorted_result SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0; --let $restart_parameters= -- source include/restart_mysqld.inc SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0; +--sorted_result SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0; DROP TABLE t4, t3, t2, t1; diff --git a/mysql-test/suite/innodb/r/table_definition_cache_debug.result b/mysql-test/suite/innodb/r/table_definition_cache_debug.result index 2c2c6de44ae..df171c89cd4 100644 --- a/mysql-test/suite/innodb/r/table_definition_cache_debug.result +++ b/mysql-test/suite/innodb/r/table_definition_cache_debug.result @@ -1,4 +1,4 @@ -call mtr.add_suppression("InnoDB: innodb_open_files=13 is exceeded"); +call mtr.add_suppression("InnoDB: innodb_open_files=.* is exceeded"); SET @save_tdc= @@GLOBAL.table_definition_cache; SET @save_toc= @@GLOBAL.table_open_cache; SET GLOBAL table_definition_cache= 400; diff --git a/mysql-test/suite/innodb/t/innodb-trim.test b/mysql-test/suite/innodb/t/innodb-trim.test index 0f38ea5ba84..3f8eb5f2c71 100644 --- a/mysql-test/suite/innodb/t/innodb-trim.test +++ b/mysql-test/suite/innodb/t/innodb-trim.test @@ -32,18 +32,6 @@ commit; set autocommit=1; -let $success= `SELECT variable_value FROM information_schema.global_status WHERE variable_name = 'innodb_num_page_compressed_trim_op'`; - -if (!$success) { ---disable_query_log ---disable_result_log - DROP PROCEDURE innodb_insert_proc; - DROP TABLE innodb_page_compressed; ---enable_query_log ---enable_result_log - --skip "Test requires TRIM"; -} - DROP PROCEDURE innodb_insert_proc; DROP TABLE innodb_page_compressed; diff --git a/mysql-test/suite/innodb/t/table_definition_cache_debug.test b/mysql-test/suite/innodb/t/table_definition_cache_debug.test index 70467b53435..6a466af4cc5 100644 --- a/mysql-test/suite/innodb/t/table_definition_cache_debug.test +++ b/mysql-test/suite/innodb/t/table_definition_cache_debug.test @@ -4,7 +4,7 @@ # This test is slow on buildbot. --source include/big_test.inc -call mtr.add_suppression("InnoDB: innodb_open_files=13 is exceeded"); +call mtr.add_suppression("InnoDB: innodb_open_files=.* is exceeded"); SET @save_tdc= @@GLOBAL.table_definition_cache; SET @save_toc= @@GLOBAL.table_open_cache; diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 4fac94d211e..10f183790a7 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -1,3 +1,4 @@ + # Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2014, 2020, MariaDB Corporation. # @@ -186,7 +187,6 @@ SET(INNOBASE_SOURCES include/mtr0mtr.h include/mtr0mtr.ic include/mtr0types.h - include/os0api.h include/os0event.h include/os0file.h include/os0file.ic diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 81a04701da1..c280ed555fe 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -3304,21 +3304,34 @@ upd_sys: /** Prefetch siblings of the leaf for the pessimistic operation. -@param block leaf page */ -static void btr_cur_prefetch_siblings(const buf_block_t* block) +@param block leaf page +@param index index of the page */ +static void btr_cur_prefetch_siblings(const buf_block_t *block, + const dict_index_t *index) { - const page_t *page= block->frame; - ut_ad(page_is_leaf(page)); + ut_ad(page_is_leaf(block->frame)); + + if (index->is_ibuf()) + return; + const page_t *page= block->frame; uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV)); uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT)); if (prev != FIL_NULL) - buf_read_page_background(page_id_t(block->page.id().space(), prev), + { + ut_a(index->table->space->acquire_for_io()); + buf_read_page_background(index->table->space, + page_id_t(block->page.id().space(), prev), block->zip_size(), false); + } if (next != FIL_NULL) - buf_read_page_background(page_id_t(block->page.id().space(), next), + { + ut_a(index->table->space->acquire_for_io()); + buf_read_page_background(index->table->space, + page_id_t(block->page.id().space(), next), block->zip_size(), false); + } } /*************************************************************//** @@ -3436,8 +3449,8 @@ fail: /* prefetch siblings of the leaf for the pessimistic operation, if the page is leaf. */ - if (page_is_leaf(page) && !index->is_ibuf()) { - btr_cur_prefetch_siblings(block); + if (page_is_leaf(page)) { + btr_cur_prefetch_siblings(block, index); } fail_err: @@ -4575,7 +4588,7 @@ any_extern: /* prefetch siblings of the leaf for the pessimistic operation. */ - btr_cur_prefetch_siblings(block); + btr_cur_prefetch_siblings(block, index); return(DB_OVERFLOW); } @@ -4766,10 +4779,10 @@ func_exit: } } - if (err != DB_SUCCESS && !index->is_ibuf()) { + if (err != DB_SUCCESS) { /* prefetch siblings of the leaf for the pessimistic operation. */ - btr_cur_prefetch_siblings(block); + btr_cur_prefetch_siblings(block, index); } return(err); @@ -5481,7 +5494,7 @@ btr_cur_optimistic_delete_func( if (!no_compress_needed) { /* prefetch siblings of the leaf for the pessimistic operation. */ - btr_cur_prefetch_siblings(block); + btr_cur_prefetch_siblings(block, cursor->index); goto func_exit; } diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index daf5e1aa511..2046ffd4273 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2768,7 +2768,7 @@ buf_zip_decompress( ulint size = page_zip_get_size(&block->page.zip); /* The tablespace will not be found if this function is called during IMPORT. */ - fil_space_t* space= fil_space_acquire_for_io(block->page.id().space()); + fil_space_t* space= fil_space_t::get_for_io(block->page.id().space()); const unsigned key_version = mach_read_from_4( frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL; @@ -3034,10 +3034,9 @@ buf_page_get_low( /* fall through */ case BUF_GET: case BUF_GET_IF_IN_POOL_OR_WATCH: - fil_space_t* s = fil_space_acquire_for_io(page_id.space()); + fil_space_t* s = fil_space_get(page_id.space()); ut_ad(s); ut_ad(s->zip_size() == zip_size); - s->release_for_io(); } #endif /* UNIV_DEBUG */ @@ -3107,7 +3106,7 @@ lookup: } /* The call path is buf_read_page() -> - buf_read_page_low() (fil_io()) -> + buf_read_page_low() (fil_space_t::io()) -> buf_page_read_complete() -> buf_decrypt_after_read(). Here fil_space_t* is used and we decrypt -> buf_page_check_corrupt() where page @@ -3161,8 +3160,7 @@ lookup: asserting. */ if (page_id.space() == TRX_SYS_SPACE) { } else if (page_id.space() == SRV_TMP_SPACE_ID) { - } else if (fil_space_t* space - = fil_space_acquire_for_io( + } else if (fil_space_t* space= fil_space_t::get_for_io( page_id.space())) { bool set = dict_set_corrupted_by_space(space); space->release_for_io(); @@ -3376,8 +3374,8 @@ re_evict: if (mode != BUF_GET_IF_IN_POOL && mode != BUF_GET_IF_IN_POOL_OR_WATCH) { } else if (!ibuf_debug) { - } else if (fil_space_t* space = - fil_space_acquire_for_io(page_id.space())) { + } else if (fil_space_t* space + = fil_space_t::get_for_io(page_id.space())) { /* Try to evict the block from the buffer pool, to use the insert buffer (change buffer) as much as possible. */ @@ -4869,17 +4867,4 @@ std::ostream& operator<<(std::ostream &out, const page_id_t page_id) << ", page number=" << page_id.page_no() << "]"; return out; } - -/** -Calculate the length of trim (punch_hole) operation. -@param[in] bpage Page control block -@param[in] write_length Write length -@return length of the trim or zero. */ -ulint -buf_page_get_trim_length( - const buf_page_t* bpage, - ulint write_length) -{ - return bpage->physical_size() - write_length; -} #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index ad515e4e194..6b1a32d8930 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -125,7 +125,8 @@ too_small: byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + trx_sys_block->frame; - for (uint32_t prev_page_no= 0, i= 0; i < 2 * size + FSP_EXTENT_SIZE / 2; i++) + for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE; + i < 2 * size + extent_size / 2; i++) { buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1, FSP_UP, &mtr); @@ -362,15 +363,13 @@ void buf_dblwr_t::recover() continue; } - fil_space_t* space= fil_space_acquire_for_io(space_id); + fil_space_t *space= fil_space_t::get_for_io(space_id); if (!space) /* The tablespace that this page once belonged to does not exist */ continue; - fil_space_open_if_needed(space); - - if (UNIV_UNLIKELY(page_no >= space->size)) + if (UNIV_UNLIKELY(page_no >= space->get_size())) { /* Do not report the warning for undo tablespaces, because they can be truncated in place. */ @@ -385,7 +384,6 @@ next_page: } const ulint physical_size= space->physical_size(); - const ulint zip_size= space->zip_size(); ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size))); /* We want to ensure that for partial reads the unread portion of @@ -393,18 +391,15 @@ next_page: memset(read_buf, 0x0, physical_size); /* Read in the actual page from the file */ - fil_io_t fio= fil_io(IORequest(IORequest::READ | IORequest::DBLWR_RECOVER), - true, page_id, zip_size, - 0, physical_size, read_buf, nullptr); + fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER), + os_offset_t{page_no} * physical_size, + physical_size, read_buf); if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) ib::warn() << "Double write buffer recovery: " << page_id << " (tablespace '" << space->name << "') read failed with error: " << fio.err; - if (fio.node) - fio.node->space->release_for_io(); - if (buf_is_zeroes(span<const byte>(read_buf, physical_size))) { /* We will check if the copy in the doublewrite buffer is @@ -425,17 +420,15 @@ next_page: /* Write the good page from the doublewrite buffer to the intended position. */ - fio= fil_io(IORequestWrite, true, page_id, zip_size, 0, physical_size, - page, nullptr); + space->reacquire_for_io(); + fio= space->io(IORequestWrite, + os_offset_t{page_id.page_no()} * physical_size, + physical_size, page); - if (fio.node) - { - ut_ad(fio.err == DB_SUCCESS); + if (fio.err == DB_SUCCESS) ib::info() << "Recovered page " << page_id << " to '" << fio.node->name << "' from the doublewrite buffer."; - fio.node->space->release_for_io(); - goto next_page; - } + goto next_page; } recv_sys.dblwr.pages.clear(); @@ -513,7 +506,7 @@ static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s) static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page) { - if (fil_space_t *space= fil_space_acquire_for_io(b.id().space())) + if (fil_space_t *space= fil_space_t::get_for_io(b.id().space())) { buf_dblwr_check_page_lsn(page, *space); space->release_for_io(); @@ -577,7 +570,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size) #ifdef UNIV_DEBUG for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++) { - buf_page_t *bpage= buf_block_arr[i].bpage; + buf_page_t *bpage= buf_block_arr[i].request.bpage; if (bpage->zip.data) /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */ @@ -590,18 +583,22 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size) } #endif /* UNIV_DEBUG */ /* Write out the first block of the doublewrite buffer */ - fil_io_t fio= fil_io(IORequestWrite, true, block1, 0, 0, - std::min(size, old_first_free) << srv_page_size_shift, - write_buf, nullptr); - fio.node->space->release_for_io(); + ut_a(fil_system.sys_space->acquire_for_io()); + fil_system.sys_space->io(IORequestWrite, + os_offset_t{block1.page_no()} << + srv_page_size_shift, + std::min(size, old_first_free) << + srv_page_size_shift, write_buf); if (old_first_free > size) { /* Write out the second block of the doublewrite buffer. */ - fio= fil_io(IORequestWrite, true, block2, 0, 0, - (old_first_free - size) << srv_page_size_shift, - write_buf + (size << srv_page_size_shift), nullptr); - fio.node->space->release_for_io(); + ut_a(fil_system.sys_space->acquire_for_io()); + fil_system.sys_space->io(IORequestWrite, + os_offset_t{block2.page_no()} << + srv_page_size_shift, + (old_first_free - size) << srv_page_size_shift, + write_buf + (size << srv_page_size_shift)); } /* increment the doublewrite flushed pages counter */ @@ -609,7 +606,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size) srv_stats.dblwr_writes.inc(); /* Now flush the doublewrite buffer data to disk */ - fil_flush(TRX_SYS_SPACE); + fil_system.sys_space->flush(); /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer @@ -629,8 +626,8 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size) for (ulint i= 0; i < old_first_free; i++) { auto e= buf_block_arr[i]; - buf_page_t* bpage= e.bpage; - ut_a(bpage->in_file()); + buf_page_t* bpage= e.request.bpage; + ut_ad(bpage->in_file()); /* We request frame here to get correct buffer in case of encryption and/or page compression */ @@ -650,8 +647,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size) ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame))); } - fil_io(IORequest(IORequest::WRITE, bpage, e.lru), false, - bpage->id(), bpage->zip_size(), 0, e_size, frame, bpage); + e.space->io(e.request, bpage->physical_offset(), e_size, frame, bpage); } return true; @@ -680,12 +676,20 @@ void buf_dblwr_t::flush_buffered_writes() /** Schedule a page write. If the doublewrite memory buffer is full, flush_buffered_writes() will be invoked to make space. -@param bpage buffer pool page to be written -@param lru true=buf_pool.LRU; false=buf_pool.flush_list +@param space tablespace +@param request asynchronous write request @param size payload size in bytes */ -void buf_dblwr_t::add_to_batch(buf_page_t *bpage, bool lru, size_t size) +void buf_dblwr_t::add_to_batch(fil_space_t *space, const IORequest &request, + size_t size) { - ut_ad(bpage->in_file()); + ut_ad(request.is_async()); + ut_ad(request.is_write()); + ut_ad(request.bpage); + ut_ad(request.bpage->in_file()); + ut_ad(space->id == request.bpage->id().space()); + ut_ad(space->pending_io()); + ut_ad(!srv_read_only_mode); + const ulint buf_size= 2 * block_size(); mysql_mutex_lock(&mutex); @@ -707,13 +711,13 @@ void buf_dblwr_t::add_to_batch(buf_page_t *bpage, bool lru, size_t size) /* We request frame here to get correct buffer in case of encryption and/or page compression */ - void *frame= buf_page_get_frame(bpage); + void *frame= buf_page_get_frame(request.bpage); memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(p, frame, size); - ut_ad(!bpage->zip_size() || bpage->zip_size() == size); + ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size); ut_ad(reserved == first_free); ut_ad(reserved < buf_size); - buf_block_arr[first_free++]= { bpage, lru, size }; + new (buf_block_arr + first_free++) element{space, request, size}; reserved= first_free; if (first_free != buf_size || !flush_buffered_writes(buf_size / 2)) diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index b66f5e39744..19a9e09e4a1 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -626,6 +626,14 @@ buf_load() so all pages from a given tablespace are consecutive. */ ulint cur_space_id = dump[0].space(); fil_space_t* space = fil_space_acquire_silent(cur_space_id); + if (space) { + bool ok = space->acquire_for_io(); + space->release(); + if (!ok) { + space = nullptr; + } + } + ulint zip_size = space ? space->zip_size() : 0; PSI_stage_progress* pfs_stage_progress __attribute__((unused)) @@ -644,24 +652,34 @@ buf_load() } if (this_space_id != cur_space_id) { - if (space != NULL) { - space->release(); + if (space) { + space->release_for_io(); } cur_space_id = this_space_id; space = fil_space_acquire_silent(cur_space_id); - if (space != NULL) { - zip_size = space->zip_size(); + if (!space) { + continue; } + + bool ok = space->acquire_for_io(); + space->release(); + + if (!ok) { + space = nullptr; + continue; + } + + zip_size = space->zip_size(); } /* JAN: TODO: As we use background page read below, if tablespace is encrypted we cant use it. */ - if (space == NULL || - (space && space->crypt_data && - space->crypt_data->encryption != FIL_ENCRYPTION_OFF && - space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) { + if (!space || dump[i].page_no() >= space->get_size() || + (space->crypt_data && + space->crypt_data->encryption != FIL_ENCRYPTION_OFF && + space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) { continue; } @@ -671,11 +689,12 @@ buf_load() continue; } - buf_read_page_background(dump[i], zip_size, true); + space->reacquire_for_io(); + buf_read_page_background(space, dump[i], zip_size, true); if (buf_load_abort_flag) { - if (space != NULL) { - space->release(); + if (space) { + space->release_for_io(); } buf_load_abort_flag = false; ut_free(dump); @@ -702,8 +721,8 @@ buf_load() #endif } - if (space != NULL) { - space->release(); + if (space) { + space->release_for_io(); } ut_free(dump); diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index b69026ef990..25523ab53f1 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -782,6 +782,11 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space) { ut_ad(bpage->in_file()); ut_ad(bpage->ready_for_flush()); + ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == + (space == fil_system.temp_space)); + ut_ad(space->purpose == FIL_TYPE_TABLESPACE || + space->atomic_write_supported); + ut_ad(space->pending_io()); rw_lock_t *rw_lock; @@ -807,11 +812,6 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space) io_fix and oldest_modification()!=0. Thus, it cannot be relocated in the buffer pool or removed from flush_list or LRU_list. */ - ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == - (space == fil_system.temp_space)); - ut_ad(space->purpose == FIL_TYPE_TABLESPACE || - space->atomic_write_supported); - DBUG_PRINT("ib_buf", ("%s %u page %u:%u", lru ? "LRU" : "flush_list", bpage->id().space(), bpage->id().page_no())); @@ -850,82 +850,66 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space) } } - size_t size, orig_size; - ulint type= IORequest::WRITE; - - if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */ - { - ut_ad(!space->full_crc32()); - ut_ad(!space->is_compressed()); /* not page_compressed */ - orig_size= size= bpage->zip_size(); - if (status != buf_page_t::FREED) - { - buf_flush_update_zip_checksum(frame, orig_size); - frame= buf_page_encrypt(space, bpage, frame, &size); - } - ut_ad(size == bpage->zip_size()); - } + if (status == buf_page_t::FREED) + buf_release_freed_page(&block->page); else { - byte *page= block->frame; - orig_size= size= block->physical_size(); + space->reacquire_for_io(); + ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH); + size_t size, orig_size; + IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC; - if (status == buf_page_t::FREED); - else if (space->full_crc32()) + if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */ { - /* innodb_checksum_algorithm=full_crc32 is not implemented for - ROW_FORMAT=COMPRESSED pages. */ - ut_ad(!frame); - page= buf_page_encrypt(space, bpage, page, &size); - buf_flush_init_for_writing(block, page, nullptr, true); + ut_ad(!space->full_crc32()); + ut_ad(!space->is_compressed()); /* not page_compressed */ + orig_size= size= bpage->zip_size(); + buf_flush_update_zip_checksum(frame, size); + frame= buf_page_encrypt(space, bpage, frame, &size); + ut_ad(size == bpage->zip_size()); } else { - buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr, - false); - page= buf_page_encrypt(space, bpage, frame ? frame : page, &size); - } + byte *page= block->frame; + orig_size= size= block->physical_size(); + + if (space->full_crc32()) + { + /* innodb_checksum_algorithm=full_crc32 is not implemented for + ROW_FORMAT=COMPRESSED pages. */ + ut_ad(!frame); + page= buf_page_encrypt(space, bpage, page, &size); + buf_flush_init_for_writing(block, page, nullptr, true); + } + else + { + buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr, + false); + page= buf_page_encrypt(space, bpage, frame ? frame : page, &size); + } #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 - if (size != orig_size && space->punch_hole) - type|= IORequest::PUNCH_HOLE; + if (size != orig_size && space->punch_hole) + type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH; #else DBUG_EXECUTE_IF("ignore_punch_hole", if (size != orig_size && space->punch_hole) - type|= IORequest::PUNCH_HOLE;); + type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;); #endif - frame= page; - } - - IORequest request(type, bpage, lru); + frame=page; + } - ut_ad(status == bpage->status); + ut_ad(status == bpage->status); - switch (status) { - default: - ut_ad(status == buf_page_t::FREED); - buf_release_freed_page(bpage); - break; - case buf_page_t::NORMAL: - if (space->use_doublewrite()) - { - ut_ad(!srv_read_only_mode); - if (lru) - buf_pool.n_flush_LRU++; - else - buf_pool.n_flush_list++; - buf_dblwr.add_to_batch(bpage, lru, size); - break; - } - /* fall through */ - case buf_page_t::INIT_ON_FLUSH: if (lru) buf_pool.n_flush_LRU++; else buf_pool.n_flush_list++; - /* FIXME: pass space to fil_io() */ - fil_io(request, false, bpage->id(), bpage->zip_size(), 0, - bpage->physical_size(), frame, bpage); + if (status != buf_page_t::NORMAL || !space->use_doublewrite()) + space->io(IORequest(type, bpage), + bpage->physical_offset(), size, frame, bpage); + else + buf_dblwr.add_to_batch(space, IORequest(type, bpage), size); } /* Increment the I/O operation count used for selecting LRU policy. */ @@ -973,8 +957,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, ? static_cast<uint32_t>(s) : read_ahead; page_id_t low= id - (id.page_no() % buf_flush_area); page_id_t high= low + buf_flush_area; - high.set_page_no(std::min(high.page_no(), - static_cast<uint32_t>(space.committed_size - 1))); + high.set_page_no(std::min(high.page_no(), space.last_page_number())); if (!contiguous) { @@ -1018,13 +1001,12 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, return i; } +MY_ATTRIBUTE((nonnull)) /** Write punch-hole or zeroes of the freed ranges when innodb_immediate_scrub_data_uncompressed from the freed ranges. -@param[in] space tablespace which contains freed ranges -@param[in] freed_ranges freed ranges of the page to be flushed */ +@param space tablespace which may contain ranges of freed pages */ static void buf_flush_freed_pages(fil_space_t *space) { - ut_ad(space != NULL); const bool punch_hole= space->punch_hole; if (!srv_immediate_scrub_data_uncompressed && !punch_hole) return; @@ -1043,27 +1025,24 @@ static void buf_flush_freed_pages(fil_space_t *space) for (const auto &range : freed_ranges) { - ulint page_size= space->zip_size(); - if (!page_size) - page_size= srv_page_size; + const ulint physical_size= space->physical_size(); if (punch_hole) { - const auto len= (range.last - range.first + 1) * page_size; - const page_id_t page_id(space->id, range.first); - fil_io_t fio= fil_io(IORequestWrite, true, page_id, space->zip_size(), - 0, len, nullptr, nullptr, false, true); - if (fio.node) - fio.node->space->release_for_io(); + space->reacquire_for_io(); + space->io(IORequest(IORequest::PUNCH_RANGE), + os_offset_t{range.first} * physical_size, + (range.last - range.first + 1) * physical_size, + nullptr); } else if (srv_immediate_scrub_data_uncompressed) { - for (auto i= range.first; i <= range.last; i++) + for (os_offset_t i= range.first; i <= range.last; i++) { - const page_id_t page_id(space->id, i); - fil_io(IORequestWrite, false, page_id, space->zip_size(), 0, - space->zip_size() ? space->zip_size() : srv_page_size, - const_cast<byte*>(field_ref_zero), nullptr, false, false); + space->reacquire_for_io(); + space->io(IORequest(IORequest::WRITE_ASYNC), + i * physical_size, physical_size, + const_cast<byte*>(field_ref_zero)); } } buf_pool.stat.n_pages_written+= (range.last - range.first + 1); @@ -1093,7 +1072,8 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, ut_ad(page_id >= id); ut_ad(page_id < high); - for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold) + for (ulint id_fold= id.fold(); id < high && !space->is_stopping(); + ++id, ++id_fold) { if (count + n_flushed >= n_to_flush) { @@ -1190,7 +1170,7 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max) @retval nullptr if the pages for this tablespace should be discarded */ static fil_space_t *buf_flush_space(const uint32_t id) { - fil_space_t *space= fil_space_acquire_for_io(id); + fil_space_t *space= fil_space_t::get_for_io(id); if (space) buf_flush_freed_pages(space); return space; @@ -1204,6 +1184,37 @@ struct flush_counters_t ulint evicted; }; +/** Try to discard a dirty page. +@param bpage dirty page whose tablespace is not accessible */ +static void buf_flush_discard_page(buf_page_t *bpage) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + ut_ad(bpage->in_file()); + ut_ad(bpage->oldest_modification()); + + rw_lock_t *rw_lock; + + if (bpage->state() != BUF_BLOCK_FILE_PAGE) + rw_lock= nullptr; + else + { + rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock; + if (!rw_lock_sx_lock_nowait(rw_lock, 0)) + return; + } + + bpage->status= buf_page_t::NORMAL; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_remove(bpage); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (rw_lock) + rw_lock_sx_unlock(rw_lock); + + buf_LRU_free_page(bpage, true); +} + /** Flush dirty blocks from the end of the LRU list. @param max maximum number of blocks to make available in buf_pool.free @param n counts of flushed and evicted pages */ @@ -1219,6 +1230,9 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN ? 0 : srv_flush_neighbors; fil_space_t *space= nullptr; + uint32_t last_space_id= FIL_NULL; + static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); + static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); bpage && n->flushed + n->evicted < max && @@ -1244,13 +1258,25 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) const uint32_t space_id= page_id.space(); if (!space || space->id != space_id) { - if (space) - space->release_for_io(); - space= buf_flush_space(space_id); - if (!space) - continue; + if (last_space_id != space_id) + { + if (space) + space->release_for_io(); + space= buf_flush_space(space_id); + last_space_id= space_id; + } + else + ut_ad(!space); + } + else if (space->is_stopping()) + { + space->release_for_io(); + space= nullptr; } - if (neighbors && space->is_rotational()) + + if (!space) + buf_flush_discard_page(bpage); + else if (neighbors && space->is_rotational()) { mysql_mutex_unlock(&buf_pool.mutex); n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1, @@ -1328,6 +1354,9 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN ? 0 : srv_flush_neighbors; fil_space_t *space= nullptr; + uint32_t last_space_id= FIL_NULL; + static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); + static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); /* Start from the end of the list looking for a suitable block to be flushed. */ @@ -1361,17 +1390,29 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) const uint32_t space_id= page_id.space(); if (!space || space->id != space_id) { - if (space) - space->release_for_io(); - space= buf_flush_space(space_id); - if (!space) - continue; + if (last_space_id != space_id) + { + if (space) + space->release_for_io(); + space= buf_flush_space(space_id); + last_space_id= space_id; + } + else + ut_ad(!space); } - if (neighbors && space->is_rotational()) + else if (space->is_stopping()) + { + space->release_for_io(); + space= nullptr; + } + + if (!space) + buf_flush_discard_page(bpage); + else if (neighbors && space->is_rotational()) { mysql_mutex_unlock(&buf_pool.mutex); count+= buf_flush_try_neighbors(space, page_id, neighbors == 1, - false, count, max_n); + false, count, max_n); reacquire_mutex: mysql_mutex_lock(&buf_pool.mutex); } @@ -1476,10 +1517,9 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn) while not holding buf_pool.flush_list_mutex */ if (running || !UT_LIST_GET_LEN(buf_pool.flush_list)) { + if (!running) + mysql_cond_broadcast(cond); mysql_mutex_unlock(&buf_pool.mutex); - if (running) - return 0; - mysql_cond_broadcast(cond); return 0; } n_flush++; diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index bc81a8e9b86..daea53ec130 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -261,26 +261,23 @@ flag is cleared and the x-lock released by an i/o-handler thread. @param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED if we are trying to read from a non-existent tablespace +@param[in,out] space tablespace @param[in] sync true if synchronous aio is desired @param[in] mode BUF_READ_IBUF_PAGES_ONLY, ..., @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] unzip true=request uncompressed page -@param[in] ignore whether to ignore out-of-bounds page_id -@return 1 if a read request was queued, 0 if the page already resided -in buf_pool, or if the page is in the doublewrite buffer blocks in -which case it is never read into the pool, or if the tablespace does -not exist or is being dropped */ +@return whether a read request was queued */ static -ulint +bool buf_read_page_low( dberr_t* err, + fil_space_t* space, bool sync, ulint mode, const page_id_t page_id, ulint zip_size, - bool unzip, - bool ignore = false) + bool unzip) { buf_page_t* bpage; @@ -290,17 +287,22 @@ buf_read_page_low( ib::error() << "Trying to read doublewrite buffer page " << page_id; ut_ad(0); - return(0); +nothing_read: + space->release_for_io(); + return false; } - if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) { + if (sync) { + } else if (trx_sys_hdr_page(page_id) + || ibuf_bitmap_page(page_id, zip_size) + || (!recv_no_ibuf_operations + && ibuf_page(page_id, zip_size, nullptr))) { /* Trx sys header is so low in the latching order that we play safe and do not leave the i/o-completion to an asynchronous - i/o-thread. Ibuf bitmap pages must always be read with + i/o-thread. Change buffer pages must always be read with syncronous i/o, to make sure they do not get involved in thread deadlocks. */ - sync = true; } @@ -311,20 +313,19 @@ buf_read_page_low( bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip); if (bpage == NULL) { - - return(0); + goto nothing_read; } - DBUG_LOG("ib_buf", - "read page " << page_id << " zip_size=" << zip_size - << " unzip=" << unzip << ',' << (sync ? "sync" : "async")); - ut_ad(bpage->in_file()); if (sync) { - thd_wait_begin(NULL, THD_WAIT_DISKIO); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); } + DBUG_LOG("ib_buf", + "read page " << page_id << " zip_size=" << zip_size + << " unzip=" << unzip << ',' << (sync ? "sync" : "async")); + void* dst; if (zip_size) { @@ -335,20 +336,18 @@ buf_read_page_low( dst = ((buf_block_t*) bpage)->frame; } - fil_io_t fio = fil_io( - IORequestRead, sync, page_id, zip_size, 0, - zip_size ? zip_size : srv_page_size, - dst, bpage, ignore); + const ulint len = zip_size ? zip_size : srv_page_size; + auto fio = space->io(IORequest(sync + ? IORequest::READ_SYNC + : IORequest::READ_ASYNC), + page_id.page_no() * len, len, dst, bpage); *err= fio.err; if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { - if (ignore || fio.err == DB_TABLESPACE_DELETED) { + if (!sync || fio.err == DB_TABLESPACE_DELETED) { buf_pool.corrupted_evict(bpage); - if (sync && fio.node) { - fio.node->space->release_for_io(); - } - return(0); + return false; } ut_error; @@ -357,16 +356,16 @@ buf_read_page_low( if (sync) { thd_wait_end(NULL); - /* The i/o was already completed in fil_io() */ + /* The i/o was already completed in space->io() */ *err = buf_page_read_complete(bpage, *fio.node); - fio.node->space->release_for_io(); + space->release_for_io(); if (*err != DB_SUCCESS) { - return(0); + return false; } } - return(1); + return true; } /** Applies a random read-ahead in buf_pool if there are at least a threshold @@ -411,7 +410,7 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) ulint count= 5 + buf_read_ahead_area / 8; const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); page_id_t high= low + buf_read_ahead_area; - high.set_page_no(std::min(high.page_no(), space->committed_size - 1)); + high.set_page_no(std::min(high.page_no(), space->last_page_number())); /* Count how many blocks in the area have been recently accessed, that is, reside near the start of the LRU list. */ @@ -427,10 +426,14 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) goto read_ahead; } +no_read_ahead: space->release(); return 0; read_ahead: + if (!space->acquire_for_io()) + goto no_read_ahead; + /* Read all the suitable blocks within the area */ const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; @@ -441,13 +444,16 @@ read_ahead: if (space->is_stopping()) break; dberr_t err; - count+= buf_read_page_low(&err, false, ibuf_mode, i, zip_size, false); + space->reacquire_for_io(); + if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false)) + count++; } if (count) DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", count, space->chain.start->name, low.page_no())); + space->release_for_io(); space->release(); /* Read ahead is considered one I/O operation for the purpose of @@ -472,41 +478,49 @@ after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) { - dberr_t err = DB_SUCCESS; - - ulint count = buf_read_page_low( - &err, true, BUF_READ_ANY_PAGE, page_id, zip_size, false); - - srv_stats.buf_pool_reads.add(count); + fil_space_t *space= fil_space_acquire(page_id.space()); + if (!space) + { + ib::info() << "trying to read page " << page_id + << " in nonexisting or being-dropped tablespace"; + return DB_TABLESPACE_DELETED; + } + else if (!space->acquire_for_io()) + { + ib::warn() << "unable to read " << page_id << " from tablespace " + << space->name; + space->release(); + return DB_PAGE_CORRUPTED; + } - if (err == DB_TABLESPACE_DELETED) { - ib::info() << "trying to read page " << page_id - << " in nonexisting or being-dropped tablespace"; - } + space->release(); - /* Increment number of I/O operations used for LRU policy. */ - buf_LRU_stat_inc_io(); + dberr_t err; + if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE, + page_id, zip_size, false)) + srv_stats.buf_pool_reads.add(1); - return(err); + buf_LRU_stat_inc_io(); + return err; } /** High-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. +@param[in,out] space tablespace @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] sync true if synchronous aio is desired */ -void -buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync) +void buf_read_page_background(fil_space_t *space, const page_id_t page_id, + ulint zip_size, bool sync) { - ulint count; dberr_t err; - count = buf_read_page_low( - &err, sync, - BUF_READ_ANY_PAGE, - page_id, zip_size, false, true); + if (buf_read_page_low(&err, space, sync, BUF_READ_ANY_PAGE, + page_id, zip_size, false)) { + srv_stats.buf_pool_reads.add(1); + } switch (err) { case DB_SUCCESS: @@ -528,8 +542,6 @@ buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync) << page_id; } - srv_stats.buf_pool_reads.add(count); - /* We do not increment number of I/O operations used for LRU policy here (buf_LRU_stat_inc_io()). We use this in heuristics to decide about evicting uncompressed version of compressed pages from the @@ -598,10 +610,19 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) fil_space_t *space= fil_space_acquire(page_id.space()); if (!space) return 0; - if (high_1.page_no() >= space->committed_size) + else { - /* The area is not whole. */ + bool ok= space->acquire_for_io(); space->release(); + if (!ok) + return 0; + } + + if (high_1.page_no() > space->last_page_number()) + { + /* The area is not whole. */ +fail: + space->release_for_io(); return 0; } @@ -628,8 +649,7 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) { hard_fail: hash_lock->read_unlock(); - space->release(); - return 0; + goto fail; } const byte *f; switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) { @@ -661,7 +681,7 @@ hard_fail: if (id != new_low && id != new_high_1) /* This is not a border page of the area: return */ goto hard_fail; - if (new_high_1.page_no() >= space->committed_size) + if (new_high_1.page_no() > space->last_page_number()) /* The area is not whole */ goto hard_fail; } @@ -671,8 +691,7 @@ failed: hash_lock->read_unlock(); if (--count) continue; - space->release(); - return 0; + goto fail; } const unsigned accessed= bpage->is_accessed(); @@ -702,7 +721,8 @@ failed: if (space->is_stopping()) break; dberr_t err; - count+= buf_read_page_low(&err, false, ibuf_mode, new_low, zip_size, + space->reacquire_for_io(); + count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size, false); } @@ -710,7 +730,7 @@ failed: DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", count, space->chain.start->name, new_low.page_no())); - space->release(); + space->release_for_io(); /* Read ahead is considered one I/O operation for the purpose of LRU policy decision. */ @@ -721,24 +741,19 @@ failed: } /** Issues read requests for pages which recovery wants to read in. -@param[in] sync true if the caller wants this function to wait -for the highest address page to get read in, before this function returns @param[in] space_id tablespace id @param[in] page_nos array of page numbers to read, with the highest page number the last in the array @param[in] n number of page numbers in the array */ -void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos, - ulint n) +void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n) { - fil_space_t* space = fil_space_get(space_id); + fil_space_t* space = fil_space_t::get_for_io(space_id); - if (space == NULL) { - /* The tablespace is missing: do nothing */ + if (!space) { + /* The tablespace is missing or unreadable: do nothing */ return; } - fil_space_open_if_needed(space); - const ulint zip_size = space->zip_size(); for (ulint i = 0; i < n; i++) { @@ -769,9 +784,10 @@ void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos, } dberr_t err; - buf_read_page_low( - &err, sync && i + 1 == n, - BUF_READ_ANY_PAGE, cur_page_id, zip_size, true); + space->reacquire_for_io(); + buf_read_page_low(&err, space, false, + BUF_READ_ANY_PAGE, cur_page_id, zip_size, + true); if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) { ib::error() << "Recovery failed to read or decrypt " @@ -779,5 +795,8 @@ void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos, } } - DBUG_PRINT("ib_buf", ("recovery read-ahead (%u pages)", n)); + + DBUG_PRINT("ib_buf", ("recovery read (%u pages) for %s", n, + space->chain.start->name)); + space->release_for_io(); } diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc index 7a27160ccd5..fb3247ecdcf 100644 --- a/storage/innobase/dict/dict0crea.cc +++ b/storage/innobase/dict/dict0crea.cc @@ -951,7 +951,7 @@ void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr) if (fil_space_t* s = fil_space_acquire_silent(space_id)) { /* Ensure that the tablespace file exists in order to avoid a crash in buf_page_get_gen(). */ - if (s->size || fil_space_get_size(space_id)) { + if (root_page_no < s->get_size()) { btr_free_if_exists(page_id_t(space_id, root_page_no), s->zip_size(), mach_read_from_8(ptr), mtr); diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc index 489f4d491d1..753bcf74967 100644 --- a/storage/innobase/dict/dict0load.cc +++ b/storage/innobase/dict/dict0load.cc @@ -2975,15 +2975,15 @@ err_exit: } if (err == DB_SUCCESS && table->is_readable()) { - if (table->space && !fil_space_get_size(table->space_id)) { + const auto root = dict_table_get_first_index(table)->page; + + if (root >= table->space->get_size()) { corrupted: table->corrupted = true; table->file_unreadable = true; err = DB_CORRUPTION; } else { - const page_id_t page_id( - table->space->id, - dict_table_get_first_index(table)->page); + const page_id_t page_id(table->space->id, root); mtr.start(); buf_block_t* block = buf_page_get( page_id, table->space->zip_size(), diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc index 5587355f23a..e73337a3bdd 100644 --- a/storage/innobase/fil/fil0crypt.cc +++ b/storage/innobase/fil/fil0crypt.cc @@ -975,8 +975,7 @@ static inline void fil_crypt_read_crypt_data(fil_space_t* space) { - if (space->crypt_data || space->size - || !fil_space_get_size(space->id)) { + if (space->crypt_data || space->size || !space->get_size()) { /* The encryption metadata has already been read, or the tablespace is not encrypted and the file has been opened already, or the file cannot be accessed, @@ -2246,15 +2245,9 @@ static void fil_crypt_rotation_list_fill() } /* Ensure that crypt_data has been initialized. */ - if (!space->size) { - ut_d(const fil_space_t* s=) - fil_system.read_page0(space->id); - ut_ad(!s || s == space); - if (!space->size) { - /* Page 0 was not loaded. - Skip this tablespace. */ - goto next; - } + if (!space->get_size()) { + /* Page 0 was not loaded. Skip this tablespace. */ + goto next; } /* Skip ENCRYPTION!=DEFAULT tablespaces. */ diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 2da60b079f7..ad9d2828467 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -49,25 +49,81 @@ Created 10/25/1995 Heikki Tuuri #include "os0event.h" #include "sync0sync.h" #include "buf0flu.h" -#include "os0api.h" #ifdef UNIV_LINUX # include <sys/types.h> # include <sys/sysmacros.h> # include <dirent.h> #endif -/** Tries to close a file in the LRU list. The caller must hold the fil_sys -mutex. -@return true if success, false if should retry later; since i/o's -generally complete in < 100 ms, and as InnoDB writes at most 128 pages -from the buffer pool in a batch, and then immediately flushes the -files, there is a good chance that the next time we find a suitable -node from the LRU list. -@param[in] print_info if true, prints information why it - cannot close a file */ -static -bool -fil_try_to_close_file_in_LRU(bool print_info); +/** Determine if the space id is a user tablespace id or not. +@param space_id tablespace identifier +@return true if it is a user tablespace ID */ +inline bool fil_is_user_tablespace_id(ulint space_id) +{ + return space_id != TRX_SYS_SPACE && space_id != SRV_TMP_SPACE_ID && + !srv_is_undo_tablespace(space_id); +} + +/** Try to close a file. +@return true if success, false if should retry later +@param print_info if true, prints information why it cannot close a file */ +static bool fil_try_to_close_file(bool print_info) +{ + ut_ad(mutex_own(&fil_system.mutex)); + for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list); space; + space= UT_LIST_GET_NEXT(space_list, space)) + { + switch (space->purpose) { + case FIL_TYPE_TEMPORARY: + continue; + case FIL_TYPE_IMPORT: + break; + case FIL_TYPE_TABLESPACE: + if (!fil_is_user_tablespace_id(space->id)) + continue; + } + + /* We are using an approximation of LRU replacement policy. In + fil_node_open_file_low(), newly opened files are moved to the end + of fil_system.space_list, so that they would be less likely to be + closed here. */ + fil_node_t *node= UT_LIST_GET_FIRST(space->chain); + ut_ad(node); + ut_ad(!UT_LIST_GET_NEXT(chain, node)); + + if (!node->is_open()) + continue; + + if (auto n= space->set_closing()) + { + if (print_info) + ib::info() << "Cannot close file " << node->name + << " because of " << n << " pending operations"; + continue; + } + + if (auto n= node->n_pending_flushes) + { + if (print_info) + ib::info() << "Cannot close file " << node->name + << ", because n_pending_flushes " << n; + continue; + } + + if (node->needs_flush) + { + if (print_info) + ib::info() << "Cannot close file " << node->name + << ", because is should be flushed first"; + continue; + } + + node->close(); + return true; + } + + return false; +} /** Test if a tablespace file can be renamed to a new filepath by checking if that the old filepath exists and the new filepath does not exist. @@ -143,16 +199,7 @@ from a file, versus reading from a raw disk. To have fast access to a tablespace or a log file, we put the data structures to a hash table. Each tablespace and log file is given an unique 32-bit -identifier. - -Some operating systems do not support many open files at the same time, -though NT seems to tolerate at least 900 open files. Therefore, we put the -open files in an LRU-list. If we need to open another file, we may close the -file at the end of the LRU-list. When an i/o-operation is pending on a file, -the file cannot be closed. We take the file nodes with pending i/o-operations -out of the LRU-list and keep a count of pending operations. When an operation -completes, we decrement the count and return the file node to the LRU-list if -the count drops to zero. */ +identifier. */ /** Reference to the server data directory. Usually it is the current working directory ".", but in the MySQL Embedded Server Library @@ -172,18 +219,6 @@ fil_system_t fil_system; /** At this age or older a space/page will be rotated */ UNIV_INTERN extern uint srv_fil_crypt_rotate_key_age; -/** Determine if the space id is a user tablespace id or not. -@param[in] space_id Space ID to check -@return true if it is a user tablespace ID */ -inline -bool -fil_is_user_tablespace_id(ulint space_id) -{ - return(space_id != TRX_SYS_SPACE - && space_id != SRV_TMP_SPACE_ID - && !srv_is_undo_tablespace(space_id)); -} - #ifdef UNIV_DEBUG /** Try fil_validate() every this many times */ # define FIL_VALIDATE_SKIP 17 @@ -205,43 +240,6 @@ fil_validate_skip(void) } #endif /* UNIV_DEBUG */ -/********************************************************************//** -Determines if a file node belongs to the least-recently-used list. -@return true if the file belongs to fil_system.LRU mutex. */ -UNIV_INLINE -bool -fil_space_belongs_in_lru( -/*=====================*/ - const fil_space_t* space) /*!< in: file space */ -{ - switch (space->purpose) { - case FIL_TYPE_TEMPORARY: - return(false); - case FIL_TYPE_TABLESPACE: - return(fil_is_user_tablespace_id(space->id)); - case FIL_TYPE_IMPORT: - return(true); - } - - ut_ad(0); - return(false); -} - -/********************************************************************//** -NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! - -Prepares a file node for i/o. Opens the file if it is closed. Updates the -pending i/o's field in the node and the system appropriately. Takes the node -off the LRU list if it is in the LRU list. The caller must hold the fil_sys -mutex. -@return false if the file can't be opened, otherwise true */ -static -bool -fil_node_prepare_for_io( -/*====================*/ - fil_node_t* node, /*!< in: file node */ - fil_space_t* space); /*!< in: space */ - /*******************************************************************//** Returns the table space by a given id, NULL if not found. It is unsafe to dereference the returned pointer. It is fine to check @@ -351,7 +349,7 @@ static bool fil_comp_algo_validate(const fil_space_t* space) @param[in] is_raw whether this is a raw device @param[in] atomic_write true if atomic write could be enabled @param[in] max_pages maximum number of pages in file, -or ULINT_MAX for unlimited +or UINT32_MAX for unlimited @return file object */ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle, uint32_t size, bool is_raw, bool atomic_write, @@ -387,114 +385,108 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle, this->size += size; UT_LIST_ADD_LAST(chain, node); if (node->is_open()) { - fil_system.n_open++; + ++fil_system.n_open; } mutex_exit(&fil_system.mutex); return node; } -/** Open a file node of a tablespace. -@param[in,out] node File node -@return false if the file can't be opened, otherwise true */ -static bool fil_node_open_file(fil_node_t* node) +/** Open a tablespace file. +@param node data file +@return whether the file was successfully opened */ +static bool fil_node_open_file_low(fil_node_t *node) { - bool success; - bool read_only_mode; - fil_space_t* space = node->space; - - ut_ad(mutex_own(&fil_system.mutex)); - ut_a(node->n_pending == 0); - ut_a(!node->is_open()); - - read_only_mode = space->purpose != FIL_TYPE_TEMPORARY - && srv_read_only_mode; - - const bool first_time_open = node->size == 0; - - bool o_direct_possible = !FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags); - if (const ulint ssize = FSP_FLAGS_GET_ZIP_SSIZE(space->flags)) { - compile_time_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096); - if (ssize < 3) { - o_direct_possible = false; - } - } - - if (first_time_open - || (space->purpose == FIL_TYPE_TABLESPACE - && node == UT_LIST_GET_FIRST(space->chain) - && srv_startup_is_before_trx_rollback_phase)) { - /* We do not know the size of the file yet. First we - open the file in the normal mode, no async I/O here, - for simplicity. Then do some checks, and close the - file again. NOTE that we could not use the simple - file read function os_file_read() in Windows to read - from a file opened for async I/O! */ - -retry: - node->handle = os_file_create( - innodb_data_file_key, node->name, - node->is_raw_disk - ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT - : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, - OS_FILE_AIO, - o_direct_possible - ? OS_DATA_FILE - : OS_DATA_FILE_NO_O_DIRECT, - read_only_mode, - &success); - - if (!success) { - /* The following call prints an error message */ - ulint err = os_file_get_last_error(true); - if (err == EMFILE + 100) { - if (fil_try_to_close_file_in_LRU(true)) - goto retry; - } - - ib::warn() << "Cannot open '" << node->name << "'." - " Have you deleted .ibd files under a" - " running mysqld server?"; - return(false); - } - - if (!node->read_page0(first_time_open)) { -fail: - os_file_close(node->handle); - node->handle = OS_FILE_CLOSED; - return false; - } + ut_ad(!node->is_open()); + ut_ad(node->space->is_closing()); + ut_ad(mutex_own(&fil_system.mutex)); + const auto flags= node->space->flags; + bool o_direct_possible= !FSP_FLAGS_HAS_PAGE_COMPRESSION(flags); + static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility"); + if (const auto ssize= FSP_FLAGS_GET_ZIP_SSIZE(flags)) + if (ssize < 3) + o_direct_possible= false; + + for (;;) + { + bool success; + node->handle= os_file_create(innodb_data_file_key, node->name, + node->is_raw_disk + ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT + : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_AIO, o_direct_possible + ? OS_DATA_FILE : OS_DATA_FILE_NO_O_DIRECT, + srv_read_only_mode, &success); + if (success) + break; + + /* The following call prints an error message */ + if (os_file_get_last_error(true) == EMFILE + 100 && + fil_try_to_close_file(true)) + continue; - if (first_time_open && !fil_comp_algo_validate(space)) { - goto fail; - } + ib::warn() << "Cannot open '" << node->name << "'."; + return false; + } - } else { - node->handle = os_file_create( - innodb_data_file_key, node->name, - node->is_raw_disk - ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT - : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, - OS_FILE_AIO, - o_direct_possible - ? OS_DATA_FILE - : OS_DATA_FILE_NO_O_DIRECT, - read_only_mode, - &success); - } + if (node->size); + else if (!node->read_page0() || !fil_comp_algo_validate(node->space)) + { + os_file_close(node->handle); + node->handle= OS_FILE_CLOSED; + return false; + } - ut_a(success); - ut_a(node->is_open()); + ut_ad(node->is_open()); - fil_system.n_open++; + if (UNIV_LIKELY(!fil_system.freeze_space_list)) + { + /* Move the file last in fil_system.space_list, so that + fil_try_to_close_file() should close it as a last resort. */ + UT_LIST_REMOVE(fil_system.space_list, node->space); + UT_LIST_ADD_LAST(fil_system.space_list, node->space); + } - if (fil_space_belongs_in_lru(space)) { + fil_system.n_open++; + return true; +} - /* Put the node to the LRU list */ - UT_LIST_ADD_FIRST(fil_system.LRU, node); - } +/** Open a tablespace file. +@param node data file +@return whether the file was successfully opened */ +static bool fil_node_open_file(fil_node_t *node) +{ + ut_ad(mutex_own(&fil_system.mutex)); + ut_ad(!node->is_open()); + ut_ad(fil_is_user_tablespace_id(node->space->id) || + srv_operation == SRV_OPERATION_BACKUP || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_DELTA); + ut_ad(node->space->purpose != FIL_TYPE_TEMPORARY); + ut_ad(node->space->pending_io()); + + for (ulint count= 0; fil_system.n_open >= srv_max_n_open_files; count++) + { + if (fil_try_to_close_file(count > 1)) + count= 0; + else if (count >= 2) + { + ib::warn() << "innodb_open_files=" << srv_max_n_open_files + << " is exceeded (" << fil_system.n_open + << ") files stay open)"; + break; + } + else + { + mutex_exit(&fil_system.mutex); + os_thread_sleep(20000); + /* Flush tablespaces so that we can close modified files. */ + fil_flush_file_spaces(); + mutex_enter(&fil_system.mutex); + } + } - return(true); + return fil_node_open_file_low(node); } /** Close the file handle. */ @@ -520,8 +512,9 @@ pfs_os_file_t fil_node_t::detach() void fil_node_t::prepare_to_close_or_detach() { ut_ad(mutex_own(&fil_system.mutex)); + ut_ad(space->is_closing()); + ut_ad(!space->pending_io()); ut_a(is_open()); - ut_a(n_pending == 0); ut_a(n_pending_flushes == 0); ut_a(!being_extended); ut_a(!needs_flush || space->purpose == FIL_TYPE_TEMPORARY || @@ -529,78 +522,13 @@ void fil_node_t::prepare_to_close_or_detach() ut_a(fil_system.n_open > 0); fil_system.n_open--; - - if (fil_space_belongs_in_lru(space)) - { - ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0); - UT_LIST_REMOVE(fil_system.LRU, this); - } -} - -/** Tries to close a file in the LRU list. The caller must hold the fil_sys -mutex. -@return true if success, false if should retry later; since i/o's -generally complete in < 100 ms, and as InnoDB writes at most 128 pages -from the buffer pool in a batch, and then immediately flushes the -files, there is a good chance that the next time we find a suitable -node from the LRU list. -@param[in] print_info if true, prints information why it - cannot close a file*/ -static -bool -fil_try_to_close_file_in_LRU( - - bool print_info) -{ - fil_node_t* node; - - ut_ad(mutex_own(&fil_system.mutex)); - - if (print_info) { - ib::info() << "fil_sys open file LRU len " - << UT_LIST_GET_LEN(fil_system.LRU); - } - - for (node = UT_LIST_GET_LAST(fil_system.LRU); - node != NULL; - node = UT_LIST_GET_PREV(LRU, node)) { - - if (!node->needs_flush - && node->n_pending_flushes == 0 - && !node->being_extended) { - - node->close(); - - return(true); - } - - if (!print_info) { - continue; - } - - if (const auto n = node->n_pending_flushes) { - ib::info() << "Cannot close file " << node->name - << ", because n_pending_flushes " << n; - } - - if (node->needs_flush) { - ib::warn() << "Cannot close file " << node->name - << ", because is should be flushed first"; - } - - if (node->being_extended) { - ib::info() << "Cannot close file " << node->name - << ", because it is being extended"; - } - } - - return(false); } /** Flush any writes cached by the file system. @param[in,out] space tablespace -@param[in] metadata whether to update file system metadata */ -static void fil_flush_low(fil_space_t* space, bool metadata = false) +@param[in] metadata whether to update file system metadata +@return whether fil_system.mutex was released and reacquired */ +static bool fil_flush_low(fil_space_t* space, bool metadata = false) { ut_ad(mutex_own(&fil_system.mutex)); ut_ad(!space->is_stopping()); @@ -621,9 +549,10 @@ static void fil_flush_low(fil_space_t* space, bool metadata = false) } #endif /* UNIV_DEBUG */ - if (!metadata) return; + if (!metadata) return false; } + bool reacquired = false; /* Prevent dropping of the space while we are flushing */ space->n_pending_flushes++; @@ -653,6 +582,7 @@ static void fil_flush_low(fil_space_t* space, bool metadata = false) mutex_exit(&fil_system.mutex); os_file_flush(node->handle); + reacquired = true; mutex_enter(&fil_system.mutex); @@ -673,6 +603,7 @@ skip_flush: } space->n_pending_flushes--; + return reacquired; } /** Try to extend a tablespace. @@ -692,6 +623,8 @@ fil_space_extend_must_retry( ut_ad(mutex_own(&fil_system.mutex)); ut_ad(UT_LIST_GET_LAST(space->chain) == node); ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE); + ut_ad(node->space == space); + ut_ad(space->pending_io()); *success = space->size >= size; @@ -712,12 +645,6 @@ fil_space_extend_must_retry( node->being_extended = true; - if (!fil_node_prepare_for_io(node, space)) { - /* The tablespace data file, such as .ibd file, is missing */ - node->being_extended = false; - return(false); - } - /* At this point it is safe to release fil_system.mutex. No other thread can rename, delete, close or extend the file because we have set the node->being_extended flag. */ @@ -765,8 +692,6 @@ fil_space_extend_must_retry( const uint32_t pages_in_MiB = node->size & ~uint32_t((1U << (20U - srv_page_size_shift)) - 1); - node->complete_io(); - /* Keep the last data file size info up to date, rounded to full megabytes */ @@ -790,105 +715,56 @@ fil_space_extend_must_retry( } } -/** Acquire fil_system.mutex and try to make sure we can open at least one -file while holding it. This should be called before calling -fil_node_prepare_for_io(), because that function may need to open a file. */ -static -fil_space_t* -fil_mutex_enter_and_prepare_for_io( - ulint space_id) /*!< in: space id */ +/** @return whether the file is usable for io() */ +ATTRIBUTE_COLD bool fil_space_t::prepare_for_io() { - for (ulint count = 0;;) { - mutex_enter(&fil_system.mutex); - - fil_space_t* space = fil_space_get_by_id(space_id); + ut_ad(pending_io()); + mutex_enter(&fil_system.mutex); + fil_node_t *node= UT_LIST_GET_LAST(chain); + ut_ad(!id || purpose == FIL_TYPE_TEMPORARY || + node == UT_LIST_GET_FIRST(chain)); - if (!space) { - return nullptr; - } + const bool is_open= node && (node->is_open() || fil_node_open_file(node)); - fil_node_t* node = UT_LIST_GET_LAST(space->chain); - ut_ad(space->id == 0 - || node == UT_LIST_GET_FIRST(space->chain)); - - if (space->id == 0) { - /* We keep the system tablespace files always - open; this is important in preventing - deadlocks in this module, as a page read - completion often performs another read from - the insert buffer. The insert buffer is in - tablespace 0, and we cannot end up waiting in - this function. */ - } else if (!node || node->is_open()) { - /* If the file is already open, no need to do - anything; if the space does not exist, we handle the - situation in the function which called this - function */ - } else { - while (fil_system.n_open >= srv_max_n_open_files) { - /* Too many files are open */ - if (fil_try_to_close_file_in_LRU(count > 1)) { - /* No problem */ - } else if (count >= 2) { - ib::warn() << "innodb_open_files=" - << srv_max_n_open_files - << " is exceeded (" - << fil_system.n_open - << ") files stay open)"; - break; - } else { - mutex_exit(&fil_system.mutex); - os_thread_sleep(20000); - /* Flush tablespaces so that we can - close modified files in the LRU list */ - fil_flush_file_spaces(); - - count++; - mutex_enter(&fil_system.mutex); - continue; - } - } - } - - uint32_t size = space->recv_size; - if (UNIV_UNLIKELY(size != 0)) { - ut_ad(node); - bool success; - if (fil_space_extend_must_retry(space, node, size, - &success)) { - continue; - } + if (!is_open) + release_for_io(); + else if (auto desired_size= recv_size) + { + bool success; + while (fil_space_extend_must_retry(this, node, desired_size, &success)) + mutex_enter(&fil_system.mutex); - ut_ad(mutex_own(&fil_system.mutex)); - /* Crash recovery requires the file extension - to succeed. */ - ut_a(success); - /* InnoDB data files cannot shrink. */ - ut_a(space->size >= size); - if (size > space->committed_size) { - space->committed_size = size; - } + ut_ad(mutex_own(&fil_system.mutex)); + /* Crash recovery requires the file extension to succeed. */ + ut_a(success); + /* InnoDB data files cannot shrink. */ + ut_a(size >= desired_size); + if (desired_size > committed_size) + committed_size= desired_size; - /* There could be multiple concurrent I/O requests for - this tablespace (multiple threads trying to extend - this tablespace). + /* There could be multiple concurrent I/O requests for this + tablespace (multiple threads trying to extend this tablespace). - Also, fil_space_set_recv_size_and_flags() may have been - invoked again during the file extension while - fil_system.mutex was not being held by us. + Also, fil_space_set_recv_size_and_flags() may have been invoked + again during the file extension while fil_system.mutex was not + being held by us. - Only if space->recv_size matches what we read - originally, reset the field. In this way, a - subsequent I/O request will handle any pending - fil_space_set_recv_size_and_flags(). */ + Only if recv_size matches what we read originally, reset the + field. In this way, a subsequent I/O request will handle any + pending fil_space_set_recv_size_and_flags(). */ - if (size == space->recv_size) { - space->recv_size = 0; - } - } + if (desired_size == recv_size) + { + recv_size= 0; + goto clear; + } + } + else +clear: + n_pending_ios.fetch_and(NOT_CLOSING); - return space; - } + mutex_exit(&fil_system.mutex); + return is_open; } /** Try to extend a tablespace if it is smaller than the specified size. @@ -897,18 +773,20 @@ fil_mutex_enter_and_prepare_for_io( @return whether the tablespace is at least as big as requested */ bool fil_space_extend(fil_space_t *space, uint32_t size) { - ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY); + ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY); + if (!space->acquire_for_io()) + return false; - bool success; + bool success; - do { - fil_mutex_enter_and_prepare_for_io(space->id); - } while (fil_space_extend_must_retry( - space, UT_LIST_GET_LAST(space->chain), size, - &success)); + do + mutex_enter(&fil_system.mutex); + while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain), + size, &success)); - mutex_exit(&fil_system.mutex); - return(success); + mutex_exit(&fil_system.mutex); + space->release_for_io(); + return success; } /** Prepare to free a file from fil_system. */ @@ -927,7 +805,7 @@ pfs_os_file_t fil_node_t::close_to_free(bool detach_handle) fil_system.unflushed_spaces.remove(*space); } - if (n_pending || n_pending_flushes) + if (n_pending_flushes || space->set_closing()) { mutex_exit(&fil_system.mutex); os_thread_sleep(100); @@ -935,11 +813,6 @@ pfs_os_file_t fil_node_t::close_to_free(bool detach_handle) continue; } - if (fil_space_belongs_in_lru(space)) - { - ut_ad(UT_LIST_GET_LEN(fil_system.LRU) > 0); - UT_LIST_REMOVE(fil_system.LRU, this); - } ut_a(!being_extended); if (detach_handle) { @@ -1020,7 +893,7 @@ fil_space_free_low( /* Wait for fil_space_t::release_for_io(); after fil_system_t::detach(), the tablespace cannot be found, so - fil_space_acquire_for_io() would return NULL */ + fil_space_t::get_for_io() would return NULL */ while (space->pending_io()) { os_thread_sleep(100); } @@ -1092,24 +965,19 @@ fil_space_free( return(space != NULL); } -/** Create a space memory object and put it to the fil_system hash table. -Error messages are issued to the server log. -@param[in] name tablespace name -@param[in] id tablespace identifier -@param[in] flags tablespace flags -@param[in] purpose tablespace purpose -@param[in,out] crypt_data encryption information -@param[in] mode encryption mode -@return pointer to created tablespace, to be filled in with fil_space_t::add() -@retval NULL on failure (such as when the same tablespace exists) */ -fil_space_t* -fil_space_create( - const char* name, - ulint id, - ulint flags, - fil_type_t purpose, - fil_space_crypt_t* crypt_data, - fil_encryption_t mode) +/** Create a tablespace in fil_system. +@param name tablespace name +@param id tablespace identifier +@param flags tablespace flags +@param purpose tablespace purpose +@param crypt_data encryption information +@param mode encryption mode +@return pointer to created tablespace, to be filled in with add() +@retval nullptr on failure (such as when the same tablespace exists) */ +fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags, + fil_type_t purpose, + fil_space_crypt_t *crypt_data, + fil_encryption_t mode) { fil_space_t* space; @@ -1119,19 +987,6 @@ fil_space_create( DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL);); - mutex_enter(&fil_system.mutex); - - space = fil_space_get_by_id(id); - - if (space != NULL) { - ib::error() << "Trying to add tablespace '" << name - << "' with id " << id - << " to the tablespace memory cache, but tablespace '" - << space->name << "' already exists in the cache!"; - mutex_exit(&fil_system.mutex); - return(NULL); - } - /* FIXME: if calloc() is defined as an inline function that calls memset() or bzero(), then GCC 6 -flifetime-dse can optimize it away */ space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t; @@ -1141,24 +996,12 @@ fil_space_create( UT_LIST_INIT(space->chain, &fil_node_t::chain); - if ((purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT) - && id > fil_system.max_assigned_id) { - if (!fil_system.space_id_reuse_warned) { - fil_system.space_id_reuse_warned = true; - - ib::warn() << "Allocated tablespace ID " << id - << " for " << name << ", old maximum was " - << fil_system.max_assigned_id; - } - - fil_system.max_assigned_id = id; - } - space->purpose = purpose; space->flags = flags; space->magic_n = FIL_SPACE_MAGIC_N; space->crypt_data = crypt_data; + space->n_pending_ios.store(CLOSING, std::memory_order_relaxed); DBUG_LOG("tablespace", "Created metadata for " << id << " name " << name); @@ -1183,6 +1026,34 @@ fil_space_create( space->atomic_write_supported = true; } + mutex_enter(&fil_system.mutex); + + if (const fil_space_t *old_space = fil_space_get_by_id(id)) { + ib::error() << "Trying to add tablespace '" << name + << "' with id " << id + << " to the tablespace memory cache, but tablespace '" + << old_space->name << "' already exists in the cache!"; + mutex_exit(&fil_system.mutex); + rw_lock_free(&space->latch); + space->~fil_space_t(); + ut_free(space->name); + ut_free(space); + return(NULL); + } + + if ((purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT) + && id > fil_system.max_assigned_id) { + if (!fil_system.space_id_reuse_warned) { + fil_system.space_id_reuse_warned = true; + + ib::warn() << "Allocated tablespace ID " << id + << " for " << name << ", old maximum was " + << fil_system.max_assigned_id; + } + + fil_system.max_assigned_id = id; + } + HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space); UT_LIST_ADD_LAST(fil_system.space_list, space); @@ -1192,6 +1063,17 @@ fil_space_create( fil_system.max_assigned_id = id; } + switch (id) { + case 0: + ut_ad(!fil_system.sys_space); + fil_system.sys_space = space; + break; + case SRV_TMP_SPACE_ID: + ut_ad(!fil_system.temp_space); + fil_system.temp_space = space; + break; + } + /* Inform key rotation that there could be something to do */ if (purpose == FIL_TYPE_TABLESPACE @@ -1261,62 +1143,33 @@ fil_assign_new_space_id( return(success); } -/** Trigger a call to fil_node_t::read_page0() -@param[in] id tablespace identifier -@return tablespace -@retval NULL if the tablespace does not exist or cannot be read */ -fil_space_t* fil_system_t::read_page0(ulint id) +/** Read the first page of a data file. +@return whether the page was found valid */ +bool fil_space_t::read_page0() { - mutex_exit(&mutex); - - ut_ad(id != 0); - - /* It is possible that the tablespace is dropped while we are - not holding the mutex. */ - fil_space_t* space = fil_mutex_enter_and_prepare_for_io(id); - - if (space == NULL || UT_LIST_GET_LEN(space->chain) == 0) { - return(NULL); - } - - /* The following code must change when InnoDB supports - multiple datafiles per tablespace. */ - ut_a(1 == UT_LIST_GET_LEN(space->chain)); - - fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - - /* It must be a single-table tablespace and we have not opened - the file yet; the following calls will open it and update the - size fields */ - - if (!fil_node_prepare_for_io(node, space)) { - /* The single-table tablespace can't be opened, - because the ibd file is missing. */ - return(NULL); - } + ut_ad(fil_system.is_initialised()); + ut_ad(mutex_own(&fil_system.mutex)); + if (size) + return true; - node->complete_io(); + fil_node_t *node= UT_LIST_GET_FIRST(chain); + if (!node) + return false; + ut_ad(!UT_LIST_GET_NEXT(chain, node)); - return space; + n_pending_ios.fetch_add(1, std::memory_order_acquire); + const bool ok= node->is_open() || fil_node_open_file(node); + release_for_io(); + return ok; } -/*******************************************************************//** -Returns a pointer to the fil_space_t that is in the memory cache -associated with a space id. The caller must lock fil_system.mutex. -@return file_space_t pointer, NULL if space not found */ -UNIV_INLINE -fil_space_t* -fil_space_get_space( -/*================*/ - ulint id) /*!< in: space id */ +/** Look up a tablespace and ensure that its first page has been validated. */ +static fil_space_t *fil_space_get_space(ulint id) { - fil_space_t* space = fil_space_get_by_id(id); - if (space == NULL || space->size != 0) { - return(space); - } - - space = fil_system.read_page0(id); - return(space); + if (fil_space_t *space= fil_space_get_by_id(id)) + if (space->read_page0()) + return space; + return nullptr; } void fil_space_set_recv_size_and_flags(ulint id, uint32_t size, uint32_t flags) @@ -1333,53 +1186,52 @@ void fil_space_set_recv_size_and_flags(ulint id, uint32_t size, uint32_t flags) mutex_exit(&fil_system.mutex); } -/*******************************************************************//** -Returns the size of the space in pages. The tablespace must be cached in the -memory cache. -@return space size, 0 if space not found */ -ulint -fil_space_get_size( -/*===============*/ - ulint id) /*!< in: space id */ +/** Open each file. Never invoked on .ibd files. +@param create_new_db whether to skip the call to fil_node_t::read_page0() +@return whether all files were opened */ +bool fil_space_t::open(bool create_new_db) { - fil_space_t* space; - ulint size; + ut_ad(fil_system.is_initialised()); + ut_ad(!id || create_new_db); - ut_ad(fil_system.is_initialised()); - mutex_enter(&fil_system.mutex); - - space = fil_space_get_space(id); - - size = space ? space->size : 0; + bool success= true; + bool skip_read= create_new_db; - mutex_exit(&fil_system.mutex); + mutex_enter(&fil_system.mutex); - return(size); -} + for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + { + if (!node->is_open() && !fil_node_open_file_low(node)) + { +err_exit: + success= false; + break; + } -/** Open each file. Only invoked on fil_system.temp_space. -@return whether all files were opened */ -bool fil_space_t::open() -{ - ut_ad(fil_system.is_initialised()); + if (create_new_db) + continue; + if (skip_read) + { + size+= node->size; + continue; + } - mutex_enter(&fil_system.mutex); - ut_ad(this == fil_system.temp_space - || srv_operation == SRV_OPERATION_BACKUP - || srv_operation == SRV_OPERATION_RESTORE - || srv_operation == SRV_OPERATION_RESTORE_DELTA); + if (!node->read_page0()) + { + fil_system.n_open--; + os_file_close(node->handle); + node->handle= OS_FILE_CLOSED; + goto err_exit; + } - for (fil_node_t* node = UT_LIST_GET_FIRST(chain); - node != NULL; - node = UT_LIST_GET_NEXT(chain, node)) { - if (!node->is_open() && !fil_node_open_file(node)) { - mutex_exit(&fil_system.mutex); - return false; - } - } + skip_read= true; + } - mutex_exit(&fil_system.mutex); - return true; + if (!create_new_db) + committed_size= size; + mutex_exit(&fil_system.mutex); + return success; } /** Close each file. Only invoked on fil_system.temp_space. */ @@ -1491,7 +1343,6 @@ void fil_system_t::create(ulint hash_size) void fil_system_t::close() { ut_ad(this == &fil_system); - ut_a(!UT_LIST_GET_LEN(LRU)); ut_a(unflushed_spaces.empty()); ut_a(!UT_LIST_GET_LEN(space_list)); ut_ad(!sys_space); @@ -1513,67 +1364,6 @@ void fil_system_t::close() #endif /* UNIV_LINUX */ } -/** Opens all system tablespace data files. They stay open until the -database server shutdown. This should be called at a server startup after the -space objects for the system tablespace have been created. The -purpose of this operation is to make sure we never run out of file descriptors -if we need to read from the insert buffer. */ -void -fil_open_system_tablespace_files() -{ - fil_space_t* space; - - mutex_enter(&fil_system.mutex); - - for (space = UT_LIST_GET_FIRST(fil_system.space_list); - space != NULL; - space = UT_LIST_GET_NEXT(space_list, space)) { - - fil_node_t* node; - - if (fil_space_belongs_in_lru(space)) { - - continue; - } - - for (node = UT_LIST_GET_FIRST(space->chain); - node != NULL; - node = UT_LIST_GET_NEXT(chain, node)) { - - if (!node->is_open()) { - if (!fil_node_open_file(node)) { - /* This func is called during server's - startup. If some file of log or system - tablespace is missing, the server - can't start successfully. So we should - assert for it. */ - ut_a(0); - } - } - - if (srv_max_n_open_files < 10 + fil_system.n_open) { - - ib::warn() << "You must raise the value of" - " innodb_open_files in my.cnf!" - " Remember that InnoDB keeps all" - " log files and all system" - " tablespace files open" - " for the whole time mysqld is" - " running, and needs to open also" - " some .ibd files if the" - " file-per-table storage model is used." - " Current open files " - << fil_system.n_open - << ", max allowed open files " - << srv_max_n_open_files - << "."; - } - } - } - - mutex_exit(&fil_system.mutex); -} - /** Close all tablespace files at shutdown */ void fil_close_all_files() { @@ -1605,21 +1395,21 @@ next: } for (ulint count = 10000; count--; ) { + if (!space->set_closing() + && !node->n_pending_flushes) { + node->close(); + goto next; + } mutex_exit(&fil_system.mutex); os_thread_sleep(100); mutex_enter(&fil_system.mutex); if (!node->is_open()) { goto next; } - if (!node->n_pending - && !node->n_pending_flushes) { - node->close(); - goto next; - } } ib::error() << "File '" << node->name - << "' has " << node->n_pending + << "' has " << space->pending_io() << " operations and " << node->n_pending_flushes << " flushes"; @@ -1670,16 +1460,18 @@ fil_write_flushed_lsn( byte* buf; ut_ad(!srv_read_only_mode); - buf = static_cast<byte*>(aligned_malloc(srv_page_size, srv_page_size)); + if (!fil_system.sys_space->acquire_for_io()) { + return DB_ERROR; + } - const page_id_t page_id(TRX_SYS_SPACE, 0); + buf = static_cast<byte*>(aligned_malloc(srv_page_size, srv_page_size)); - fil_io_t fio = fil_io(IORequestRead, true, page_id, 0, 0, - srv_page_size, buf, NULL); + auto fio = fil_system.sys_space->io(IORequestRead, 0, srv_page_size, + buf); if (fio.err == DB_SUCCESS) { - fio.node->space->release_for_io(); - mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, lsn); + mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, + lsn); ulint fsp_flags = mach_read_from_4( buf + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS); @@ -1688,13 +1480,11 @@ fil_write_flushed_lsn( buf_flush_assign_full_crc32_checksum(buf); } - fio = fil_io(IORequestWrite, true, page_id, 0, 0, - srv_page_size, buf, NULL); + fio = fil_system.sys_space->io(IORequestWrite, + 0, srv_page_size, buf); fil_flush_file_spaces(); - } - - if (fio.node) { - fio.node->space->release_for_io(); + } else { + fil_system.sys_space->release_for_io(); } aligned_free(buf); @@ -1735,20 +1525,25 @@ when it could be dropped concurrently. @param[in] id tablespace ID @return the tablespace @retval NULL if missing */ -fil_space_t* -fil_space_acquire_for_io(ulint id) +fil_space_t *fil_space_t::get_for_io(ulint id) { - mutex_enter(&fil_system.mutex); + mutex_enter(&fil_system.mutex); - fil_space_t* space = fil_space_get_by_id(id); + fil_space_t *space= fil_space_get_by_id(id); - if (space) { - space->acquire_for_io(); - } + uint32_t f= space + ? space->n_pending_ios.fetch_add(1, std::memory_order_relaxed) + : 0; - mutex_exit(&fil_system.mutex); + mutex_exit(&fil_system.mutex); - return(space); + if ((f & CLOSING) && !space->prepare_for_io()) + { + // FIXME: issue an error message! + space= nullptr; + } + + return space; } /** Write a log record about a file operation. @@ -1986,12 +1781,12 @@ fil_check_pending_io( /* The following code must change when InnoDB supports multiple datafiles per tablespace. */ - ut_a(UT_LIST_GET_LEN(space->chain) == 1); + ut_ad(UT_LIST_GET_LEN(space->chain) == 1); *node = UT_LIST_GET_FIRST(space->chain); const auto f = space->n_pending_flushes; - const auto p = (*node)->n_pending; + const auto p = space->pending_io(); if (f || p) { ut_a(!(*node)->being_extended); @@ -2105,15 +1900,14 @@ void fil_close_tablespace(ulint id) rw_lock_x_lock(&space->latch); /* Invalidate in the buffer pool all pages belonging to the - tablespace. Since we have set space->stop_new_ops = true, readahead + tablespace. Since we have invoked space->set_stopping(), readahead can no longer read more pages of this tablespace to buf_pool. Thus we can clean the tablespace out of buf_pool - completely and permanently. The flag stop_new_ops also prevents - fil_flush() from being applied to this tablespace. */ + completely and permanently. */ while (buf_flush_dirty_pages(id)); /* Ensure that all asynchronous IO is completed. */ os_aio_wait_until_no_pending_writes(); - fil_flush(id); + ut_ad(space->is_stopping()); /* If the free is successful, the X lock will be released before the space memory data structure is freed. */ @@ -2191,7 +1985,7 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists, when we checked it above. A write request can be issued any time because we don't check - the ::stop_new_ops flag when queueing a block for write. + fil_space_t::is_stopping() when queueing a block for write. We deal with pending write requests in the following function where we'd minimally evict all dirty pages belonging to this @@ -2199,7 +1993,7 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists, we'll wait for IO to complete. To deal with potential read requests, we will check the - ::stop_new_ops flag in fil_io(). */ + is_stopping() in fil_space_t::io(). */ err = DB_SUCCESS; buf_flush_remove_pages(id); @@ -2728,14 +2522,14 @@ err_exit: buf_flush_init_for_writing(NULL, page, &page_zip, false); - *err = os_file_write( - IORequestWrite, path, file, page_zip.data, 0, zip_size); + *err = os_file_write(IORequestWrite, path, file, + page_zip.data, 0, zip_size); } else { buf_flush_init_for_writing(NULL, page, NULL, fil_space_t::full_crc32(flags)); - *err = os_file_write( - IORequestWrite, path, file, page, 0, srv_page_size); + *err = os_file_write(IORequestWrite, path, file, + page, 0, srv_page_size); } aligned_free(page); @@ -2763,9 +2557,9 @@ err_exit: } } - fil_space_t* space = fil_space_create(name, space_id, flags, - FIL_TYPE_TABLESPACE, - crypt_data, mode); + fil_space_t* space = fil_space_t::create(name, space_id, flags, + FIL_TYPE_TABLESPACE, + crypt_data, mode); if (!space) { free(crypt_data); *err = DB_ERROR; @@ -3143,7 +2937,7 @@ skip_validate: first_page) : NULL; - fil_space_t* space = fil_space_create( + fil_space_t* space = fil_space_t::create( tablename.m_name, id, flags, purpose, crypt_data); if (!space) { goto error; @@ -3157,11 +2951,17 @@ skip_validate: df_dict.is_open() ? df_dict.filepath() : df_default.filepath(), OS_FILE_CLOSED, 0, false, true); - if (validate && purpose != FIL_TYPE_IMPORT && !srv_read_only_mode) { + if (validate && !srv_read_only_mode) { df_remote.close(); df_dict.close(); df_default.close(); - fsp_flags_try_adjust(space, flags & ~FSP_FLAGS_MEM_MASK); + if (space->acquire_for_io()) { + if (purpose != FIL_TYPE_IMPORT) { + fsp_flags_try_adjust(space, flags + & ~FSP_FLAGS_MEM_MASK); + } + space->release_for_io(); + } } if (err) *err = DB_SUCCESS; @@ -3491,7 +3291,7 @@ fil_ibd_load( ? fil_space_read_crypt_data(fil_space_t::zip_size(flags), first_page) : NULL; - space = fil_space_create( + space = fil_space_t::create( file.name(), space_id, flags, FIL_TYPE_TABLESPACE, crypt_data); if (space == NULL) { @@ -3557,7 +3357,7 @@ void fsp_flags_try_adjust(fil_space_t* space, ulint flags) return; } if (!space->size && (space->purpose != FIL_TYPE_TABLESPACE - || !fil_space_get_size(space->id))) { + || !space->get_size())) { return; } /* This code is executed during server startup while no @@ -3596,7 +3396,7 @@ func_exit: memory cache. Note that if we have not done a crash recovery at the database startup, there may be many tablespaces which are not yet in the memory cache. @param[in] id Tablespace ID -@param[in] name Tablespace name used in fil_space_create(). +@param[in] name Tablespace name used in fil_space_t::create(). @param[in] table_flags table flags @return the tablespace @retval NULL if no matching tablespace exists in the memory cache */ @@ -3648,281 +3448,159 @@ func_exit: /*============================ FILE I/O ================================*/ -/********************************************************************//** -NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! - -Prepares a file node for i/o. Opens the file if it is closed. Updates the -pending i/o's field in the node and the system appropriately. Takes the node -off the LRU list if it is in the LRU list. The caller must hold the fil_sys -mutex. -@return false if the file can't be opened, otherwise true */ -static -bool -fil_node_prepare_for_io( -/*====================*/ - fil_node_t* node, /*!< in: file node */ - fil_space_t* space) /*!< in: space */ -{ - ut_ad(node && space); - ut_ad(mutex_own(&fil_system.mutex)); - - if (fil_system.n_open > srv_max_n_open_files + 5) { - ib::warn() << "Open files " << fil_system.n_open - << " exceeds the limit " << srv_max_n_open_files; - } - - if (!node->is_open()) { - /* File is closed: open it */ - ut_a(node->n_pending == 0); - - if (!fil_node_open_file(node)) { - return(false); - } - } - - if (node->n_pending++ == 0 && fil_space_belongs_in_lru(space)) { - UT_LIST_REMOVE(fil_system.LRU, node); - } - - return(true); -} - /** Report information about an invalid page access. */ ATTRIBUTE_COLD __attribute__((noreturn)) static void -fil_report_invalid_page_access(const page_id_t id, const char *name, - ulint byte_offset, ulint len, bool is_read) +fil_report_invalid_page_access(const char *name, + os_offset_t offset, ulint len, bool is_read) { - ib::fatal() - << "Trying to " << (is_read ? "read " : "write ") - << id - << " which is outside the bounds of tablespace " << name - << ". Byte offset " << byte_offset << ", len " << len; + ib::fatal() << "Trying to " << (is_read ? "read " : "write ") << len + << " bytes at " << offset + << " outside the bounds of the file: " << name; } -/** Reads or writes data. This operation could be asynchronous (aio). - -@param[in,out] type IO context -@param[in] sync true if synchronous aio is desired -@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] byte_offset remainder of offset in bytes; in aio this - must be divisible by the OS block size -@param[in] len how many bytes to read or write; this must - not cross a file boundary; in aio this must - be a block size multiple -@param[in,out] buf buffer where to store read data or from where - to write; in aio this must be appropriately - aligned -@param[in] message message for aio handler if non-sync aio - used, else ignored -@param[in] ignore whether to ignore errors -@param[in] punch_hole punch the hole to the file for page_compressed - tablespace -@return status and file descriptor */ -fil_io_t -fil_io( - const IORequest& type, - bool sync, - const page_id_t page_id, - ulint zip_size, - ulint byte_offset, - ulint len, - void* buf, - void* message, - bool ignore, - bool punch_hole) + +/** Update the data structures on write completion */ +inline void fil_node_t::complete_write() { - os_offset_t offset; - - ut_ad(type.validate()); - - ut_ad(len > 0); - ut_ad(byte_offset < srv_page_size); - ut_ad(!zip_size || byte_offset == 0); - ut_ad(srv_page_size == 1UL << srv_page_size_shift); - compile_time_assert((1U << UNIV_PAGE_SIZE_SHIFT_MAX) - == UNIV_PAGE_SIZE_MAX); - compile_time_assert((1U << UNIV_PAGE_SIZE_SHIFT_MIN) - == UNIV_PAGE_SIZE_MIN); - ut_ad(fil_validate_skip()); + ut_ad(!mutex_own(&fil_system.mutex)); + ut_ad(space->pending_io()); - /* ibuf bitmap pages must be read in the sync AIO mode: */ - ut_ad(recv_no_ibuf_operations - || type.is_write() - || !ibuf_bitmap_page(page_id, zip_size) - || sync); + if (space->purpose != FIL_TYPE_TEMPORARY && !space->is_stopping() && + srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC) + { + mutex_enter(&fil_system.mutex); + if (!space->is_stopping()) + { + needs_flush= true; - ulint mode; + if (!space->is_in_unflushed_spaces) + { + space->is_in_unflushed_spaces= true; + fil_system.unflushed_spaces.push_front(*space); + } + } + mutex_exit(&fil_system.mutex); + } +#ifdef UNIV_DEBUG + else + { + mutex_enter(&fil_system.mutex); + if (!space->is_stopping()) + { + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(!needs_flush); + } + mutex_exit(&fil_system.mutex); + } +#endif /* UNIV_DEBUG */ +} - if (sync) { - mode = OS_AIO_SYNC; - } else if (type.is_read() - && !recv_no_ibuf_operations - && ibuf_page(page_id, zip_size, NULL)) { - mode = OS_AIO_IBUF; - } else { - mode = OS_AIO_NORMAL; - } +/** Read or write data. +@param type I/O context +@param offset offset in bytes +@param len number of bytes +@param buf the data to be read or written +@param bpage buffer block (for type.is_async() completion callback) +@return status and file descriptor */ +fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len, + void *buf, buf_page_t *bpage) +{ + ut_ad(pending_io()); + ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad((len % OS_FILE_LOG_BLOCK_SIZE) == 0); + ut_ad(fil_validate_skip()); if (type.is_read()) { - srv_stats.data_read.add(len); - - } else if (type.is_write()) { - - ut_ad(!srv_read_only_mode - || fsp_is_system_temporary(page_id.space())); - + } else { + ut_ad(type.is_write() || type.type == IORequest::PUNCH_RANGE); + ut_ad(!srv_read_only_mode || this == fil_system.temp_space); srv_stats.data_written.add(len); } - /* Acquire fil_system.mutex and make sure that we can open at - least one file while holding it, if the file is not already open */ - fil_space_t* space = fil_mutex_enter_and_prepare_for_io( - page_id.space()); - - if (!space - || (type.is_read() - && !sync - && space->is_stopping() - && !space->is_being_truncated)) { - - mutex_exit(&fil_system.mutex); - if (!ignore) { - ib::error() - << "Trying to do I/O to a tablespace which" - " does not exist. I/O type: " - << (type.is_read() ? "read" : "write") - << ", page: " << page_id - << ", I/O length: " << len << " bytes"; - } + fil_node_t* node= UT_LIST_GET_FIRST(chain); + ut_ad(node); + if (type.type == IORequest::READ_ASYNC && is_stopping() + && !is_being_truncated) { + release_for_io(); return {DB_TABLESPACE_DELETED, nullptr}; } - ulint cur_page_no = page_id.page_no(); - fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - - for (;;) { - - if (node == NULL) { - if (ignore) { - mutex_exit(&fil_system.mutex); - return {DB_ERROR, nullptr}; - } - - fil_report_invalid_page_access( - page_id, space->name, byte_offset, len, - type.is_read()); + ulint p = static_cast<ulint>(offset >> srv_page_size_shift); - } else if (fil_is_user_tablespace_id(space->id) - && node->size == 0) { - - /* We do not know the size of a single-table tablespace - before we open the file */ - break; - - } else if (node->size > cur_page_no) { - /* Found! */ - break; - - } else { - cur_page_no -= node->size; + if (UNIV_LIKELY_NULL(UT_LIST_GET_NEXT(chain, node))) { + ut_ad(this == fil_system.sys_space + || this == fil_system.temp_space); + ut_ad(!(offset & ((1 << srv_page_size_shift) - 1))); + while (node->size <= p) { + p -= node->size; node = UT_LIST_GET_NEXT(chain, node); - } - } - - /* Open file if closed */ - if (UNIV_UNLIKELY(!fil_node_prepare_for_io(node, space))) { - ut_ad(fil_is_user_tablespace_id(space->id)); - mutex_exit(&fil_system.mutex); - - if (!ignore) { - ib::error() - << "Trying to do I/O to a tablespace '" - << space->name - << "' which exists without .ibd data file." - " I/O type: " - << (type.is_read() - ? "read" : "write") - << ", page: " - << page_id - << ", I/O length: " << len << " bytes"; + if (!node) { + if (type.type == IORequest::READ_ASYNC) { + release_for_io(); + return {DB_ERROR, nullptr}; + } + fil_report_invalid_page_access(name, offset, + len, + type.is_read()); + } } - return {DB_TABLESPACE_DELETED, nullptr}; + offset = os_offset_t{p} << srv_page_size_shift; } - if (node->size <= cur_page_no) { - if (ignore) { + if (UNIV_UNLIKELY(node->size <= p)) { + if (type.type == IORequest::READ_ASYNC) { + release_for_io(); /* If we can tolerate the non-existent pages, we should return with DB_ERROR and let caller decide what to do. */ - node->complete_io(type.is_write()); - mutex_exit(&fil_system.mutex); return {DB_ERROR, nullptr}; } fil_report_invalid_page_access( - page_id, space->name, byte_offset, len, - type.is_read()); + node->name, offset, len, type.is_read()); } - space->acquire_for_io(); - /* Now we have made the changes in the data structures of fil_system */ - mutex_exit(&fil_system.mutex); - - if (!zip_size) zip_size = srv_page_size; - - offset = os_offset_t(cur_page_no) * zip_size + byte_offset; - ut_ad(node->size - cur_page_no >= (len + (zip_size - 1)) / zip_size); - - /* Do AIO */ - - ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); - - const char* name = node->name == NULL ? space->name : node->name; - - ut_ad(!type.is_write() - || !fil_is_user_tablespace_id(page_id.space()) - || offset == page_id.page_no() * zip_size); - - dberr_t err = DB_SUCCESS; + dberr_t err; - if (punch_hole) { - /* Punch the hole to the file */ + if (type.type == IORequest::PUNCH_RANGE) { err = os_file_punch_hole(node->handle, offset, len); /* Punch hole is not supported, make space not to support punch hole */ if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) { - node->space->punch_hole = false; + punch_hole = false; err = DB_SUCCESS; } + goto release_sync_write; } else { - IORequest req_type(type); - req_type.set_fil_node(node); /* Queue the aio request */ err = os_aio( - req_type, - mode, name, node->handle, buf, offset, len, - space->purpose != FIL_TYPE_TEMPORARY - && srv_read_only_mode, - node, message); + IORequest(type, node), + node->name, node->handle, buf, offset, len, + purpose != FIL_TYPE_TEMPORARY && srv_read_only_mode, + node, bpage); } /* We an try to recover the page from the double write buffer if the decompression fails or the page is corrupt. */ - ut_a(type.is_dblwr_recover() || err == DB_SUCCESS); - if (sync) { - mutex_enter(&fil_system.mutex); - node->complete_io(type.is_write()); - mutex_exit(&fil_system.mutex); + ut_a(type.type == IORequest::DBLWR_RECOVER || err == DB_SUCCESS); + if (!type.is_async()) { + if (type.is_write()) { +release_sync_write: + node->complete_write(); +release: + release_for_io(); + } ut_ad(fil_validate_skip()); } + if (err != DB_SUCCESS) { + goto release; + } return {err, node}; } @@ -3941,8 +3619,6 @@ void fil_aio_callback(os_aio_userdata_t *data) return; } - ut_ad(data->type.validate()); - buf_page_t *bpage= static_cast<buf_page_t*>(data->message); if (!bpage) { @@ -3951,14 +3627,9 @@ void fil_aio_callback(os_aio_userdata_t *data) ut_ad(data->type.is_write()); ut_ad(!srv_read_only_mode); write_completed: - mutex_enter(&fil_system.mutex); - node->complete_io(true); - mutex_exit(&fil_system.mutex); - node->space->release_for_io(); - return; + node->complete_write(); } - - if (data->type.is_write()) + else if (data->type.is_write()) { ut_ad(!srv_read_only_mode || node->space->purpose == FIL_TYPE_TEMPORARY); bool dblwr= node->space->use_doublewrite(); @@ -3970,111 +3641,68 @@ write_completed: buf_page_write_complete(bpage, data->type, dblwr); goto write_completed; } + else + { + ut_ad(data->type.is_read()); - ut_ad(data->type.is_read()); - - /* IMPORTANT: since i/o handling for reads will read also the insert - buffer in fil_system.sys_space, we have to be very careful not to - introduce deadlocks. We never close the system tablespace (0) data - files via fil_system.LRU and we use a dedicated I/O thread to serve - change buffer requests. */ - const page_id_t id(bpage->id()); + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in fil_system.sys_space, we have to be very careful not to + introduce deadlocks. We never close the system tablespace (0) data + files via fil_system.LRU and we never issue asynchronous reads of + change buffer pages. */ + const page_id_t id(bpage->id()); - if (dberr_t err= buf_page_read_complete(bpage, *node)) - { - if (recv_recovery_is_on() && !srv_force_recovery) - recv_sys.found_corrupt_fs= true; + if (dberr_t err= buf_page_read_complete(bpage, *node)) + { + if (recv_recovery_is_on() && !srv_force_recovery) + recv_sys.found_corrupt_fs= true; - ib::error() << "Failed to read page " << id.page_no() - << " from file '" << node->name << "': " << err; + ib::error() << "Failed to read page " << id.page_no() + << " from file '" << node->name << "': " << err; + } } - mutex_enter(&fil_system.mutex); - node->complete_io(); - mutex_exit(&fil_system.mutex); node->space->release_for_io(); } -/**********************************************************************//** -Flushes to disk possible writes cached by the OS. If the space does not exist -or is being dropped, does not do anything. */ -void -fil_flush( -/*======*/ - ulint space_id) /*!< in: file space id (this can be a group of - log files or a tablespace of the database) */ +/** Flush pending writes from the file system cache to the file */ +void fil_space_t::flush() { - mutex_enter(&fil_system.mutex); - - if (fil_space_t* space = fil_space_get_by_id(space_id)) { - if (space->purpose != FIL_TYPE_TEMPORARY - && !space->is_stopping()) { - fil_flush_low(space); - } - } - - mutex_exit(&fil_system.mutex); -} - -/** Flush a tablespace. -@param[in,out] space tablespace to flush */ -void -fil_flush(fil_space_t* space) -{ - ut_ad(space->pending_io()); - ut_ad(space->purpose == FIL_TYPE_TABLESPACE - || space->purpose == FIL_TYPE_IMPORT); - - if (!space->is_stopping()) { - mutex_enter(&fil_system.mutex); - if (!space->is_stopping()) { - fil_flush_low(space); - } - mutex_exit(&fil_system.mutex); - } + ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT); + if (!is_stopping()) + { + mutex_enter(&fil_system.mutex); + if (!is_stopping()) + fil_flush_low(this); + mutex_exit(&fil_system.mutex); + } } /** Flush to disk the writes in file spaces of the given type possibly cached by the OS. */ void fil_flush_file_spaces() { - ulint* space_ids; - ulint n_space_ids; - - mutex_enter(&fil_system.mutex); - - n_space_ids = fil_system.unflushed_spaces.size(); - if (n_space_ids == 0) { - - mutex_exit(&fil_system.mutex); + if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) { + ut_d(mutex_enter(&fil_system.mutex)); + ut_ad(fil_system.unflushed_spaces.empty()); + ut_d(mutex_exit(&fil_system.mutex)); return; } - space_ids = static_cast<ulint*>( - ut_malloc_nokey(n_space_ids * sizeof(*space_ids))); - - n_space_ids = 0; +rescan: + mutex_enter(&fil_system.mutex); for (sized_ilist<fil_space_t, unflushed_spaces_tag_t>::iterator it = fil_system.unflushed_spaces.begin(), end = fil_system.unflushed_spaces.end(); it != end; ++it) { - - if (it->purpose == FIL_TYPE_TABLESPACE && !it->is_stopping()) { - space_ids[n_space_ids++] = it->id; + if (!it->is_stopping() && fil_flush_low(&*it)) { + mutex_exit(&fil_system.mutex); + goto rescan; } } mutex_exit(&fil_system.mutex); - - /* Flush the spaces. It will not hurt to call fil_flush() on - a non-existing space id. */ - for (ulint i = 0; i < n_space_ids; i++) { - - fil_flush(space_ids[i]); - } - - ut_free(space_ids); } /** Functor to validate the file node list of a tablespace. */ @@ -4091,7 +3719,6 @@ struct Check { @param[in] elem file node to visit */ void operator()(const fil_node_t* elem) { - ut_a(elem->is_open() || !elem->n_pending); n_open += elem->is_open(); size += elem->size; } @@ -4128,7 +3755,6 @@ Checks the consistency of the tablespace cache. @return true if ok */ bool fil_validate() { - fil_node_t* fil_node; ulint n_open = 0; mutex_enter(&fil_system.mutex); @@ -4141,18 +3767,6 @@ bool fil_validate() ut_a(fil_system.n_open == n_open); - ut_list_validate(fil_system.LRU); - - for (fil_node = UT_LIST_GET_FIRST(fil_system.LRU); - fil_node != 0; - fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) { - - ut_a(fil_node->n_pending == 0); - ut_a(!fil_node->being_extended); - ut_a(fil_node->is_open()); - ut_a(fil_space_belongs_in_lru(fil_node->space)); - } - mutex_exit(&fil_system.mutex); return(true); diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc index e8fc47f3e41..57164113647 100644 --- a/storage/innobase/fsp/fsp0file.cc +++ b/storage/innobase/fsp/fsp0file.cc @@ -296,8 +296,6 @@ Datafile::read_first_page(bool read_only_mode) m_first_page = static_cast<byte*>( aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size)); - constexpr IORequest request(IORequest::READ | - IORequest::DISABLE_PARTIAL_IO_WARNINGS); dberr_t err = DB_ERROR; size_t page_size = UNIV_PAGE_SIZE_MAX; @@ -308,7 +306,8 @@ Datafile::read_first_page(bool read_only_mode) ulint n_read = 0; err = os_file_read_no_error_handling( - request, m_handle, m_first_page, 0, page_size, &n_read); + IORequestReadPartial, m_handle, m_first_page, 0, + page_size, &n_read); if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) { diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc index 1ed4af86367..b0a80efe7c4 100644 --- a/storage/innobase/fsp/fsp0space.cc +++ b/storage/innobase/fsp/fsp0space.cc @@ -130,7 +130,7 @@ Tablespace::open_or_create(bool is_temp) fsp_flags = FSP_FLAGS_PAGE_SSIZE(); } - space = fil_space_create( + space = fil_space_t::create( m_name, m_space_id, fsp_flags, is_temp ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc index f8342157560..a2c9e1bc688 100644 --- a/storage/innobase/fsp/fsp0sysspace.cc +++ b/storage/innobase/fsp/fsp0sysspace.cc @@ -906,13 +906,10 @@ SysTablespace::open_or_create( if (it != begin) { } else if (is_temp) { ut_ad(space_id() == SRV_TMP_SPACE_ID); - space = fil_space_create( + space = fil_space_t::create( name(), SRV_TMP_SPACE_ID, flags(), FIL_TYPE_TEMPORARY, NULL); - - mutex_enter(&fil_system.mutex); - fil_system.temp_space = space; - mutex_exit(&fil_system.mutex); + ut_ad(space == fil_system.temp_space); if (!space) { return DB_ERROR; } @@ -920,12 +917,10 @@ SysTablespace::open_or_create( ut_ad(space->full_crc32()); } else { ut_ad(space_id() == TRX_SYS_SPACE); - space = fil_space_create( + space = fil_space_t::create( name(), TRX_SYS_SPACE, it->flags(), FIL_TYPE_TABLESPACE, NULL); - mutex_enter(&fil_system.mutex); - fil_system.sys_space = space; - mutex_exit(&fil_system.mutex); + ut_ad(space == fil_system.sys_space); if (!space) { return DB_ERROR; } diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index de4195b5727..e7e66bb0e8d 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -7044,6 +7044,7 @@ i_s_tablespaces_encryption_fill_table( } mutex_enter(&fil_system.mutex); + fil_system.freeze_space_list++; for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list); space; space = UT_LIST_GET_NEXT(space_list, space)) { @@ -7060,6 +7061,7 @@ i_s_tablespaces_encryption_fill_table( } } + fil_system.freeze_space_list--; mutex_exit(&fil_system.mutex); DBUG_RETURN(0); } diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc index 494ae2798ee..9e9bc241828 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -2300,7 +2300,7 @@ static void ibuf_read_merge_pages(const uint32_t* space_ids, for (ulint i = 0; i < n_stored; i++) { const ulint space_id = space_ids[i]; - fil_space_t* s = fil_space_acquire_for_io(space_id); + fil_space_t* s = fil_space_t::get_for_io(space_id); if (!s) { tablespace_deleted: /* The tablespace was not found: remove all @@ -4631,26 +4631,14 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space) const unsigned zip_size = space->zip_size(); const unsigned physical_size = space->physical_size(); - /* fil_space_t::size and fil_space_t::free_limit would still be 0 - at this point. So, we will have to read page 0. */ - ut_ad(!space->free_limit); - ut_ad(!space->size); - mtr_t mtr; - uint32_t size; - mtr.start(); - if (buf_block_t* sp = buf_page_get(page_id_t(space->id, 0), - zip_size, - RW_S_LATCH, &mtr)) { - size = std::min( - mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT - + sp->frame), - mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE - + sp->frame)); - } else { - size = 0; + uint32_t size= std::min(space->free_limit, space->size); + + if (size == 0) { + return(DB_TABLE_NOT_FOUND); } - mtr.commit(); + + mtr_t mtr; mutex_enter(&ibuf_mutex); diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index d2b52c4f520..d8e152f1ffa 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -978,6 +978,15 @@ public: return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0; } + /** @return the byte offset of the page within a file */ + os_offset_t physical_offset() const + { + os_offset_t o= id().page_no(); + return zip.ssize + ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1)) + : o << srv_page_size_shift; + } + /** @return whether the block is mapped to a data file */ bool in_file() const { diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index 1b9415d38be..aac4715250d 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -52,10 +52,10 @@ class buf_dblwr_t struct element { - /** block descriptor */ - buf_page_t *bpage; - /** true=buf_pool.flush_list, false=buf_pool.LRU */ - bool lru; + /** tablespace */ + fil_space_t *space; + /** asynchronous write request */ + IORequest request; /** payload size in bytes */ size_t size; }; @@ -103,10 +103,11 @@ public: /** Schedule a page write. If the doublewrite memory buffer is full, flush_buffered_writes() will be invoked to make space. - @param bpage buffer pool page to be written - @param lru true=buf_pool.LRU; false=buf_pool.flush_list + @param space tablespace + @param request asynchronous write request @param size payload size in bytes */ - void add_to_batch(buf_page_t *bpage, bool lru, size_t size); + void add_to_batch(fil_space_t *space, const IORequest &request, + size_t size) MY_ATTRIBUTE((nonnull)); /** Determine whether the doublewrite buffer is initialized */ bool is_initialised() const diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h index e111bbd7a02..87c6b5d7e75 100644 --- a/storage/innobase/include/buf0rea.h +++ b/storage/innobase/include/buf0rea.h @@ -46,11 +46,13 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size); buffer buf_pool if it is not already there. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. +@param[in,out] space tablespace @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @param[in] sync true if synchronous aio is desired */ -void -buf_read_page_background(const page_id_t page_id, ulint zip_size, bool sync); +void buf_read_page_background(fil_space_t *space, const page_id_t page_id, + ulint zip_size, bool sync) + MY_ATTRIBUTE((nonnull)); /** Applies a random read-ahead in buf_pool if there are at least a threshold value of accessed pages from the random read-ahead area. Does not read any @@ -101,14 +103,11 @@ ulint buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf); /** Issues read requests for pages which recovery wants to read in. -@param[in] sync true if the caller wants this function to wait -for the highest address page to get read in, before this function returns @param[in] space_id tablespace id @param[in] page_nos array of page numbers to read, with the highest page number the last in the array @param[in] n number of page numbers in the array */ -void buf_read_recv_pages(bool sync, ulint space_id, const uint32_t *page_nos, - ulint n); +void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n); /** @name Modes used in read-ahead @{ */ /** read only pages belonging to the insert buffer tree */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 0fa0c0b598b..57e5c43199b 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -313,6 +313,25 @@ new_range: /** Tablespace or log data space */ #ifndef UNIV_INNOCHECKSUM +struct fil_io_t +{ + /** error code */ + dberr_t err; + /** file; node->space->release_for_io() must follow IORequestRead call */ + fil_node_t *node; +}; + +/** Tablespace encryption mode */ +enum fil_encryption_t +{ + /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */ + FIL_ENCRYPTION_DEFAULT, + /** Encrypted */ + FIL_ENCRYPTION_ON, + /** Not encrypted */ + FIL_ENCRYPTION_OFF +}; + struct fil_space_t : ilist_node<unflushed_spaces_tag_t>, ilist_node<rotation_list_tag_t> #else @@ -348,8 +367,6 @@ struct fil_space_t /*!< recovered tablespace size in pages; 0 if no size change was read from the redo log, or if the size change was implemented */ - /** the committed size of the tablespace in pages */ - Atomic_relaxed<uint32_t> committed_size; ulint n_reserved_extents; /*!< number of reserved free extents for ongoing operations like B-tree page split */ @@ -357,28 +374,33 @@ struct fil_space_t the tablespace to disk; dropping of the tablespace is forbidden if this is positive */ private: + /** the committed size of the tablespace in pages */ + Atomic_relaxed<uint32_t> committed_size; /** Number of pending buffer pool operations accessing the tablespace without holding a table lock or dict_operation_lock S-latch that would prevent the table (and tablespace) from being dropped. An example is encryption key rotation. - The tablespace cannot be dropped while this is nonzero, or while - fil_node_t::n_pending is nonzero. + The tablespace cannot be dropped while this is nonzero. The most significant bit contains the STOP_NEW_OPS flag. */ - Atomic_relaxed<size_t> n_pending_ops; + Atomic_relaxed<uint32_t> n_pending_ops; + /** Number of pending block read or write operations + The tablespace object cannot be freed while this is nonzero, + but it can be detached from fil_system. + + The most significant bit contains the CLOSING flag. */ + std::atomic<uint32_t> n_pending_ios; /** Flag in n_pending_ops that indicates that the tablespace is being deleted, and no further operations should be performed */ static constexpr uint32_t STOP_NEW_OPS= ~(~uint32_t(0) >> 1); + /** Flag in n_pending_ios that indicates that the tablespace is a candidate + for being closed, and fil_node_t::is_open() can only be trusted after + acquiring fil_system.mutex and resetting the flag */ + static constexpr uint32_t CLOSING= STOP_NEW_OPS; + static constexpr uint32_t NOT_CLOSING= ~CLOSING; public: - /** Number of pending block read or write operations - (when a write is imminent or a read has recently completed). - The tablespace object cannot be freed while this is nonzero, - but it can be detached from fil_system. - Note that fil_node_t::n_pending tracks actual pending I/O requests. - Protected by fil_system.mutex and std::atomic. */ - std::atomic<ulint> n_pending_ios; rw_lock_t latch; /*!< latch protecting the file space storage allocation */ UT_LIST_NODE_T(fil_space_t) named_spaces; @@ -476,19 +498,20 @@ public: dberr_t rename(const char* name, const char* path, bool log, bool replace = false); - /** Note that the tablespace has been imported. - Initially, purpose=FIL_TYPE_IMPORT so that no redo log is - written while the space ID is being updated in each page. */ - inline void set_imported(); + /** Note that the tablespace has been imported. + Initially, purpose=FIL_TYPE_IMPORT so that no redo log is + written while the space ID is being updated in each page. */ + inline void set_imported(); - /** @return whether the storage device is rotational (HDD, not SSD) */ - inline bool is_rotational() const; + /** @return whether the storage device is rotational (HDD, not SSD) */ + inline bool is_rotational() const; - /** Open each file. Only invoked on fil_system.temp_space. - @return whether all files were opened */ - bool open(); - /** Close each file. Only invoked on fil_system.temp_space. */ - void close(); + /** Open each file. Never invoked on .ibd files. + @param create_new_db whether to skip the call to fil_node_t::read_page0() + @return whether all files were opened */ + bool open(bool create_new_db); + /** Close each file. Only invoked on fil_system.temp_space. */ + void close(); /** @return whether the tablespace is about to be dropped */ bool is_stopping() const { return n_pending_ops & STOP_NEW_OPS; } @@ -497,17 +520,13 @@ public: size_t referenced() const { return n_pending_ops & ~STOP_NEW_OPS; } /** Note that operations on the tablespace must stop or can resume */ - void set_stopping(bool stopping) - { - ut_d(auto n=) n_pending_ops.fetch_xor(STOP_NEW_OPS); - ut_ad(!(n & STOP_NEW_OPS) == stopping); - } + inline void set_stopping(bool stopping); MY_ATTRIBUTE((warn_unused_result)) /** @return whether a tablespace reference was successfully acquired */ bool acquire() { - size_t n= 0; + uint32_t n= 0; while (!n_pending_ops.compare_exchange_strong(n, n + 1, std::memory_order_acquire, std::memory_order_relaxed)) @@ -523,30 +542,41 @@ public: ut_ad(n & ~STOP_NEW_OPS); return (n & ~STOP_NEW_OPS) == 1; } - /** Acquire a tablespace reference for I/O. */ - void acquire_for_io() { n_pending_ios++; } + + MY_ATTRIBUTE((warn_unused_result)) + /** Acquire a tablespace reference for I/O. + @return whether the file is usable */ + bool acquire_for_io() + { + return UNIV_LIKELY(!(n_pending_ios.fetch_add(1, std::memory_order_acquire)& + CLOSING)) || + prepare_for_io(); + } + + /** Acquire another tablespace reference for I/O. */ + inline void reacquire_for_io(); + /** Release a tablespace reference for I/O. */ - void release_for_io() { ut_d(auto n=) n_pending_ios--; ut_ad(n); } - /** @return whether I/O is pending */ - bool pending_io() const { return n_pending_ios; } + void release_for_io() + { + ut_d(uint32_t n=) n_pending_ios.fetch_sub(1, std::memory_order_release); + ut_ad(n & NOT_CLOSING); + } + /** @return number of pending reads or writes */ + uint32_t pending_io() const + { return n_pending_ios.load(std::memory_order_acquire) & NOT_CLOSING; } - /** @return whether the tablespace file can be closed and reopened */ - bool belongs_in_lru() const + MY_ATTRIBUTE((warn_unused_result)) + /** Prepare to close the file handle. + @return number of pending operations */ + uint32_t set_closing() { - switch (purpose) { - case FIL_TYPE_TEMPORARY: - ut_ad(id == SRV_TMP_SPACE_ID); - return false; - case FIL_TYPE_IMPORT: - ut_ad(id != SRV_TMP_SPACE_ID); - return true; - case FIL_TYPE_TABLESPACE: - ut_ad(id != SRV_TMP_SPACE_ID); - return id && !srv_is_undo_tablespace(id); - } - ut_ad(0); - return false; + return n_pending_ios.fetch_or(CLOSING, std::memory_order_acquire) & + NOT_CLOSING; } + /** @return whether close() of the file handle has been requested */ + bool is_closing() const + { return n_pending_ios.load(std::memory_order_acquire) & CLOSING; } /** @return last_freed_lsn */ lsn_t get_last_freed_lsn() { return last_freed_lsn; } @@ -835,6 +865,25 @@ public: } #ifndef UNIV_INNOCHECKSUM + MY_ATTRIBUTE((warn_unused_result)) + /** Create a tablespace in fil_system. + @param name tablespace name + @param id tablespace identifier + @param flags tablespace flags + @param purpose tablespace purpose + @param crypt_data encryption information + @param mode encryption mode + @return pointer to created tablespace, to be filled in with add() + @retval nullptr on failure (such as when the same tablespace exists) */ + static fil_space_t *create(const char *name, ulint id, ulint flags, + fil_type_t purpose, fil_space_crypt_t *crypt_data, + fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT); + + /** Acquire a tablespace for reading or writing a block. + @param id tablespace ID + @return the tablespace, or nullptr if missing or inaccessible */ + static fil_space_t *get_for_io(ulint id); + /** Add/remove the free page in the freed ranges list. @param[in] offset page number to be added @param[in] free true if page to be freed */ @@ -863,8 +912,47 @@ public: std::lock_guard<std::mutex> freed_lock(freed_range_mutex); freed_ranges.add_range(range); } -#endif /*!UNIV_INNOCHECKSUM */ + /** Set the tablespace size in pages */ + void set_sizes(uint32_t s) + { + ut_ad(id ? !size : (size >= s)); + size= s; committed_size= s; + } + + /** Update committed_size in mtr_t::commit() */ + void set_committed_size() + { + ut_ad(rw_lock_own(&latch, RW_LOCK_X)); + committed_size= size; + } + + /** @return the last persisted page number */ + uint32_t last_page_number() const { return committed_size - 1; } + + /** @return the size in pages (0 if unreadable) */ + inline uint32_t get_size(); + + /** Read or write data. + @param type I/O context + @param offset offset in bytes + @param len number of bytes + @param buf the data to be read or written + @param bpage buffer block (for type.is_async() completion callback) + @return status and file descriptor */ + fil_io_t io(const IORequest &type, os_offset_t offset, size_t len, + void *buf, buf_page_t *bpage= nullptr); + /** Flush pending writes from the file system cache to the file */ + void flush(); + + /** Read the first page of a data file. + @return whether the page was found valid */ + bool read_page0(); + +private: + /** @return whether the file is usable for io() */ + ATTRIBUTE_COLD bool prepare_for_io(); +#endif /*!UNIV_INNOCHECKSUM */ }; #ifndef UNIV_INNOCHECKSUM @@ -892,8 +980,6 @@ struct fil_node_t { uint32_t init_size; /** maximum size of the file in database pages (0 if unlimited) */ uint32_t max_size; - /** count of pending i/o's; is_open must be true if nonzero */ - ulint n_pending; /** count of pending flushes; is_open must be true if nonzero */ ulint n_pending_flushes; /** whether the file is currently being extended */ @@ -902,8 +988,6 @@ struct fil_node_t { bool needs_flush; /** link to other files in this tablespace */ UT_LIST_NODE_T(fil_node_t) chain; - /** link to the fil_system.LRU list (keeping track of open files) */ - UT_LIST_NODE_T(fil_node_t) LRU; /** whether this file could use atomic write (data file) */ bool atomic_write; @@ -921,9 +1005,8 @@ struct fil_node_t { } /** Read the first page of a data file. - @param[in] first whether this is the very first read @return whether the page was found valid */ - bool read_page0(bool first); + bool read_page0(); /** Determine some file metadata when creating or reading the file. @param file the file that is being created, or OS_FILE_CLOSED */ @@ -942,8 +1025,8 @@ struct fil_node_t { @return detached handle or OS_FILE_CLOSED */ pfs_os_file_t close_to_free(bool detach_handle= false); - /** Update the data structures on I/O completion */ - inline void complete_io(bool write= false); + /** Update the data structures on write completion */ + inline void complete_write(); private: /** Does stuff common for close() and detach() */ @@ -953,22 +1036,27 @@ private: /** Value of fil_node_t::magic_n */ #define FIL_NODE_MAGIC_N 89389 +inline void fil_space_t::reacquire_for_io() +{ + ut_d(uint32_t n=) n_pending_ios.fetch_add(1, std::memory_order_relaxed); + ut_ad(n & NOT_CLOSING); + ut_ad(UT_LIST_GET_FIRST(chain)->is_open()); +} + inline void fil_space_t::set_imported() { - ut_ad(purpose == FIL_TYPE_IMPORT); - purpose = FIL_TYPE_TABLESPACE; - UT_LIST_GET_FIRST(chain)->find_metadata(); + ut_ad(purpose == FIL_TYPE_IMPORT); + purpose= FIL_TYPE_TABLESPACE; + UT_LIST_GET_FIRST(chain)->find_metadata(); } inline bool fil_space_t::is_rotational() const { - for (const fil_node_t* node = UT_LIST_GET_FIRST(chain); node; - node = UT_LIST_GET_NEXT(chain, node)) { - if (!node->on_ssd) { - return true; - } - } - return false; + for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + if (!node->on_ssd) + return true; + return false; } /** Common InnoDB file extensions */ @@ -1179,16 +1267,6 @@ index */ #define fil_page_index_page_check(page) \ fil_page_type_is_index(fil_page_get_type(page)) -/** Enum values for encryption table option */ -enum fil_encryption_t { - /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */ - FIL_ENCRYPTION_DEFAULT, - /** Encrypted */ - FIL_ENCRYPTION_ON, - /** Not encrypted */ - FIL_ENCRYPTION_OFF -}; - /** Get the file page type. @param[in] page file page @return page type */ @@ -1227,7 +1305,6 @@ struct fil_system_t { */ fil_system_t(): m_initialised(false) { - UT_LIST_INIT(LRU, &fil_node_t::LRU); UT_LIST_INIT(space_list, &fil_space_t::space_list); UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces); } @@ -1275,30 +1352,23 @@ public: fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ /** Map of fil_space_t::id to fil_space_t* */ hash_table_t spaces; - UT_LIST_BASE_NODE_T(fil_node_t) LRU; - /*!< base node for the LRU list of the - most recently used open files with no - pending i/o's; if we start an i/o on - the file, we first remove it from this - list, and return it to the start of - the list when the i/o ends; - log files and the system tablespace are - not put to this list: they are opened - after the startup, and kept open until - shutdown */ sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces; /*!< list of those tablespaces whose files contain unflushed writes; those spaces have at least one file node where needs_flush == true */ - ulint n_open; /*!< number of files currently open */ + /** number of currently open files; protected by mutex */ + ulint n_open; ulint max_assigned_id;/*!< maximum space id in the existing tables, or assigned during the time mysqld has been up; at an InnoDB startup we scan the data dictionary and set here the maximum of the space id's of the tables there */ + /** nonzero if fil_node_open_file_low() should avoid moving the tablespace + to the end of space_list, for FIFO policy of try_to_close() */ + ulint freeze_space_list; UT_LIST_BASE_NODE_T(fil_space_t) space_list; /*!< list of all file spaces */ UT_LIST_BASE_NODE_T(fil_space_t) named_spaces; @@ -1312,16 +1382,10 @@ public: key rotation.*/ bool space_id_reuse_warned; - /*!< whether fil_space_create() + /*!< whether fil_space_t::create() has issued a warning about potential space_id reuse */ - /** Trigger a call to fil_node_t::read_page0() - @param[in] id tablespace identifier - @return tablespace - @retval NULL if the tablespace does not exist or cannot be read */ - fil_space_t* read_page0(ulint id); - /** Return the next tablespace from rotation_list. @param space previous tablespace (NULL to start from the start) @param recheck whether the removal condition needs to be rechecked after @@ -1336,63 +1400,28 @@ public: /** The tablespace memory cache. */ extern fil_system_t fil_system; -/** Update the data structures on I/O completion */ -inline void fil_node_t::complete_io(bool write) +/** Note that operations on the tablespace must stop or can resume */ +inline void fil_space_t::set_stopping(bool stopping) { ut_ad(mutex_own(&fil_system.mutex)); + ut_d(auto n=) n_pending_ops.fetch_xor(STOP_NEW_OPS); + ut_ad(!(n & STOP_NEW_OPS) == stopping); +} - if (write) +/** @return the size in pages (0 if unreadable) */ +inline uint32_t fil_space_t::get_size() +{ + if (!size) { - if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) - { - /* We don't need to keep track of unflushed changes as user has - explicitly disabled buffering. */ - ut_ad(!space->is_in_unflushed_spaces); - ut_ad(!needs_flush); - } - else if (!space->is_stopping()) - { - needs_flush= true; - if (!space->is_in_unflushed_spaces) - { - space->is_in_unflushed_spaces= true; - fil_system.unflushed_spaces.push_front(*space); - } - } - } - - switch (n_pending--) { - case 0: - ut_error; - case 1: - if (space->belongs_in_lru()) - /* The node must be put back to the LRU list */ - UT_LIST_ADD_FIRST(fil_system.LRU, this); + mutex_enter(&fil_system.mutex); + read_page0(); + mutex_exit(&fil_system.mutex); } + return size; } #include "fil0crypt.h" -/** Create a space memory object and put it to the fil_system hash table. -Error messages are issued to the server log. -@param[in] name tablespace name -@param[in] id tablespace identifier -@param[in] flags tablespace flags -@param[in] purpose tablespace purpose -@param[in,out] crypt_data encryption information -@param[in] mode encryption mode -@return pointer to created tablespace, to be filled in with fil_space_t::add() -@retval NULL on failure (such as when the same tablespace exists) */ -fil_space_t* -fil_space_create( - const char* name, - ulint id, - ulint flags, - fil_type_t purpose, - fil_space_crypt_t* crypt_data, - fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT) - MY_ATTRIBUTE((warn_unused_result)); - /*******************************************************************//** Assigns a new space id for a new single-table tablespace. This works simply by incrementing the global counter. If 4 billion id's is not enough, we may need @@ -1421,21 +1450,6 @@ fil_space_free( void fil_space_set_recv_size_and_flags(ulint id, uint32_t size, uint32_t flags); -/*******************************************************************//** -Returns the size of the space in pages. The tablespace must be cached in the -memory cache. -@return space size, 0 if space not found */ -ulint -fil_space_get_size( -/*===============*/ - ulint id); /*!< in: space id */ - -/** Opens all system tablespace data files. They stay open until the -database server shutdown. This should be called at a server startup after the -space objects for the system tablespace have been created. The -purpose of this operation is to make sure we never run out of file descriptors -if we need to read from the insert buffer. */ -void fil_open_system_tablespace_files(); /** Close all tablespace files at shutdown */ void fil_close_all_files(); /*******************************************************************//** @@ -1491,14 +1505,6 @@ fil_space_acquire_silent(ulint id) return (fil_space_acquire_low(id, true)); } -/** Acquire a tablespace for reading or writing a block, -when it could be dropped concurrently. -@param[in] id tablespace ID -@return the tablespace -@retval NULL if missing */ -fil_space_t* -fil_space_acquire_for_io(ulint id); - /** Replay a file rename operation if possible. @param[in] space_id tablespace identifier @param[in] name old file name @@ -1674,7 +1680,7 @@ fil_file_readdir_next_file( memory cache. Note that if we have not done a crash recovery at the database startup, there may be many tablespaces which are not yet in the memory cache. @param[in] id Tablespace ID -@param[in] name Tablespace name used in fil_space_create(). +@param[in] name Tablespace name used in fil_space_t::create(). @param[in] table_flags table flags @return the tablespace @retval NULL if no matching tablespace exists in the memory cache */ @@ -1690,70 +1696,6 @@ fil_space_for_table_exists_in_mem( @return whether the tablespace is at least as big as requested */ bool fil_space_extend(fil_space_t *space, uint32_t size); -struct fil_io_t -{ - /** error code */ - dberr_t err; - /** file; node->space->release_for_io() must follow fil_io(sync=true) call */ - fil_node_t *node; -}; - -/** Reads or writes data. This operation could be asynchronous (aio). - -@param[in] type IO context -@param[in] sync true if synchronous aio is desired -@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] byte_offset remainder of offset in bytes; in aio this - must be divisible by the OS block size -@param[in] len how many bytes to read or write; this must - not cross a file boundary; in aio this must - be a block size multiple -@param[in,out] buf buffer where to store read data or from where - to write; in aio this must be appropriately - aligned -@param[in] message message for aio handler if non-sync aio - used, else ignored -@param[in] ignore whether to ignore errors -@param[in] punch_hole punch the hole to the file for page_compressed - tablespace -@return status and file descriptor */ -fil_io_t -fil_io( - const IORequest& type, - bool sync, - const page_id_t page_id, - ulint zip_size, - ulint byte_offset, - ulint len, - void* buf, - void* message, - bool ignore = false, - bool punch_hole = false); - -/**********************************************************************//** -Waits for an aio operation to complete. This function is used to write the -handler for completed requests. The aio array of pending requests is divided -into segments (see os0file.cc for more info). The thread specifies which -segment it wants to wait for. */ -void -fil_aio_wait( -/*=========*/ - ulint segment); /*!< in: the number of the segment in the aio - array to wait for */ -/**********************************************************************//** -Flushes to disk possible writes cached by the OS. If the space does not exist -or is being dropped, does not do anything. */ -void -fil_flush( -/*======*/ - ulint space_id); /*!< in: file space id (this can be a group of - log files or a tablespace of the database) */ -/** Flush a tablespace. -@param[in,out] space tablespace to flush */ -void -fil_flush(fil_space_t* space); - /** Flush to disk the writes in file spaces of the given type possibly cached by the OS. */ void fil_flush_file_spaces(); @@ -1846,23 +1788,6 @@ inline bool fil_names_write_if_was_clean(fil_space_t* space) return(was_clean); } -/** During crash recovery, open a tablespace if it had not been opened -yet, to get valid size and flags. -@param[in,out] space tablespace */ -inline void fil_space_open_if_needed(fil_space_t* space) -{ - ut_ad(recv_recovery_is_on()); - - if (space->size == 0) { - /* Initially, size and flags will be set to 0, - until the files are opened for the first time. - fil_space_get_size() will open the file - and adjust the size and flags. */ - ut_d(ulint size =) fil_space_get_size(space->id); - ut_ad(size == space->size); - } -} - /** On a log checkpoint, reset fil_names_dirty_and_write() flags and write out FILE_MODIFY and FILE_CHECKPOINT if needed. @param[in] lsn checkpoint LSN diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index 5057ed98aba..f8e4c06baae 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2014, 2019, MariaDB Corporation. +Copyright (c) 2014, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,9 +24,7 @@ File space management types Created May 26, 2009 Vasil Dimov *******************************************************/ -#ifndef fsp0types_h -#define fsp0types_h - +#pragma once #include <cstddef> /** The fil_space_t::id of the redo log. All persistent tablespaces @@ -402,4 +400,6 @@ in full crc32 format. */ /* @} */ -#endif /* fsp0types_h */ +struct fil_node_t; +struct fil_space_t; +class buf_page_t; diff --git a/storage/innobase/include/os0api.h b/storage/innobase/include/os0api.h deleted file mode 100644 index bd9dc5b73a1..00000000000 --- a/storage/innobase/include/os0api.h +++ /dev/null @@ -1,48 +0,0 @@ -/*********************************************************************** - -Copyright (c) 2017, 2019, MariaDB Corporation. - -This program is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; version 2 of the License. - -This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -Public License for more details. - -You should have received a copy of the GNU General Public License along with -this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA - -***********************************************************************/ - -/**************************************************//** -@file os0api.h -The interface to the helper functions. -These functions are used on os0file.h where -including full full header is not feasible and -implemented on buf0buf.cc and fil0fil.cc. -*******************************************************/ - -#ifndef OS_API_H -#define OS_API_H 1 - -/** Page control block */ -class buf_page_t; - -/** File Node */ -struct fil_node_t; - -/** -Calculate the length of trim (punch_hole) operation. -@param[in] bpage Page control block -@param[in] write_length Write length -@return length of the trim or zero. */ -ulint -buf_page_get_trim_length( - const buf_page_t* bpage, - ulint write_length) - MY_ATTRIBUTE((warn_unused_result)); - -#endif /* OS_API_H */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index def091c9771..0db22abfb19 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -37,7 +37,6 @@ Created 10/21/1995 Heikki Tuuri #define os0file_h #include "fsp0types.h" -#include "os0api.h" #include "tpool.h" #ifndef _WIN32 @@ -46,10 +45,6 @@ Created 10/21/1995 Heikki Tuuri #include <time.h> #endif /* !_WIN32 */ -/** File node of a tablespace or the log data space */ -struct fil_node_t; -struct fil_space_t; - extern bool os_has_said_disk_full; /** File offset in bytes */ @@ -188,117 +183,75 @@ The I/O context that is passed down to the low level IO code */ class IORequest { public: - constexpr IORequest(ulint type= READ, buf_page_t *bpage= nullptr, - bool lru= false) : - m_bpage(bpage), m_type(static_cast<uint16_t>(type)), m_LRU(lru) {} - - /** Flags passed in the request, they can be ORred together. */ - enum { - READ = 1, - WRITE = 2, - - /** Double write buffer recovery. */ - DBLWR_RECOVER = 4, - - /** Enumarations below can be ORed to READ/WRITE above*/ - - /** Data file */ - DATA_FILE = 8, - - /** Disable partial read warnings */ - DISABLE_PARTIAL_IO_WARNINGS = 32, - - /** Use punch hole if available*/ - PUNCH_HOLE = 64, - }; - - /** @return true if it is a read request */ - bool is_read() const - MY_ATTRIBUTE((warn_unused_result)) - { - return((m_type & READ) == READ); - } - - /** @return true if it is a write request */ - bool is_write() const - MY_ATTRIBUTE((warn_unused_result)) - { - return((m_type & WRITE) == WRITE); - } - - /** @return true if partial read warning disabled */ - bool is_partial_io_warning_disabled() const - MY_ATTRIBUTE((warn_unused_result)) - { - return !!(m_type & DISABLE_PARTIAL_IO_WARNINGS); - } - - /** @return true if punch hole should be used */ - bool punch_hole() const - MY_ATTRIBUTE((warn_unused_result)) - { - return((m_type & PUNCH_HOLE) == PUNCH_HOLE); - } - - /** @return true if the read should be validated */ - bool validate() const - MY_ATTRIBUTE((warn_unused_result)) - { - return(is_read() ^ is_write()); - } - - /** Set the pointer to file node for IO - @param[in] node File node */ - void set_fil_node(fil_node_t *node) { m_fil_node= node; } - - bool operator==(const IORequest& rhs) const - { - return(m_type == rhs.m_type); - } - - /** @return true if the request is from the dblwr recovery */ - bool is_dblwr_recover() const - MY_ATTRIBUTE((warn_unused_result)) - { - return((m_type & DBLWR_RECOVER) == DBLWR_RECOVER); - } - - ulint get_trim_length(ulint write_length) const - { - return (m_bpage ? - buf_page_get_trim_length(m_bpage, write_length) - : 0); - } - - inline bool should_punch_hole() const; - - /** Free storage space associated with a section of the file. - @param[in] fh Open file handle - @param[in] off Starting offset (SEEK_SET) - @param[in] len Size of the hole - @return DB_SUCCESS or error code */ - dberr_t punch_hole(os_file_t fh, os_offset_t off, ulint len); - - /** @return type of page flush (for writes) */ - bool is_LRU() const { return m_LRU; } + enum Type + { + /** Synchronous read */ + READ_SYNC= 2, + /** Asynchronous read; some errors will be ignored */ + READ_ASYNC= READ_SYNC | 1, + /** Possibly partial read; only used with + os_file_read_no_error_handling() */ + READ_MAYBE_PARTIAL= READ_SYNC | 4, + /** Read for doublewrite buffer recovery */ + DBLWR_RECOVER= READ_SYNC | 8, + /** Synchronous write */ + WRITE_SYNC= 16, + /** Asynchronous write */ + WRITE_ASYNC= WRITE_SYNC | 1, + /** Write data; evict the block on write completion */ + WRITE_LRU= WRITE_ASYNC | 32, + /** Write data and punch hole for the rest */ + PUNCH= WRITE_ASYNC | 64, + /** Write data and punch hole; evict the block on write completion */ + PUNCH_LRU= PUNCH | WRITE_LRU, + /** Zero out a range of bytes in fil_space_t::io() */ + PUNCH_RANGE= WRITE_SYNC | 128, + }; + + constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr) : + bpage(bpage), type(type) {} + + constexpr IORequest(const IORequest &old, fil_node_t *node= nullptr) : + bpage(old.bpage), node(node), type(old.type) {} + + bool is_read() const { return (type & READ_SYNC) != 0; } + bool is_write() const { return (type & WRITE_SYNC) != 0; } + bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; } + bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; } + + /** If requested, free storage space associated with a section of the file. + @param off byte offset from the start (SEEK_SET) + @param len size of the hole in bytes + @return DB_SUCCESS or error code */ + dberr_t maybe_punch_hole(os_offset_t off, ulint len) + { + return off && len && node && (type & (PUNCH ^ WRITE_ASYNC)) + ? punch_hole(off, len) + : DB_SUCCESS; + } private: - /** Page to be written on write operation. */ - buf_page_t* const m_bpage= nullptr; + /** Free storage space associated with a section of the file. + @param off byte offset from the start (SEEK_SET) + @param len size of the hole in bytes + @return DB_SUCCESS or error code */ + dberr_t punch_hole(os_offset_t off, ulint len) const + MY_ATTRIBUTE((nonnull)); - /** File node */ - fil_node_t* m_fil_node= nullptr; +public: + /** Page to be written on write operation */ + buf_page_t* const bpage= nullptr; - /** Request type bit flags */ - const uint16_t m_type; + /** File descriptor */ + const fil_node_t *const node= nullptr; - /** for writes, type of page flush */ - const bool m_LRU= false; + /** Request type bit flags */ + const Type type; }; -constexpr IORequest IORequestRead(IORequest::READ); -constexpr IORequest IORequestWrite(IORequest::WRITE); - +constexpr IORequest IORequestRead(IORequest::READ_SYNC); +constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL); +constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC); /** Sparse file size information. */ struct os_file_size_t { @@ -313,20 +266,6 @@ struct os_file_size_t { /** Win NT does not allow more than 64 */ static const ulint OS_AIO_N_PENDING_IOS_PER_THREAD = 256; -/** Modes for aio operations @{ */ -/** Normal asynchronous i/o not for ibuf pages or ibuf bitmap pages */ -static const ulint OS_AIO_NORMAL = 21; - -/** Asynchronous i/o for ibuf pages or ibuf bitmap pages */ -static const ulint OS_AIO_IBUF = 22; - -/**Calling thread will wait for the i/o to complete, -and perform IO completion routine itself; -can be used for any pages, ibuf or non-ibuf. This is used to save -CPU time, as we can do with fewer thread switches. */ -static const ulint OS_AIO_SYNC = 24; -/* @} */ - extern ulint os_n_file_reads; extern ulint os_n_file_writes; extern ulint os_n_fsyncs; @@ -669,9 +608,9 @@ The wrapper functions have the prefix of "innodb_". */ # define os_file_close(file) \ pfs_os_file_close_func(file, __FILE__, __LINE__) -# define os_aio(type, mode, name, file, buf, offset, \ +# define os_aio(type, name, file, buf, offset, \ n, read_only, message1, message2) \ - pfs_os_aio_func(type, mode, name, file, buf, offset, \ + pfs_os_aio_func(type, name, file, buf, offset, \ n, read_only, message1, message2, \ __FILE__, __LINE__) @@ -859,7 +798,6 @@ function! Performance schema wrapper function of os_aio() which requests an asynchronous I/O operation. @param[in,out] type IO request context -@param[in] mode IO mode @param[in] name Name of the file or path as NUL terminated string @param[in] file Open file handle @@ -879,8 +817,7 @@ an asynchronous I/O operation. UNIV_INLINE dberr_t pfs_os_aio_func( - IORequest& type, - ulint mode, + const IORequest&type, const char* name, pfs_os_file_t file, void* buf, @@ -1013,9 +950,9 @@ to original un-instrumented file I/O APIs */ # define os_file_close(file) os_file_close_func(file) -# define os_aio(type, mode, name, file, buf, offset, \ +# define os_aio(type, name, file, buf, offset, \ n, read_only, message1, message2) \ - os_aio_func(type, mode, name, file, buf, offset, \ + os_aio_func(type, name, file, buf, offset, \ n, read_only, message1, message2) # define os_file_read(type, file, buf, offset, n) \ @@ -1281,7 +1218,6 @@ struct os_aio_userdata_t NOTE! Use the corresponding macro os_aio(), not directly this function! Requests an asynchronous i/o operation. @param[in,out] type IO request context -@param[in] mode IO mode @param[in] name Name of the file or path as NUL terminated string @param[in] file Open file handle @@ -1298,8 +1234,7 @@ Requests an asynchronous i/o operation. @return DB_SUCCESS or error code */ dberr_t os_aio_func( - IORequest& type, - ulint mode, + const IORequest&type, const char* name, pfs_os_file_t file, void* buf, diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic index f950113f3c7..1382b79bc12 100644 --- a/storage/innobase/include/os0file.ic +++ b/storage/innobase/include/os0file.ic @@ -206,7 +206,6 @@ function! Performance schema wrapper function of os_aio() which requests an asynchronous i/o operation. @param[in,type] type IO request context -@param[in] mode IO mode @param[in] name Name of the file or path as NUL terminated string @param[in] file Open file handle @@ -226,8 +225,7 @@ an asynchronous i/o operation. UNIV_INLINE dberr_t pfs_os_aio_func( - IORequest& type, - ulint mode, + const IORequest&type, const char* name, pfs_os_file_t file, void* buf, @@ -242,8 +240,6 @@ pfs_os_aio_func( PSI_file_locker_state state; struct PSI_file_locker* locker = NULL; - ut_ad(type.validate()); - /* Register the read or write I/O depending on "type" */ register_pfs_file_io_begin( &state, locker, file, n, @@ -251,7 +247,7 @@ pfs_os_aio_func( src_file, src_line); dberr_t result = os_aio_func( - type, mode, name, file, buf, offset, n, read_only, m1, m2); + type, name, file, buf, offset, n, read_only, m1, m2); register_pfs_file_io_end(locker, n); @@ -284,8 +280,6 @@ pfs_os_file_read_func( PSI_file_locker_state state; struct PSI_file_locker* locker = NULL; - ut_ad(type.validate()); - register_pfs_file_io_begin( &state, locker, file, n, PSI_FILE_READ, src_file, src_line); diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 0bc8b95dd77..9fe6fcfa262 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -46,10 +46,9 @@ Created 3/26/1996 Heikki Tuuri /** Checks if a page address is the trx sys header page. @param[in] page_id page id @return true if trx sys header page */ -inline bool trx_sys_hdr_page(const page_id_t& page_id) +inline bool trx_sys_hdr_page(const page_id_t page_id) { - return(page_id.space() == TRX_SYS_SPACE - && page_id.page_no() == TRX_SYS_PAGE_NO); + return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO); } /*****************************************************************//** diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index e3ac675cd56..1fe5c70bcf7 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2060,7 +2060,14 @@ same_page: const bool is_init= (b & 0x70) <= INIT_PAGE; switch (*store) { case STORE_IF_EXISTS: - if (!fil_space_get_size(space_id)) + if (fil_space_t *space= fil_space_acquire_silent(space_id)) + { + const auto size= space->get_size(); + space->release(); + if (!size) + continue; + } + else continue; /* fall through */ case STORE_YES: @@ -2487,7 +2494,7 @@ static void recv_read_in_area(page_id_t page_id) if (p != page_nos) { mutex_exit(&recv_sys.mutex); - buf_read_recv_pages(FALSE, page_id.space(), page_nos, + buf_read_recv_pages(page_id.space(), page_nos, ulint(p - page_nos)); mutex_enter(&recv_sys.mutex); } @@ -2513,7 +2520,7 @@ inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id, if (end_lsn < i.lsn) DBUG_LOG("ib_log", "skip log for page " << page_id << " LSN " << end_lsn << " < " << i.lsn); - else if (fil_space_t *space= fil_space_acquire_for_io(page_id.space())) + else if (fil_space_t *space= fil_space_t::get_for_io(page_id.space())) { mtr.start(); mtr.set_log_mode(MTR_LOG_NO_REDO); diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index a3a2b8f4f45..e04a2af92c8 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -214,7 +214,7 @@ static void memo_slot_release(mtr_memo_slot_t *slot) case MTR_MEMO_SPACE_X_LOCK: { fil_space_t *space= static_cast<fil_space_t*>(slot->object); - space->committed_size= space->size; + space->set_committed_size(); rw_lock_x_unlock(&space->latch); } break; @@ -256,7 +256,7 @@ struct ReleaseLatches { case MTR_MEMO_SPACE_X_LOCK: { fil_space_t *space= static_cast<fil_space_t*>(slot->object); - space->committed_size= space->size; + space->set_committed_size(); rw_lock_x_unlock(&space->latch); } break; diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index bfe18fd2519..cdf61f12ce4 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -135,7 +135,6 @@ public: static io_slots *read_slots; static io_slots *write_slots; -static io_slots *ibuf_slots; /** Number of retries for partial I/O's */ constexpr ulint NUM_RETRIES_ON_PARTIAL_IO = 10; @@ -3143,14 +3142,7 @@ os_file_io( bytes_returned += n_bytes; - if (offset > 0 - && type.is_write() - && type.punch_hole()) { - *err = type.punch_hole(file, offset, n); - - } else { - *err = DB_SUCCESS; - } + *err = type.maybe_punch_hole(offset, n); return(original_n); } @@ -3161,8 +3153,7 @@ os_file_io( bytes_returned += n_bytes; - if (!type.is_partial_io_warning_disabled()) { - + if (type.type != IORequest::READ_MAYBE_PARTIAL) { const char* op = type.is_read() ? "read" : "written"; @@ -3180,7 +3171,7 @@ os_file_io( *err = DB_IO_ERROR; - if (!type.is_partial_io_warning_disabled()) { + if (type.type != IORequest::READ_MAYBE_PARTIAL) { ib::warn() << "Retry attempts for " << (type.is_read() ? "reading" : "writing") @@ -3208,7 +3199,6 @@ os_file_pwrite( os_offset_t offset, dberr_t* err) { - ut_ad(type.validate()); ut_ad(type.is_write()); ++os_n_file_writes; @@ -3242,7 +3232,6 @@ os_file_write_func( { dberr_t err; - ut_ad(type.validate()); ut_ad(n > 0); WAIT_ALLOW_WRITES(); @@ -3332,7 +3321,6 @@ os_file_read_page( os_bytes_read_since_printout += n; - ut_ad(type.validate()); ut_ad(n > 0); ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err); @@ -3657,13 +3645,9 @@ fallback: n_bytes = buf_size; } - dberr_t err; - IORequest request(IORequest::WRITE); - - err = os_file_write( - request, name, file, buf, current_size, n_bytes); - - if (err != DB_SUCCESS) { + if (os_file_write(IORequestWrite, name, + file, buf, current_size, n_bytes) != + DB_SUCCESS) { break; } @@ -3786,18 +3770,11 @@ os_file_punch_hole( #endif /* _WIN32 */ } -inline bool IORequest::should_punch_hole() const -{ - return m_fil_node && m_fil_node->space->punch_hole; -} - /** Free storage space associated with a section of the file. -@param[in] fh Open file handle -@param[in] off Starting offset (SEEK_SET) -@param[in] len Size of the hole +@param off byte offset from the start (SEEK_SET) +@param len size of the hole in bytes @return DB_SUCCESS or error code */ -dberr_t -IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) +dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const { /* In this debugging mode, we act as if punch hole is supported, and then skip any calls to actually punch a hole here. @@ -3806,7 +3783,7 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) return(DB_SUCCESS); ); - ulint trim_len = get_trim_length(len); + ulint trim_len = bpage ? bpage->physical_size() - len : 0; if (trim_len == 0) { return(DB_SUCCESS); @@ -3816,11 +3793,11 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) /* Check does file system support punching holes for this tablespace. */ - if (!should_punch_hole()) { + if (!node->space->punch_hole) { return DB_IO_NO_PUNCH_HOLE; } - dberr_t err = os_file_punch_hole(fh, off, trim_len); + dberr_t err = os_file_punch_hole(node->handle, off, trim_len); if (err == DB_SUCCESS) { srv_stats.page_compressed_trim_op.inc(); @@ -3828,7 +3805,7 @@ IORequest::punch_hole(os_file_t fh, os_offset_t off, ulint len) /* If punch hole is not supported, set space so that it is not used. */ if (err == DB_IO_NO_PUNCH_HOLE) { - m_fil_node->space->punch_hole = false; + node->space->punch_hole = false; err = DB_SUCCESS; } } @@ -3885,12 +3862,8 @@ static void io_callback(tpool::aiocb* cb) os_aio_userdata_t data(cb->m_userdata); /* Return cb back to cache*/ if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD) { - if (read_slots->contains(cb)) { - read_slots->release(cb); - } else { - ut_ad(ibuf_slots->contains(cb)); - ibuf_slots->release(cb); - } + ut_ad(read_slots->contains(cb)); + read_slots->release(cb); } else { ut_ad(write_slots->contains(cb)); write_slots->release(cb); @@ -4033,8 +4006,7 @@ bool os_aio_init(ulint n_reader_threads, ulint n_writer_threads, ulint) { int max_write_events= int(n_writer_threads * OS_AIO_N_PENDING_IOS_PER_THREAD); int max_read_events= int(n_reader_threads * OS_AIO_N_PENDING_IOS_PER_THREAD); - int max_ibuf_events = 1 * OS_AIO_N_PENDING_IOS_PER_THREAD; - int max_events = max_read_events + max_write_events + max_ibuf_events; + int max_events = max_read_events + max_write_events; int ret; #if LINUX_NATIVE_AIO @@ -4053,7 +4025,6 @@ bool os_aio_init(ulint n_reader_threads, ulint n_writer_threads, ulint) } read_slots = new io_slots(max_read_events, (uint)n_reader_threads); write_slots = new io_slots(max_write_events, (uint)n_writer_threads); - ibuf_slots = new io_slots(max_ibuf_events, 1); return true; } @@ -4062,10 +4033,8 @@ void os_aio_free() srv_thread_pool->disable_aio(); delete read_slots; delete write_slots; - delete ibuf_slots; read_slots= nullptr; write_slots= nullptr; - ibuf_slots= nullptr; } /** Waits until there are no pending writes. There can @@ -4088,7 +4057,6 @@ void os_aio_wait_until_no_pending_writes() NOTE! Use the corresponding macro os_aio(), not directly this function! Requests an asynchronous i/o operation. @param[in,out] type IO request context -@param[in] mode IO mode @param[in] name Name of the file or path as NUL terminated string @param[in] file Open file handle @@ -4106,8 +4074,7 @@ Requests an asynchronous i/o operation. @return DB_SUCCESS or error code */ dberr_t os_aio_func( - IORequest& type, - ulint mode, + const IORequest&type, const char* name, pfs_os_file_t file, void* buf, @@ -4126,10 +4093,7 @@ os_aio_func( ut_ad((n & 0xFFFFFFFFUL) == n); #endif /* WIN_ASYNC_IO */ - DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;); - - if (mode == OS_AIO_SYNC) { + if (!type.is_async()) { if (type.is_read()) { return(os_file_read_func(type, file, buf, offset, n)); } @@ -4140,21 +4104,15 @@ os_aio_func( } if (type.is_read()) { - ++os_n_file_reads; - } else if (type.is_write()) { - ++os_n_file_writes; + ++os_n_file_reads; } else { - ut_error; + ut_ad(type.is_write()); + ++os_n_file_writes; } compile_time_assert(sizeof(os_aio_userdata_t) <= tpool::MAX_AIO_USERDATA_LEN); os_aio_userdata_t userdata{m1,type,m2}; - io_slots* slots; - if (type.is_read()) { - slots = mode == OS_AIO_IBUF?ibuf_slots: read_slots; - } else { - slots = write_slots; - } + io_slots* slots= type.is_read() ? read_slots : write_slots; tpool::aiocb* cb = slots->acquire(); cb->m_buffer = buf; @@ -4462,12 +4420,11 @@ void fil_node_t::find_metadata(os_file_t file } /** Read the first page of a data file. -@param[in] first whether this is the very first read @return whether the page was found valid */ -bool fil_node_t::read_page0(bool first) +bool fil_node_t::read_page0() { ut_ad(mutex_own(&fil_system.mutex)); - const ulint psize = space->physical_size(); + const unsigned psize = space->physical_size(); #ifndef _WIN32 struct stat statbuf; if (fstat(handle, &statbuf)) { @@ -4479,7 +4436,7 @@ bool fil_node_t::read_page0(bool first) os_offset_t size_bytes = os_file_get_size(handle); ut_a(size_bytes != (os_offset_t) -1); #endif - const ulint min_size = FIL_IBD_FILE_INITIAL_SIZE * psize; + const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize; if (size_bytes < min_size) { ib::error() << "The size of the file " << name @@ -4506,7 +4463,7 @@ corrupted: const uint32_t size = fsp_header_get_field(page, FSP_SIZE); const uint32_t free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT); const uint32_t free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE - + page); + + page); if (!fil_space_t::is_valid_flags(flags, space->id)) { ulint cflags = fsp_flags_convert_from_101(flags); if (cflags == ULINT_UNDEFINED) { @@ -4546,41 +4503,26 @@ invalid: return false; } - if (first) { - ut_ad(space->id != TRX_SYS_SPACE); #ifdef UNIV_LINUX - find_metadata(handle, &statbuf); + find_metadata(handle, &statbuf); #else - find_metadata(); + find_metadata(); #endif + /* Truncate the size to a multiple of extent size. */ + ulint mask = psize * FSP_EXTENT_SIZE - 1; - /* Truncate the size to a multiple of extent size. */ - ulint mask = psize * FSP_EXTENT_SIZE - 1; - - if (size_bytes <= mask) { - /* .ibd files start smaller than an - extent size. Do not truncate valid data. */ - } else { - size_bytes &= ~os_offset_t(mask); - } - - space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags; - - space->punch_hole = space->is_compressed(); - this->size = uint32_t(size_bytes / psize); - space->committed_size = space->size += this->size; - } else if (space->id != TRX_SYS_SPACE || space->size_in_header) { - /* If this is not the first-time open, do nothing. - For the system tablespace, we always get invoked as - first=false, so we detect the true first-time-open based - on size_in_header and proceed to initialize the data. */ - return true; + if (size_bytes <= mask) { + /* .ibd files start smaller than an + extent size. Do not truncate valid data. */ } else { - /* Initialize the size of predefined tablespaces - to FSP_SIZE. */ - space->committed_size = size; + size_bytes &= ~os_offset_t(mask); } + space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags; + + space->punch_hole = space->is_compressed(); + this->size = uint32_t(size_bytes / psize); + space->set_sizes(this->size); ut_ad(space->free_limit == 0 || space->free_limit == free_limit); ut_ad(space->free_len == 0 || space->free_len == free_len); space->size_in_header = size; diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index 03706d9ae99..8376fbb4ba6 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -3424,8 +3424,7 @@ fil_iterate( byte* const writeptr = readptr; err = os_file_read_no_error_handling( - IORequest(IORequest::READ - | IORequest::DISABLE_PARTIAL_IO_WARNINGS), + IORequestReadPartial, iter.file, readptr, offset, n_bytes, 0); if (err != DB_SUCCESS) { ib::error() << iter.filepath @@ -3664,9 +3663,7 @@ not_encrypted: /* A page was updated in the set, write back to disk. */ if (updated) { - IORequest write_request(IORequest::WRITE); - - err = os_file_write(write_request, + err = os_file_write(IORequestWrite, iter.filepath, iter.file, writeptr, offset, n_bytes); @@ -3759,10 +3756,8 @@ fil_tablespace_iterate( /* Read the first page and determine the page and zip size. */ - err = os_file_read_no_error_handling( - IORequest(IORequest::READ - | IORequest::DISABLE_PARTIAL_IO_WARNINGS), - file, page, 0, srv_page_size, 0); + err = os_file_read_no_error_handling(IORequestReadPartial, + file, page, 0, srv_page_size, 0); if (err == DB_SUCCESS) { err = callback.init(file_size, block); diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc index 0cddde4b3ca..0bdf52dfd56 100644 --- a/storage/innobase/row/row0quiesce.cc +++ b/storage/innobase/row/row0quiesce.cc @@ -545,7 +545,7 @@ row_quiesce_table_start( if (!trx_is_interrupted(trx)) { /* Ensure that all asynchronous IO is completed. */ os_aio_wait_until_no_pending_writes(); - fil_flush(table->space_id); + table->space->flush(); if (row_quiesce_write_cfg(table, trx->mysql_thd) != DB_SUCCESS) { diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 1746d351263..d4ee4dc3c4b 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -229,10 +229,12 @@ srv_file_check_mode( static const char INIT_LOG_FILE0[]= "101"; /** Creates log file. -@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value -@param[out] logfile0 name of the log file +@param[in] create_new_db whether the database is being initialized +@param[in] lsn FIL_PAGE_FILE_FLUSH_LSN value +@param[out] logfile0 name of the log file @return DB_SUCCESS or error code */ -static dberr_t create_log_file(lsn_t lsn, std::string& logfile0) +static dberr_t create_log_file(bool create_new_db, lsn_t lsn, + std::string& logfile0) { if (srv_read_only_mode) { ib::error() << "Cannot create log file in read-only mode"; @@ -296,7 +298,9 @@ static dberr_t create_log_file(lsn_t lsn, std::string& logfile0) } log_sys.log.open_file(logfile0); - fil_open_system_tablespace_files(); + if (!fil_system.sys_space->open(create_new_db)) { + return DB_ERROR; + } /* Create a log checkpoint. */ log_mutex_enter(); @@ -553,8 +557,8 @@ err_exit: fil_set_max_space_id_if_bigger(space_id); - fil_space_t *space= fil_space_create(undo_name, space_id, fsp_flags, - FIL_TYPE_TABLESPACE, NULL); + fil_space_t *space= fil_space_t::create(undo_name, space_id, fsp_flags, + FIL_TYPE_TABLESPACE, NULL); ut_a(fil_validate()); ut_a(space); @@ -563,20 +567,15 @@ err_exit: if (create) { + space->set_sizes(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES); space->size= file->size= uint32_t(size >> srv_page_size_shift); - space->size_in_header= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; - space->committed_size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES; } - else + else if (!file->read_page0()) { - success= file->read_page0(true); - if (!success) - { - os_file_close(file->handle); - file->handle= OS_FILE_CLOSED; - ut_a(fil_system.n_open > 0); - fil_system.n_open--; - } + os_file_close(file->handle); + file->handle= OS_FILE_CLOSED; + ut_a(fil_system.n_open > 0); + fil_system.n_open--; } mutex_exit(&fil_system.mutex); @@ -803,7 +802,7 @@ srv_open_tmp_tablespace(bool create_new_db) true, create_new_db, &sum_of_new_sizes, NULL)) != DB_SUCCESS) { ib::error() << "Unable to create the shared innodb_temporary"; - } else if (fil_system.temp_space->open()) { + } else if (fil_system.temp_space->open(true)) { /* Initialize the header page */ mtr_t mtr; mtr.start(); @@ -1304,7 +1303,7 @@ dberr_t srv_start(bool create_new_db) log_sys.set_flushed_lsn(flushed_lsn); buf_flush_sync(); - err = create_log_file(flushed_lsn, logfile0); + err = create_log_file(true, flushed_lsn, logfile0); if (err != DB_SUCCESS) { return(srv_init_abort(err)); @@ -1333,7 +1332,7 @@ dberr_t srv_start(bool create_new_db) srv_log_file_size = srv_log_file_size_requested; - err = create_log_file(flushed_lsn, logfile0); + err = create_log_file(false, flushed_lsn, logfile0); if (err == DB_SUCCESS) { err = create_log_file_rename(flushed_lsn, @@ -1364,11 +1363,11 @@ dberr_t srv_start(bool create_new_db) file_checked: /* Open log file and data files in the systemtablespace: we keep them open until database shutdown */ - - fil_open_system_tablespace_files(); ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug); - err = srv_undo_tablespaces_init(create_new_db); + err = fil_system.sys_space->open(create_new_db) + ? srv_undo_tablespaces_init(create_new_db) + : DB_ERROR; /* If the force recovery is set very high then we carry on regardless of all errors. Basically this is fingers crossed mode. */ @@ -1673,7 +1672,7 @@ file_checked: srv_log_file_size = srv_log_file_size_requested; - err = create_log_file(flushed_lsn, logfile0); + err = create_log_file(false, flushed_lsn, logfile0); if (err == DB_SUCCESS) { err = create_log_file_rename(flushed_lsn, diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index f9f564e1841..c0375f25fa6 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -584,11 +584,10 @@ static void trx_purge_truncate_history() : 0, j = i;; ) { ulint space_id = srv_undo_space_id_start + i; ut_ad(srv_is_undo_tablespace(space_id)); + fil_space_t* space= fil_space_get(space_id); - if (fil_space_get_size(space_id) - > threshold) { - purge_sys.truncate.current - = fil_space_get(space_id); + if (space && space->get_size() > threshold) { + purge_sys.truncate.current = space; break; } |